1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65
66 enum upper_128bits_state
67 {
68 unknown = 0,
69 unused,
70 used
71 };
72
73 typedef struct block_info_def
74 {
75 /* State of the upper 128bits of AVX registers at exit. */
76 enum upper_128bits_state state;
77 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 in this block. */
79 bool unchanged;
80 /* TRUE if block has been processed. */
81 bool processed;
82 /* TRUE if block has been scanned. */
83 bool scanned;
84 /* Previous state of the upper 128bits of AVX registers at entry. */
85 enum upper_128bits_state prev;
86 } *block_info;
87
88 #define BLOCK_INFO(B) ((block_info) (B)->aux)
89
90 enum call_avx256_state
91 {
92 /* Callee returns 256bit AVX register. */
93 callee_return_avx256 = -1,
94 /* Callee returns and passes 256bit AVX register. */
95 callee_return_pass_avx256,
96 /* Callee passes 256bit AVX register. */
97 callee_pass_avx256,
98 /* Callee neither returns nor passes a 256bit AVX register, or there is
99 no 256bit AVX register in the function return. */
100 call_no_avx256,
101 /* vzeroupper intrinsic. */
102 vzeroupper_intrinsic
103 };
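/* As used by move_or_delete_vzeroupper_2 below, each vzeroupper pattern
   carries one of these values as its operand, read back with
   INTVAL (XVECEXP (pat, 0, 0)).  For example, a call whose callee both
   returns and receives arguments in 256bit AVX registers would carry
   callee_return_pass_avx256.  */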
104
105 /* Check if a 256bit AVX register is referenced in stores. */
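/* This is the note_stores callback used in move_or_delete_vzeroupper_2
   below: DATA points to the running upper_128bits_state, which flips to
   "used" once a store writes or copies a 256bit AVX register.  */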
106
107 static void
108 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 {
110 if ((REG_P (dest)
111 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
112 || (GET_CODE (set) == SET
113 && REG_P (SET_SRC (set))
114 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 {
116 enum upper_128bits_state *state
117 = (enum upper_128bits_state *) data;
118 *state = used;
119 }
120 }
121
122 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
123 in basic block BB. Delete it if upper 128bit AVX registers are
124 unused. If it isn't deleted, move it to just before a jump or call insn.
125
126 STATE is state of the upper 128bits of AVX registers at entry. */
127
128 static void
129 move_or_delete_vzeroupper_2 (basic_block bb,
130 enum upper_128bits_state state)
131 {
132 rtx insn, bb_end;
133 rtx vzeroupper_insn = NULL_RTX;
134 rtx pat;
135 int avx256;
136 bool unchanged;
137
138 if (BLOCK_INFO (bb)->unchanged)
139 {
140 if (dump_file)
141 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 bb->index, state);
143
144 BLOCK_INFO (bb)->state = state;
145 return;
146 }
147
148 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 {
150 if (dump_file)
151 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
152 bb->index, BLOCK_INFO (bb)->state);
153 return;
154 }
155
156 BLOCK_INFO (bb)->prev = state;
157
158 if (dump_file)
159 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
160 bb->index, state);
161
162 unchanged = true;
163
164 /* Cache BB_END; it changes when the last insn of the block is deleted. */
165 bb_end = BB_END (bb);
166 insn = BB_HEAD (bb);
167 while (insn != bb_end)
168 {
169 insn = NEXT_INSN (insn);
170
171 if (!NONDEBUG_INSN_P (insn))
172 continue;
173
174 /* Move vzeroupper before jump/call. */
175 if (JUMP_P (insn) || CALL_P (insn))
176 {
177 if (!vzeroupper_insn)
178 continue;
179
180 if (PREV_INSN (insn) != vzeroupper_insn)
181 {
182 if (dump_file)
183 {
184 fprintf (dump_file, "Move vzeroupper after:\n");
185 print_rtl_single (dump_file, PREV_INSN (insn));
186 fprintf (dump_file, "before:\n");
187 print_rtl_single (dump_file, insn);
188 }
189 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 PREV_INSN (insn));
191 }
192 vzeroupper_insn = NULL_RTX;
193 continue;
194 }
195
196 pat = PATTERN (insn);
197
198 /* Check insn for vzeroupper intrinsic. */
199 if (GET_CODE (pat) == UNSPEC_VOLATILE
200 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
201 {
202 if (dump_file)
203 {
204 /* Found vzeroupper intrinsic. */
205 fprintf (dump_file, "Found vzeroupper:\n");
206 print_rtl_single (dump_file, insn);
207 }
208 }
209 else
210 {
211 /* Check insn for vzeroall intrinsic. */
212 if (GET_CODE (pat) == PARALLEL
213 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
214 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
215 {
216 state = unused;
217 unchanged = false;
218
219 /* Delete pending vzeroupper insertion. */
220 if (vzeroupper_insn)
221 {
222 delete_insn (vzeroupper_insn);
223 vzeroupper_insn = NULL_RTX;
224 }
225 }
226 else if (state != used)
227 {
228 note_stores (pat, check_avx256_stores, &state);
229 if (state == used)
230 unchanged = false;
231 }
232 continue;
233 }
234
235 /* Process vzeroupper intrinsic. */
236 avx256 = INTVAL (XVECEXP (pat, 0, 0));
237
238 if (state == unused)
239 {
240 /* Since the upper 128bits are cleared, the call can't be passing a
241 256bit AVX register to the callee. We only need to check if the
242 callee returns a 256bit AVX register. */
243 if (avx256 == callee_return_avx256)
244 {
245 state = used;
246 unchanged = false;
247 }
248
249 /* Remove unnecessary vzeroupper since upper 128bits are
250 cleared. */
251 if (dump_file)
252 {
253 fprintf (dump_file, "Delete redundant vzeroupper:\n");
254 print_rtl_single (dump_file, insn);
255 }
256 delete_insn (insn);
257 }
258 else
259 {
260 /* Set state to UNUSED if callee doesn't return 256bit AVX
261 register. */
262 if (avx256 != callee_return_pass_avx256)
263 state = unused;
264
265 if (avx256 == callee_return_pass_avx256
266 || avx256 == callee_pass_avx256)
267 {
268 /* Must remove vzeroupper since arguments are passed to the
269 callee in 256bit AVX registers. */
270 if (dump_file)
271 {
272 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
273 print_rtl_single (dump_file, insn);
274 }
275 delete_insn (insn);
276 }
277 else
278 {
279 vzeroupper_insn = insn;
280 unchanged = false;
281 }
282 }
283 }
284
285 BLOCK_INFO (bb)->state = state;
286 BLOCK_INFO (bb)->unchanged = unchanged;
287 BLOCK_INFO (bb)->scanned = true;
288
289 if (dump_file)
290 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
291 bb->index, unchanged ? "unchanged" : "changed",
292 state);
293 }
294
295 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
296 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
297 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
298 state is changed. */
299
300 static bool
301 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
302 {
303 edge e;
304 edge_iterator ei;
305 enum upper_128bits_state state, old_state, new_state;
306 bool seen_unknown;
307
308 if (dump_file)
309 fprintf (dump_file, " Process [bb %i]: status: %d\n",
310 block->index, BLOCK_INFO (block)->processed);
311
312 if (BLOCK_INFO (block)->processed)
313 return false;
314
315 state = unused;
316
317 /* Check all predecessor edges of this block. */
318 seen_unknown = false;
319 FOR_EACH_EDGE (e, ei, block->preds)
320 {
321 if (e->src == block)
322 continue;
323 switch (BLOCK_INFO (e->src)->state)
324 {
325 case unknown:
326 if (!unknown_is_unused)
327 seen_unknown = true;
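/* FALLTHRU */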
328 case unused:
329 break;
330 case used:
331 state = used;
332 goto done;
333 }
334 }
335
336 if (seen_unknown)
337 state = unknown;
338
339 done:
340 old_state = BLOCK_INFO (block)->state;
341 move_or_delete_vzeroupper_2 (block, state);
342 new_state = BLOCK_INFO (block)->state;
343
344 if (state != unknown || new_state == used)
345 BLOCK_INFO (block)->processed = true;
346
347 /* Need to rescan if the upper 128bits of AVX registers are changed
348 to USED at exit. */
349 if (new_state != old_state)
350 {
351 if (new_state == used)
352 cfun->machine->rescan_vzeroupper_p = 1;
353 return true;
354 }
355 else
356 return false;
357 }
358
359 /* Go through the instruction stream looking for vzeroupper. Delete
360 it if upper 128bit AVX registers are unused. If it isn't deleted,
361 move it to just before a jump or call insn. */
362
363 static void
364 move_or_delete_vzeroupper (void)
365 {
366 edge e;
367 edge_iterator ei;
368 basic_block bb;
369 fibheap_t worklist, pending, fibheap_swap;
370 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
371 int *bb_order;
372 int *rc_order;
373 int i;
374
375 /* Set up block info for each basic block. */
376 alloc_aux_for_blocks (sizeof (struct block_info_def));
377
378 /* Process outgoing edges of entry point. */
379 if (dump_file)
380 fprintf (dump_file, "Process outgoing edges of entry point\n");
381
382 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 {
384 move_or_delete_vzeroupper_2 (e->dest,
385 cfun->machine->caller_pass_avx256_p
386 ? used : unused);
387 BLOCK_INFO (e->dest)->processed = true;
388 }
389
390 /* Compute reverse completion order of depth first search of the CFG
391 so that the data-flow runs faster. */
392 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
393 bb_order = XNEWVEC (int, last_basic_block);
394 pre_and_rev_post_order_compute (NULL, rc_order, false);
395 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
396 bb_order[rc_order[i]] = i;
397 free (rc_order);
398
399 worklist = fibheap_new ();
400 pending = fibheap_new ();
401 visited = sbitmap_alloc (last_basic_block);
402 in_worklist = sbitmap_alloc (last_basic_block);
403 in_pending = sbitmap_alloc (last_basic_block);
404 sbitmap_zero (in_worklist);
405
406 /* Don't check outgoing edges of entry point. */
407 sbitmap_ones (in_pending);
408 FOR_EACH_BB (bb)
409 if (BLOCK_INFO (bb)->processed)
410 RESET_BIT (in_pending, bb->index);
411 else
412 {
413 move_or_delete_vzeroupper_1 (bb, false);
414 fibheap_insert (pending, bb_order[bb->index], bb);
415 }
416
417 if (dump_file)
418 fprintf (dump_file, "Check remaining basic blocks\n");
419
420 while (!fibheap_empty (pending))
421 {
422 fibheap_swap = pending;
423 pending = worklist;
424 worklist = fibheap_swap;
425 sbitmap_swap = in_pending;
426 in_pending = in_worklist;
427 in_worklist = sbitmap_swap;
428
429 sbitmap_zero (visited);
430
431 cfun->machine->rescan_vzeroupper_p = 0;
432
433 while (!fibheap_empty (worklist))
434 {
435 bb = (basic_block) fibheap_extract_min (worklist);
436 RESET_BIT (in_worklist, bb->index);
437 gcc_assert (!TEST_BIT (visited, bb->index));
438 if (!TEST_BIT (visited, bb->index))
439 {
440 edge_iterator ei;
441
442 SET_BIT (visited, bb->index);
443
444 if (move_or_delete_vzeroupper_1 (bb, false))
445 FOR_EACH_EDGE (e, ei, bb->succs)
446 {
447 if (e->dest == EXIT_BLOCK_PTR
448 || BLOCK_INFO (e->dest)->processed)
449 continue;
450
451 if (TEST_BIT (visited, e->dest->index))
452 {
453 if (!TEST_BIT (in_pending, e->dest->index))
454 {
455 /* Send E->DEST to next round. */
456 SET_BIT (in_pending, e->dest->index);
457 fibheap_insert (pending,
458 bb_order[e->dest->index],
459 e->dest);
460 }
461 }
462 else if (!TEST_BIT (in_worklist, e->dest->index))
463 {
464 /* Add E->DEST to current round. */
465 SET_BIT (in_worklist, e->dest->index);
466 fibheap_insert (worklist, bb_order[e->dest->index],
467 e->dest);
468 }
469 }
470 }
471 }
472
473 if (!cfun->machine->rescan_vzeroupper_p)
474 break;
475 }
476
477 free (bb_order);
478 fibheap_delete (worklist);
479 fibheap_delete (pending);
480 sbitmap_free (visited);
481 sbitmap_free (in_worklist);
482 sbitmap_free (in_pending);
483
484 if (dump_file)
485 fprintf (dump_file, "Process remaining basic blocks\n");
486
487 FOR_EACH_BB (bb)
488 move_or_delete_vzeroupper_1 (bb, true);
489
490 free_aux_for_blocks ();
491 }
492
493 static rtx legitimize_dllimport_symbol (rtx, bool);
494
495 #ifndef CHECK_STACK_LIMIT
496 #define CHECK_STACK_LIMIT (-1)
497 #endif
498
499 /* Return index of given mode in mult and division cost tables. */
500 #define MODE_INDEX(mode) \
501 ((mode) == QImode ? 0 \
502 : (mode) == HImode ? 1 \
503 : (mode) == SImode ? 2 \
504 : (mode) == DImode ? 3 \
505 : 4)
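/* A small illustration: the index matches the order of the per-mode
   initializers in the cost tables below, so MODE_INDEX (SImode) == 2
   selects the entry commented "SI" in the multiply and divide/mod cost
   arrays, while any other mode (e.g. TImode) lands in the "other" slot
   at index 4.  */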
506
507 /* Processor costs (relative to an add) */
508 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
509 #define COSTS_N_BYTES(N) ((N) * 2)
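/* Worked example: under the assumption above that COSTS_N_INSNS (N) is
   (N) * 4, a plain addition costs COSTS_N_INSNS (1) == 4 in the
   time-based tables, and the same 2-byte addition costs
   COSTS_N_BYTES (2) == 4 in ix86_size_cost below, keeping the size-based
   scale comparable to the time-based one.  */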
510
511 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
512
513 const
514 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
515 COSTS_N_BYTES (2), /* cost of an add instruction */
516 COSTS_N_BYTES (3), /* cost of a lea instruction */
517 COSTS_N_BYTES (2), /* variable shift costs */
518 COSTS_N_BYTES (3), /* constant shift costs */
519 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
520 COSTS_N_BYTES (3), /* HI */
521 COSTS_N_BYTES (3), /* SI */
522 COSTS_N_BYTES (3), /* DI */
523 COSTS_N_BYTES (5)}, /* other */
524 0, /* cost of multiply per each bit set */
525 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
526 COSTS_N_BYTES (3), /* HI */
527 COSTS_N_BYTES (3), /* SI */
528 COSTS_N_BYTES (3), /* DI */
529 COSTS_N_BYTES (5)}, /* other */
530 COSTS_N_BYTES (3), /* cost of movsx */
531 COSTS_N_BYTES (3), /* cost of movzx */
532 0, /* "large" insn */
533 2, /* MOVE_RATIO */
534 2, /* cost for loading QImode using movzbl */
535 {2, 2, 2}, /* cost of loading integer registers
536 in QImode, HImode and SImode.
537 Relative to reg-reg move (2). */
538 {2, 2, 2}, /* cost of storing integer registers */
539 2, /* cost of reg,reg fld/fst */
540 {2, 2, 2}, /* cost of loading fp registers
541 in SFmode, DFmode and XFmode */
542 {2, 2, 2}, /* cost of storing fp registers
543 in SFmode, DFmode and XFmode */
544 3, /* cost of moving MMX register */
545 {3, 3}, /* cost of loading MMX registers
546 in SImode and DImode */
547 {3, 3}, /* cost of storing MMX registers
548 in SImode and DImode */
549 3, /* cost of moving SSE register */
550 {3, 3, 3}, /* cost of loading SSE registers
551 in SImode, DImode and TImode */
552 {3, 3, 3}, /* cost of storing SSE registers
553 in SImode, DImode and TImode */
554 3, /* MMX or SSE register to integer */
555 0, /* size of l1 cache */
556 0, /* size of l2 cache */
557 0, /* size of prefetch block */
558 0, /* number of parallel prefetches */
559 2, /* Branch cost */
560 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
561 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
562 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
563 COSTS_N_BYTES (2), /* cost of FABS instruction. */
564 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
565 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 1, /* scalar_stmt_cost. */
571 1, /* scalar load_cost. */
572 1, /* scalar_store_cost. */
573 1, /* vec_stmt_cost. */
574 1, /* vec_to_scalar_cost. */
575 1, /* scalar_to_vec_cost. */
576 1, /* vec_align_load_cost. */
577 1, /* vec_unalign_load_cost. */
578 1, /* vec_store_cost. */
579 1, /* cond_taken_branch_cost. */
580 1, /* cond_not_taken_branch_cost. */
581 };
582
583 /* Processor costs (relative to an add) */
584 static const
585 struct processor_costs i386_cost = { /* 386 specific costs */
586 COSTS_N_INSNS (1), /* cost of an add instruction */
587 COSTS_N_INSNS (1), /* cost of a lea instruction */
588 COSTS_N_INSNS (3), /* variable shift costs */
589 COSTS_N_INSNS (2), /* constant shift costs */
590 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
591 COSTS_N_INSNS (6), /* HI */
592 COSTS_N_INSNS (6), /* SI */
593 COSTS_N_INSNS (6), /* DI */
594 COSTS_N_INSNS (6)}, /* other */
595 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
596 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
597 COSTS_N_INSNS (23), /* HI */
598 COSTS_N_INSNS (23), /* SI */
599 COSTS_N_INSNS (23), /* DI */
600 COSTS_N_INSNS (23)}, /* other */
601 COSTS_N_INSNS (3), /* cost of movsx */
602 COSTS_N_INSNS (2), /* cost of movzx */
603 15, /* "large" insn */
604 3, /* MOVE_RATIO */
605 4, /* cost for loading QImode using movzbl */
606 {2, 4, 2}, /* cost of loading integer registers
607 in QImode, HImode and SImode.
608 Relative to reg-reg move (2). */
609 {2, 4, 2}, /* cost of storing integer registers */
610 2, /* cost of reg,reg fld/fst */
611 {8, 8, 8}, /* cost of loading fp registers
612 in SFmode, DFmode and XFmode */
613 {8, 8, 8}, /* cost of storing fp registers
614 in SFmode, DFmode and XFmode */
615 2, /* cost of moving MMX register */
616 {4, 8}, /* cost of loading MMX registers
617 in SImode and DImode */
618 {4, 8}, /* cost of storing MMX registers
619 in SImode and DImode */
620 2, /* cost of moving SSE register */
621 {4, 8, 16}, /* cost of loading SSE registers
622 in SImode, DImode and TImode */
623 {4, 8, 16}, /* cost of storing SSE registers
624 in SImode, DImode and TImode */
625 3, /* MMX or SSE register to integer */
626 0, /* size of l1 cache */
627 0, /* size of l2 cache */
628 0, /* size of prefetch block */
629 0, /* number of parallel prefetches */
630 1, /* Branch cost */
631 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
632 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
633 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
634 COSTS_N_INSNS (22), /* cost of FABS instruction. */
635 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
636 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 1, /* scalar_stmt_cost. */
642 1, /* scalar load_cost. */
643 1, /* scalar_store_cost. */
644 1, /* vec_stmt_cost. */
645 1, /* vec_to_scalar_cost. */
646 1, /* scalar_to_vec_cost. */
647 1, /* vec_align_load_cost. */
648 2, /* vec_unalign_load_cost. */
649 1, /* vec_store_cost. */
650 3, /* cond_taken_branch_cost. */
651 1, /* cond_not_taken_branch_cost. */
652 };
653
654 static const
655 struct processor_costs i486_cost = { /* 486 specific costs */
656 COSTS_N_INSNS (1), /* cost of an add instruction */
657 COSTS_N_INSNS (1), /* cost of a lea instruction */
658 COSTS_N_INSNS (3), /* variable shift costs */
659 COSTS_N_INSNS (2), /* constant shift costs */
660 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
661 COSTS_N_INSNS (12), /* HI */
662 COSTS_N_INSNS (12), /* SI */
663 COSTS_N_INSNS (12), /* DI */
664 COSTS_N_INSNS (12)}, /* other */
665 1, /* cost of multiply per each bit set */
666 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
667 COSTS_N_INSNS (40), /* HI */
668 COSTS_N_INSNS (40), /* SI */
669 COSTS_N_INSNS (40), /* DI */
670 COSTS_N_INSNS (40)}, /* other */
671 COSTS_N_INSNS (3), /* cost of movsx */
672 COSTS_N_INSNS (2), /* cost of movzx */
673 15, /* "large" insn */
674 3, /* MOVE_RATIO */
675 4, /* cost for loading QImode using movzbl */
676 {2, 4, 2}, /* cost of loading integer registers
677 in QImode, HImode and SImode.
678 Relative to reg-reg move (2). */
679 {2, 4, 2}, /* cost of storing integer registers */
680 2, /* cost of reg,reg fld/fst */
681 {8, 8, 8}, /* cost of loading fp registers
682 in SFmode, DFmode and XFmode */
683 {8, 8, 8}, /* cost of storing fp registers
684 in SFmode, DFmode and XFmode */
685 2, /* cost of moving MMX register */
686 {4, 8}, /* cost of loading MMX registers
687 in SImode and DImode */
688 {4, 8}, /* cost of storing MMX registers
689 in SImode and DImode */
690 2, /* cost of moving SSE register */
691 {4, 8, 16}, /* cost of loading SSE registers
692 in SImode, DImode and TImode */
693 {4, 8, 16}, /* cost of storing SSE registers
694 in SImode, DImode and TImode */
695 3, /* MMX or SSE register to integer */
696 4, /* size of l1 cache. 486 has 8kB cache
697 shared for code and data, so 4kB is
698 not really precise. */
699 4, /* size of l2 cache */
700 0, /* size of prefetch block */
701 0, /* number of parallel prefetches */
702 1, /* Branch cost */
703 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
704 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
705 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
706 COSTS_N_INSNS (3), /* cost of FABS instruction. */
707 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
708 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
712 DUMMY_STRINGOP_ALGS},
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
725
726 static const
727 struct processor_costs pentium_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (1), /* cost of a lea instruction */
730 COSTS_N_INSNS (4), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (11), /* HI */
734 COSTS_N_INSNS (11), /* SI */
735 COSTS_N_INSNS (11), /* DI */
736 COSTS_N_INSNS (11)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (25), /* HI */
740 COSTS_N_INSNS (25), /* SI */
741 COSTS_N_INSNS (25), /* DI */
742 COSTS_N_INSNS (25)}, /* other */
743 COSTS_N_INSNS (3), /* cost of movsx */
744 COSTS_N_INSNS (2), /* cost of movzx */
745 8, /* "large" insn */
746 6, /* MOVE_RATIO */
747 6, /* cost for loading QImode using movzbl */
748 {2, 4, 2}, /* cost of loading integer registers
749 in QImode, HImode and SImode.
750 Relative to reg-reg move (2). */
751 {2, 4, 2}, /* cost of storing integer registers */
752 2, /* cost of reg,reg fld/fst */
753 {2, 2, 6}, /* cost of loading fp registers
754 in SFmode, DFmode and XFmode */
755 {4, 4, 6}, /* cost of storing fp registers
756 in SFmode, DFmode and XFmode */
757 8, /* cost of moving MMX register */
758 {8, 8}, /* cost of loading MMX registers
759 in SImode and DImode */
760 {8, 8}, /* cost of storing MMX registers
761 in SImode and DImode */
762 2, /* cost of moving SSE register */
763 {4, 8, 16}, /* cost of loading SSE registers
764 in SImode, DImode and TImode */
765 {4, 8, 16}, /* cost of storing SSE registers
766 in SImode, DImode and TImode */
767 3, /* MMX or SSE register to integer */
768 8, /* size of l1 cache. */
769 8, /* size of l2 cache */
770 0, /* size of prefetch block */
771 0, /* number of parallel prefetches */
772 2, /* Branch cost */
773 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
774 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
775 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
776 COSTS_N_INSNS (1), /* cost of FABS instruction. */
777 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
778 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
779 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
780 DUMMY_STRINGOP_ALGS},
781 {{libcall, {{-1, rep_prefix_4_byte}}},
782 DUMMY_STRINGOP_ALGS},
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 static const
797 struct processor_costs pentiumpro_cost = {
798 COSTS_N_INSNS (1), /* cost of an add instruction */
799 COSTS_N_INSNS (1), /* cost of a lea instruction */
800 COSTS_N_INSNS (1), /* variable shift costs */
801 COSTS_N_INSNS (1), /* constant shift costs */
802 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
803 COSTS_N_INSNS (4), /* HI */
804 COSTS_N_INSNS (4), /* SI */
805 COSTS_N_INSNS (4), /* DI */
806 COSTS_N_INSNS (4)}, /* other */
807 0, /* cost of multiply per each bit set */
808 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
809 COSTS_N_INSNS (17), /* HI */
810 COSTS_N_INSNS (17), /* SI */
811 COSTS_N_INSNS (17), /* DI */
812 COSTS_N_INSNS (17)}, /* other */
813 COSTS_N_INSNS (1), /* cost of movsx */
814 COSTS_N_INSNS (1), /* cost of movzx */
815 8, /* "large" insn */
816 6, /* MOVE_RATIO */
817 2, /* cost for loading QImode using movzbl */
818 {4, 4, 4}, /* cost of loading integer registers
819 in QImode, HImode and SImode.
820 Relative to reg-reg move (2). */
821 {2, 2, 2}, /* cost of storing integer registers */
822 2, /* cost of reg,reg fld/fst */
823 {2, 2, 6}, /* cost of loading fp registers
824 in SFmode, DFmode and XFmode */
825 {4, 4, 6}, /* cost of storing fp registers
826 in SFmode, DFmode and XFmode */
827 2, /* cost of moving MMX register */
828 {2, 2}, /* cost of loading MMX registers
829 in SImode and DImode */
830 {2, 2}, /* cost of storing MMX registers
831 in SImode and DImode */
832 2, /* cost of moving SSE register */
833 {2, 2, 8}, /* cost of loading SSE registers
834 in SImode, DImode and TImode */
835 {2, 2, 8}, /* cost of storing SSE registers
836 in SImode, DImode and TImode */
837 3, /* MMX or SSE register to integer */
838 8, /* size of l1 cache. */
839 256, /* size of l2 cache */
840 32, /* size of prefetch block */
841 6, /* number of parallel prefetches */
842 2, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (2), /* cost of FABS instruction. */
847 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
849 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
850 (we ensure the alignment). For small blocks an inline loop is still a
851 noticeable win, for bigger blocks either rep movsl or rep movsb is the
852 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
853 but after 4K the difference is down in the noise. */
854 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
855 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
856 DUMMY_STRINGOP_ALGS},
857 {{rep_prefix_4_byte, {{1024, unrolled_loop},
858 {8192, rep_prefix_4_byte}, {-1, libcall}}},
859 DUMMY_STRINGOP_ALGS},
860 1, /* scalar_stmt_cost. */
861 1, /* scalar load_cost. */
862 1, /* scalar_store_cost. */
863 1, /* vec_stmt_cost. */
864 1, /* vec_to_scalar_cost. */
865 1, /* scalar_to_vec_cost. */
866 1, /* vec_align_load_cost. */
867 2, /* vec_unalign_load_cost. */
868 1, /* vec_store_cost. */
869 3, /* cond_taken_branch_cost. */
870 1, /* cond_not_taken_branch_cost. */
871 };
872
873 static const
874 struct processor_costs geode_cost = {
875 COSTS_N_INSNS (1), /* cost of an add instruction */
876 COSTS_N_INSNS (1), /* cost of a lea instruction */
877 COSTS_N_INSNS (2), /* variable shift costs */
878 COSTS_N_INSNS (1), /* constant shift costs */
879 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
880 COSTS_N_INSNS (4), /* HI */
881 COSTS_N_INSNS (7), /* SI */
882 COSTS_N_INSNS (7), /* DI */
883 COSTS_N_INSNS (7)}, /* other */
884 0, /* cost of multiply per each bit set */
885 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
886 COSTS_N_INSNS (23), /* HI */
887 COSTS_N_INSNS (39), /* SI */
888 COSTS_N_INSNS (39), /* DI */
889 COSTS_N_INSNS (39)}, /* other */
890 COSTS_N_INSNS (1), /* cost of movsx */
891 COSTS_N_INSNS (1), /* cost of movzx */
892 8, /* "large" insn */
893 4, /* MOVE_RATIO */
894 1, /* cost for loading QImode using movzbl */
895 {1, 1, 1}, /* cost of loading integer registers
896 in QImode, HImode and SImode.
897 Relative to reg-reg move (2). */
898 {1, 1, 1}, /* cost of storing integer registers */
899 1, /* cost of reg,reg fld/fst */
900 {1, 1, 1}, /* cost of loading fp registers
901 in SFmode, DFmode and XFmode */
902 {4, 6, 6}, /* cost of storing fp registers
903 in SFmode, DFmode and XFmode */
904
905 1, /* cost of moving MMX register */
906 {1, 1}, /* cost of loading MMX registers
907 in SImode and DImode */
908 {1, 1}, /* cost of storing MMX registers
909 in SImode and DImode */
910 1, /* cost of moving SSE register */
911 {1, 1, 1}, /* cost of loading SSE registers
912 in SImode, DImode and TImode */
913 {1, 1, 1}, /* cost of storing SSE registers
914 in SImode, DImode and TImode */
915 1, /* MMX or SSE register to integer */
916 64, /* size of l1 cache. */
917 128, /* size of l2 cache. */
918 32, /* size of prefetch block */
919 1, /* number of parallel prefetches */
920 1, /* Branch cost */
921 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
922 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
923 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
924 COSTS_N_INSNS (1), /* cost of FABS instruction. */
925 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
926 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
930 DUMMY_STRINGOP_ALGS},
931 1, /* scalar_stmt_cost. */
932 1, /* scalar load_cost. */
933 1, /* scalar_store_cost. */
934 1, /* vec_stmt_cost. */
935 1, /* vec_to_scalar_cost. */
936 1, /* scalar_to_vec_cost. */
937 1, /* vec_align_load_cost. */
938 2, /* vec_unalign_load_cost. */
939 1, /* vec_store_cost. */
940 3, /* cond_taken_branch_cost. */
941 1, /* cond_not_taken_branch_cost. */
942 };
943
944 static const
945 struct processor_costs k6_cost = {
946 COSTS_N_INSNS (1), /* cost of an add instruction */
947 COSTS_N_INSNS (2), /* cost of a lea instruction */
948 COSTS_N_INSNS (1), /* variable shift costs */
949 COSTS_N_INSNS (1), /* constant shift costs */
950 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
951 COSTS_N_INSNS (3), /* HI */
952 COSTS_N_INSNS (3), /* SI */
953 COSTS_N_INSNS (3), /* DI */
954 COSTS_N_INSNS (3)}, /* other */
955 0, /* cost of multiply per each bit set */
956 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
957 COSTS_N_INSNS (18), /* HI */
958 COSTS_N_INSNS (18), /* SI */
959 COSTS_N_INSNS (18), /* DI */
960 COSTS_N_INSNS (18)}, /* other */
961 COSTS_N_INSNS (2), /* cost of movsx */
962 COSTS_N_INSNS (2), /* cost of movzx */
963 8, /* "large" insn */
964 4, /* MOVE_RATIO */
965 3, /* cost for loading QImode using movzbl */
966 {4, 5, 4}, /* cost of loading integer registers
967 in QImode, HImode and SImode.
968 Relative to reg-reg move (2). */
969 {2, 3, 2}, /* cost of storing integer registers */
970 4, /* cost of reg,reg fld/fst */
971 {6, 6, 6}, /* cost of loading fp registers
972 in SFmode, DFmode and XFmode */
973 {4, 4, 4}, /* cost of storing fp registers
974 in SFmode, DFmode and XFmode */
975 2, /* cost of moving MMX register */
976 {2, 2}, /* cost of loading MMX registers
977 in SImode and DImode */
978 {2, 2}, /* cost of storing MMX registers
979 in SImode and DImode */
980 2, /* cost of moving SSE register */
981 {2, 2, 8}, /* cost of loading SSE registers
982 in SImode, DImode and TImode */
983 {2, 2, 8}, /* cost of storing SSE registers
984 in SImode, DImode and TImode */
985 6, /* MMX or SSE register to integer */
986 32, /* size of l1 cache. */
987 32, /* size of l2 cache. Some models
988 have integrated l2 cache, but
989 optimizing for k6 is not important
990 enough to worry about that. */
991 32, /* size of prefetch block */
992 1, /* number of parallel prefetches */
993 1, /* Branch cost */
994 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
995 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
996 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
997 COSTS_N_INSNS (2), /* cost of FABS instruction. */
998 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
999 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1003 DUMMY_STRINGOP_ALGS},
1004 1, /* scalar_stmt_cost. */
1005 1, /* scalar load_cost. */
1006 1, /* scalar_store_cost. */
1007 1, /* vec_stmt_cost. */
1008 1, /* vec_to_scalar_cost. */
1009 1, /* scalar_to_vec_cost. */
1010 1, /* vec_align_load_cost. */
1011 2, /* vec_unalign_load_cost. */
1012 1, /* vec_store_cost. */
1013 3, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 static const
1018 struct processor_costs athlon_cost = {
1019 COSTS_N_INSNS (1), /* cost of an add instruction */
1020 COSTS_N_INSNS (2), /* cost of a lea instruction */
1021 COSTS_N_INSNS (1), /* variable shift costs */
1022 COSTS_N_INSNS (1), /* constant shift costs */
1023 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1024 COSTS_N_INSNS (5), /* HI */
1025 COSTS_N_INSNS (5), /* SI */
1026 COSTS_N_INSNS (5), /* DI */
1027 COSTS_N_INSNS (5)}, /* other */
1028 0, /* cost of multiply per each bit set */
1029 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1030 COSTS_N_INSNS (26), /* HI */
1031 COSTS_N_INSNS (42), /* SI */
1032 COSTS_N_INSNS (74), /* DI */
1033 COSTS_N_INSNS (74)}, /* other */
1034 COSTS_N_INSNS (1), /* cost of movsx */
1035 COSTS_N_INSNS (1), /* cost of movzx */
1036 8, /* "large" insn */
1037 9, /* MOVE_RATIO */
1038 4, /* cost for loading QImode using movzbl */
1039 {3, 4, 3}, /* cost of loading integer registers
1040 in QImode, HImode and SImode.
1041 Relative to reg-reg move (2). */
1042 {3, 4, 3}, /* cost of storing integer registers */
1043 4, /* cost of reg,reg fld/fst */
1044 {4, 4, 12}, /* cost of loading fp registers
1045 in SFmode, DFmode and XFmode */
1046 {6, 6, 8}, /* cost of storing fp registers
1047 in SFmode, DFmode and XFmode */
1048 2, /* cost of moving MMX register */
1049 {4, 4}, /* cost of loading MMX registers
1050 in SImode and DImode */
1051 {4, 4}, /* cost of storing MMX registers
1052 in SImode and DImode */
1053 2, /* cost of moving SSE register */
1054 {4, 4, 6}, /* cost of loading SSE registers
1055 in SImode, DImode and TImode */
1056 {4, 4, 5}, /* cost of storing SSE registers
1057 in SImode, DImode and TImode */
1058 5, /* MMX or SSE register to integer */
1059 64, /* size of l1 cache. */
1060 256, /* size of l2 cache. */
1061 64, /* size of prefetch block */
1062 6, /* number of parallel prefetches */
1063 5, /* Branch cost */
1064 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1065 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1066 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1067 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1068 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1069 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1070 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1071 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1072 128 bytes for memset. */
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1076 DUMMY_STRINGOP_ALGS},
1077 1, /* scalar_stmt_cost. */
1078 1, /* scalar load_cost. */
1079 1, /* scalar_store_cost. */
1080 1, /* vec_stmt_cost. */
1081 1, /* vec_to_scalar_cost. */
1082 1, /* scalar_to_vec_cost. */
1083 1, /* vec_align_load_cost. */
1084 2, /* vec_unalign_load_cost. */
1085 1, /* vec_store_cost. */
1086 3, /* cond_taken_branch_cost. */
1087 1, /* cond_not_taken_branch_cost. */
1088 };
1089
1090 static const
1091 struct processor_costs k8_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (2), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (3), /* SI */
1099 COSTS_N_INSNS (4), /* DI */
1100 COSTS_N_INSNS (5)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (26), /* HI */
1104 COSTS_N_INSNS (42), /* SI */
1105 COSTS_N_INSNS (74), /* DI */
1106 COSTS_N_INSNS (74)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {3, 4, 3}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {3, 4, 3}, /* cost of storing integer registers */
1116 4, /* cost of reg,reg fld/fst */
1117 {4, 4, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {6, 6, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {3, 3}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 3, 6}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 5}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 5, /* MMX or SSE register to integer */
1132 64, /* size of l1 cache. */
1133 512, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 3, /* Branch cost */
1142 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148 /* K8 has optimized REP instruction for medium sized blocks, but for very
1149 small blocks it is better to use loop. For large blocks, libcall can
1150 do nontemporal accesses and beat inline considerably. */
1151 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1152 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 {{libcall, {{8, loop}, {24, unrolled_loop},
1154 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1155 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1156 4, /* scalar_stmt_cost. */
1157 2, /* scalar load_cost. */
1158 2, /* scalar_store_cost. */
1159 5, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 2, /* vec_align_load_cost. */
1163 3, /* vec_unalign_load_cost. */
1164 3, /* vec_store_cost. */
1165 3, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1168
1169 struct processor_costs amdfam10_cost = {
1170 COSTS_N_INSNS (1), /* cost of an add instruction */
1171 COSTS_N_INSNS (2), /* cost of a lea instruction */
1172 COSTS_N_INSNS (1), /* variable shift costs */
1173 COSTS_N_INSNS (1), /* constant shift costs */
1174 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1175 COSTS_N_INSNS (4), /* HI */
1176 COSTS_N_INSNS (3), /* SI */
1177 COSTS_N_INSNS (4), /* DI */
1178 COSTS_N_INSNS (5)}, /* other */
1179 0, /* cost of multiply per each bit set */
1180 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1181 COSTS_N_INSNS (35), /* HI */
1182 COSTS_N_INSNS (51), /* SI */
1183 COSTS_N_INSNS (83), /* DI */
1184 COSTS_N_INSNS (83)}, /* other */
1185 COSTS_N_INSNS (1), /* cost of movsx */
1186 COSTS_N_INSNS (1), /* cost of movzx */
1187 8, /* "large" insn */
1188 9, /* MOVE_RATIO */
1189 4, /* cost for loading QImode using movzbl */
1190 {3, 4, 3}, /* cost of loading integer registers
1191 in QImode, HImode and SImode.
1192 Relative to reg-reg move (2). */
1193 {3, 4, 3}, /* cost of storing integer registers */
1194 4, /* cost of reg,reg fld/fst */
1195 {4, 4, 12}, /* cost of loading fp registers
1196 in SFmode, DFmode and XFmode */
1197 {6, 6, 8}, /* cost of storing fp registers
1198 in SFmode, DFmode and XFmode */
1199 2, /* cost of moving MMX register */
1200 {3, 3}, /* cost of loading MMX registers
1201 in SImode and DImode */
1202 {4, 4}, /* cost of storing MMX registers
1203 in SImode and DImode */
1204 2, /* cost of moving SSE register */
1205 {4, 4, 3}, /* cost of loading SSE registers
1206 in SImode, DImode and TImode */
1207 {4, 4, 5}, /* cost of storing SSE registers
1208 in SImode, DImode and TImode */
1209 3, /* MMX or SSE register to integer */
1210 /* On K8:
1211 MOVD reg64, xmmreg Double FSTORE 4
1212 MOVD reg32, xmmreg Double FSTORE 4
1213 On AMDFAM10:
1214 MOVD reg64, xmmreg Double FADD 3
1215 1/1 1/1
1216 MOVD reg32, xmmreg Double FADD 3
1217 1/1 1/1 */
1218 64, /* size of l1 cache. */
1219 512, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221 /* New AMD processors never drop prefetches; if they cannot be performed
1222 immediately, they are queued. We set number of simultaneous prefetches
1223 to a large constant to reflect this (it probably is not a good idea not
1224 to limit number of prefetches at all, as their execution also takes some
1225 time). */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234
1235 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1236 very small blocks it is better to use loop. For large blocks, libcall can
1237 do nontemporal accesses and beat inline considerably. */
1238 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1239 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1240 {{libcall, {{8, loop}, {24, unrolled_loop},
1241 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1242 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1243 4, /* scalar_stmt_cost. */
1244 2, /* scalar load_cost. */
1245 2, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 2, /* vec_align_load_cost. */
1250 2, /* vec_unalign_load_cost. */
1251 2, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 struct processor_costs bdver1_cost = {
1257 COSTS_N_INSNS (1), /* cost of an add instruction */
1258 COSTS_N_INSNS (1), /* cost of a lea instruction */
1259 COSTS_N_INSNS (1), /* variable shift costs */
1260 COSTS_N_INSNS (1), /* constant shift costs */
1261 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1262 COSTS_N_INSNS (4), /* HI */
1263 COSTS_N_INSNS (4), /* SI */
1264 COSTS_N_INSNS (6), /* DI */
1265 COSTS_N_INSNS (6)}, /* other */
1266 0, /* cost of multiply per each bit set */
1267 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1268 COSTS_N_INSNS (35), /* HI */
1269 COSTS_N_INSNS (51), /* SI */
1270 COSTS_N_INSNS (83), /* DI */
1271 COSTS_N_INSNS (83)}, /* other */
1272 COSTS_N_INSNS (1), /* cost of movsx */
1273 COSTS_N_INSNS (1), /* cost of movzx */
1274 8, /* "large" insn */
1275 9, /* MOVE_RATIO */
1276 4, /* cost for loading QImode using movzbl */
1277 {5, 5, 4}, /* cost of loading integer registers
1278 in QImode, HImode and SImode.
1279 Relative to reg-reg move (2). */
1280 {4, 4, 4}, /* cost of storing integer registers */
1281 2, /* cost of reg,reg fld/fst */
1282 {5, 5, 12}, /* cost of loading fp registers
1283 in SFmode, DFmode and XFmode */
1284 {4, 4, 8}, /* cost of storing fp registers
1285 in SFmode, DFmode and XFmode */
1286 2, /* cost of moving MMX register */
1287 {4, 4}, /* cost of loading MMX registers
1288 in SImode and DImode */
1289 {4, 4}, /* cost of storing MMX registers
1290 in SImode and DImode */
1291 2, /* cost of moving SSE register */
1292 {4, 4, 4}, /* cost of loading SSE registers
1293 in SImode, DImode and TImode */
1294 {4, 4, 4}, /* cost of storing SSE registers
1295 in SImode, DImode and TImode */
1296 2, /* MMX or SSE register to integer */
1297 /* On K8:
1298 MOVD reg64, xmmreg Double FSTORE 4
1299 MOVD reg32, xmmreg Double FSTORE 4
1300 On AMDFAM10:
1301 MOVD reg64, xmmreg Double FADD 3
1302 1/1 1/1
1303 MOVD reg32, xmmreg Double FADD 3
1304 1/1 1/1 */
1305 16, /* size of l1 cache. */
1306 2048, /* size of l2 cache. */
1307 64, /* size of prefetch block */
1308 /* New AMD processors never drop prefetches; if they cannot be performed
1309 immediately, they are queued. We set number of simultaneous prefetches
1310 to a large constant to reflect this (it probably is not a good idea not
1311 to limit number of prefetches at all, as their execution also takes some
1312 time). */
1313 100, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321
1322 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1323 very small blocks it is better to use loop. For large blocks, libcall
1324 can do nontemporal accesses and beat inline considerably. */
1325 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1326 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1327 {{libcall, {{8, loop}, {24, unrolled_loop},
1328 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1329 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 2, /* cond_taken_branch_cost. */
1340 1, /* cond_not_taken_branch_cost. */
1341 };
1342
1343 struct processor_costs bdver2_cost = {
1344 COSTS_N_INSNS (1), /* cost of an add instruction */
1345 COSTS_N_INSNS (1), /* cost of a lea instruction */
1346 COSTS_N_INSNS (1), /* variable shift costs */
1347 COSTS_N_INSNS (1), /* constant shift costs */
1348 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1349 COSTS_N_INSNS (4), /* HI */
1350 COSTS_N_INSNS (4), /* SI */
1351 COSTS_N_INSNS (6), /* DI */
1352 COSTS_N_INSNS (6)}, /* other */
1353 0, /* cost of multiply per each bit set */
1354 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1355 COSTS_N_INSNS (35), /* HI */
1356 COSTS_N_INSNS (51), /* SI */
1357 COSTS_N_INSNS (83), /* DI */
1358 COSTS_N_INSNS (83)}, /* other */
1359 COSTS_N_INSNS (1), /* cost of movsx */
1360 COSTS_N_INSNS (1), /* cost of movzx */
1361 8, /* "large" insn */
1362 9, /* MOVE_RATIO */
1363 4, /* cost for loading QImode using movzbl */
1364 {5, 5, 4}, /* cost of loading integer registers
1365 in QImode, HImode and SImode.
1366 Relative to reg-reg move (2). */
1367 {4, 4, 4}, /* cost of storing integer registers */
1368 2, /* cost of reg,reg fld/fst */
1369 {5, 5, 12}, /* cost of loading fp registers
1370 in SFmode, DFmode and XFmode */
1371 {4, 4, 8}, /* cost of storing fp registers
1372 in SFmode, DFmode and XFmode */
1373 2, /* cost of moving MMX register */
1374 {4, 4}, /* cost of loading MMX registers
1375 in SImode and DImode */
1376 {4, 4}, /* cost of storing MMX registers
1377 in SImode and DImode */
1378 2, /* cost of moving SSE register */
1379 {4, 4, 4}, /* cost of loading SSE registers
1380 in SImode, DImode and TImode */
1381 {4, 4, 4}, /* cost of storing SSE registers
1382 in SImode, DImode and TImode */
1383 2, /* MMX or SSE register to integer */
1384 /* On K8:
1385 MOVD reg64, xmmreg Double FSTORE 4
1386 MOVD reg32, xmmreg Double FSTORE 4
1387 On AMDFAM10:
1388 MOVD reg64, xmmreg Double FADD 3
1389 1/1 1/1
1390 MOVD reg32, xmmreg Double FADD 3
1391 1/1 1/1 */
1392 16, /* size of l1 cache. */
1393 2048, /* size of l2 cache. */
1394 64, /* size of prefetch block */
1395 /* New AMD processors never drop prefetches; if they cannot be performed
1396 immediately, they are queued. We set number of simultaneous prefetches
1397 to a large constant to reflect this (it probably is not a good idea not
1398 to limit number of prefetches at all, as their execution also takes some
1399 time). */
1400 100, /* number of parallel prefetches */
1401 2, /* Branch cost */
1402 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1403 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1404 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1405 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1406 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1407 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408
1409 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1410 very small blocks it is better to use loop. For large blocks, libcall
1411 can do nontemporal accesses and beat inline considerably. */
1412 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1413 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1414 {{libcall, {{8, loop}, {24, unrolled_loop},
1415 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1416 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 struct processor_costs btver1_cost = {
1431 COSTS_N_INSNS (1), /* cost of an add instruction */
1432 COSTS_N_INSNS (2), /* cost of a lea instruction */
1433 COSTS_N_INSNS (1), /* variable shift costs */
1434 COSTS_N_INSNS (1), /* constant shift costs */
1435 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1436 COSTS_N_INSNS (4), /* HI */
1437 COSTS_N_INSNS (3), /* SI */
1438 COSTS_N_INSNS (4), /* DI */
1439 COSTS_N_INSNS (5)}, /* other */
1440 0, /* cost of multiply per each bit set */
1441 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1442 COSTS_N_INSNS (35), /* HI */
1443 COSTS_N_INSNS (51), /* SI */
1444 COSTS_N_INSNS (83), /* DI */
1445 COSTS_N_INSNS (83)}, /* other */
1446 COSTS_N_INSNS (1), /* cost of movsx */
1447 COSTS_N_INSNS (1), /* cost of movzx */
1448 8, /* "large" insn */
1449 9, /* MOVE_RATIO */
1450 4, /* cost for loading QImode using movzbl */
1451 {3, 4, 3}, /* cost of loading integer registers
1452 in QImode, HImode and SImode.
1453 Relative to reg-reg move (2). */
1454 {3, 4, 3}, /* cost of storing integer registers */
1455 4, /* cost of reg,reg fld/fst */
1456 {4, 4, 12}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode */
1458 {6, 6, 8}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode */
1460 2, /* cost of moving MMX register */
1461 {3, 3}, /* cost of loading MMX registers
1462 in SImode and DImode */
1463 {4, 4}, /* cost of storing MMX registers
1464 in SImode and DImode */
1465 2, /* cost of moving SSE register */
1466 {4, 4, 3}, /* cost of loading SSE registers
1467 in SImode, DImode and TImode */
1468 {4, 4, 5}, /* cost of storing SSE registers
1469 in SImode, DImode and TImode */
1470 3, /* MMX or SSE register to integer */
1471 /* On K8:
1472 MOVD reg64, xmmreg Double FSTORE 4
1473 MOVD reg32, xmmreg Double FSTORE 4
1474 On AMDFAM10:
1475 MOVD reg64, xmmreg Double FADD 3
1476 1/1 1/1
1477 MOVD reg32, xmmreg Double FADD 3
1478 1/1 1/1 */
1479 32, /* size of l1 cache. */
1480 512, /* size of l2 cache. */
1481 64, /* size of prefetch block */
1482 100, /* number of parallel prefetches */
1483 2, /* Branch cost */
1484 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1485 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1486 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1489 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490
1491 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1492 very small blocks it is better to use loop. For large blocks, libcall can
1493 do nontemporal accesses and beat inline considerably. */
1494 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1495 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1496 {{libcall, {{8, loop}, {24, unrolled_loop},
1497 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1498 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1499 4, /* scalar_stmt_cost. */
1500 2, /* scalar load_cost. */
1501 2, /* scalar_store_cost. */
1502 6, /* vec_stmt_cost. */
1503 0, /* vec_to_scalar_cost. */
1504 2, /* scalar_to_vec_cost. */
1505 2, /* vec_align_load_cost. */
1506 2, /* vec_unalign_load_cost. */
1507 2, /* vec_store_cost. */
1508 2, /* cond_taken_branch_cost. */
1509 1, /* cond_not_taken_branch_cost. */
1510 };
1511
1512 struct processor_costs btver2_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (2), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (4), /* HI */
1519 COSTS_N_INSNS (3), /* SI */
1520 COSTS_N_INSNS (4), /* DI */
1521 COSTS_N_INSNS (5)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (35), /* HI */
1525 COSTS_N_INSNS (51), /* SI */
1526 COSTS_N_INSNS (83), /* DI */
1527 COSTS_N_INSNS (83)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 8, /* "large" insn */
1531 9, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {3, 4, 3}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {3, 4, 3}, /* cost of storing integer registers */
1537 4, /* cost of reg,reg fld/fst */
1538 {4, 4, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {6, 6, 8}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {3, 3}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {4, 4}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 2, /* cost of moving SSE register */
1548 {4, 4, 3}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {4, 4, 5}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 3, /* MMX or SSE register to integer */
1553 /* On K8:
1554 MOVD reg64, xmmreg Double FSTORE 4
1555 MOVD reg32, xmmreg Double FSTORE 4
1556 On AMDFAM10:
1557 MOVD reg64, xmmreg Double FADD 3
1558 1/1 1/1
1559 MOVD reg32, xmmreg Double FADD 3
1560 1/1 1/1 */
1561 32, /* size of l1 cache. */
1562 2048, /* size of l2 cache. */
1563 64, /* size of prefetch block */
1564 100, /* number of parallel prefetches */
1565 2, /* Branch cost */
1566 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1567 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1568 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1569 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1570 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1571 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1572
1573 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1574 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1575 {{libcall, {{8, loop}, {24, unrolled_loop},
1576 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1577 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1578 4, /* scalar_stmt_cost. */
1579 2, /* scalar load_cost. */
1580 2, /* scalar_store_cost. */
1581 6, /* vec_stmt_cost. */
1582 0, /* vec_to_scalar_cost. */
1583 2, /* scalar_to_vec_cost. */
1584 2, /* vec_align_load_cost. */
1585 2, /* vec_unalign_load_cost. */
1586 2, /* vec_store_cost. */
1587 2, /* cond_taken_branch_cost. */
1588 1, /* cond_not_taken_branch_cost. */
1589 };
1590
1591 static const
1592 struct processor_costs pentium4_cost = {
1593 COSTS_N_INSNS (1), /* cost of an add instruction */
1594 COSTS_N_INSNS (3), /* cost of a lea instruction */
1595 COSTS_N_INSNS (4), /* variable shift costs */
1596 COSTS_N_INSNS (4), /* constant shift costs */
1597 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1598 COSTS_N_INSNS (15), /* HI */
1599 COSTS_N_INSNS (15), /* SI */
1600 COSTS_N_INSNS (15), /* DI */
1601 COSTS_N_INSNS (15)}, /* other */
1602 0, /* cost of multiply per each bit set */
1603 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1604 COSTS_N_INSNS (56), /* HI */
1605 COSTS_N_INSNS (56), /* SI */
1606 COSTS_N_INSNS (56), /* DI */
1607 COSTS_N_INSNS (56)}, /* other */
1608 COSTS_N_INSNS (1), /* cost of movsx */
1609 COSTS_N_INSNS (1), /* cost of movzx */
1610 16, /* "large" insn */
1611 6, /* MOVE_RATIO */
1612 2, /* cost for loading QImode using movzbl */
1613 {4, 5, 4}, /* cost of loading integer registers
1614 in QImode, HImode and SImode.
1615 Relative to reg-reg move (2). */
1616 {2, 3, 2}, /* cost of storing integer registers */
1617 2, /* cost of reg,reg fld/fst */
1618 {2, 2, 6}, /* cost of loading fp registers
1619 in SFmode, DFmode and XFmode */
1620 {4, 4, 6}, /* cost of storing fp registers
1621 in SFmode, DFmode and XFmode */
1622 2, /* cost of moving MMX register */
1623 {2, 2}, /* cost of loading MMX registers
1624 in SImode and DImode */
1625 {2, 2}, /* cost of storing MMX registers
1626 in SImode and DImode */
1627 12, /* cost of moving SSE register */
1628 {12, 12, 12}, /* cost of loading SSE registers
1629 in SImode, DImode and TImode */
1630 {2, 2, 8}, /* cost of storing SSE registers
1631 in SImode, DImode and TImode */
1632 10, /* MMX or SSE register to integer */
1633 8, /* size of l1 cache. */
1634 256, /* size of l2 cache. */
1635 64, /* size of prefetch block */
1636 6, /* number of parallel prefetches */
1637 2, /* Branch cost */
1638 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1639 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1640 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1641 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1642 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1643 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1644 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1645 DUMMY_STRINGOP_ALGS},
1646 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1647 {-1, libcall}}},
1648 DUMMY_STRINGOP_ALGS},
1649 1, /* scalar_stmt_cost. */
1650 1, /* scalar load_cost. */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static const
1663 struct processor_costs nocona_cost = {
1664 COSTS_N_INSNS (1), /* cost of an add instruction */
1665 COSTS_N_INSNS (1), /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (10), /* HI */
1670 COSTS_N_INSNS (10), /* SI */
1671 COSTS_N_INSNS (10), /* DI */
1672 COSTS_N_INSNS (10)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (66), /* HI */
1676 COSTS_N_INSNS (66), /* SI */
1677 COSTS_N_INSNS (66), /* DI */
1678 COSTS_N_INSNS (66)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 16, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 3, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {4, 4, 4}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 6, /* cost of moving MMX register */
1694 {12, 12}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {12, 12}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 6, /* cost of moving SSE register */
1699 {12, 12, 12}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {12, 12, 12}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 8, /* MMX or SSE register to integer */
1704 8, /* size of l1 cache. */
1705 1024, /* size of l2 cache. */
1706 128, /* size of prefetch block */
1707 8, /* number of parallel prefetches */
1708 1, /* Branch cost */
1709 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1710 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1711 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1714 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1715 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1716 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1717 {100000, unrolled_loop}, {-1, libcall}}}},
1718 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1719 {-1, libcall}}},
1720 {libcall, {{24, loop}, {64, unrolled_loop},
1721 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1722 1, /* scalar_stmt_cost. */
1723 1, /* scalar load_cost. */
1724 1, /* scalar_store_cost. */
1725 1, /* vec_stmt_cost. */
1726 1, /* vec_to_scalar_cost. */
1727 1, /* scalar_to_vec_cost. */
1728 1, /* vec_align_load_cost. */
1729 2, /* vec_unalign_load_cost. */
1730 1, /* vec_store_cost. */
1731 3, /* cond_taken_branch_cost. */
1732 1, /* cond_not_taken_branch_cost. */
1733 };
1734
1735 static const
1736 struct processor_costs atom_cost = {
1737 COSTS_N_INSNS (1), /* cost of an add instruction */
1738 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1739 COSTS_N_INSNS (1), /* variable shift costs */
1740 COSTS_N_INSNS (1), /* constant shift costs */
1741 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1742 COSTS_N_INSNS (4), /* HI */
1743 COSTS_N_INSNS (3), /* SI */
1744 COSTS_N_INSNS (4), /* DI */
1745 COSTS_N_INSNS (2)}, /* other */
1746 0, /* cost of multiply per each bit set */
1747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1748 COSTS_N_INSNS (26), /* HI */
1749 COSTS_N_INSNS (42), /* SI */
1750 COSTS_N_INSNS (74), /* DI */
1751 COSTS_N_INSNS (74)}, /* other */
1752 COSTS_N_INSNS (1), /* cost of movsx */
1753 COSTS_N_INSNS (1), /* cost of movzx */
1754 8, /* "large" insn */
1755 17, /* MOVE_RATIO */
1756 4, /* cost for loading QImode using movzbl */
1757 {4, 4, 4}, /* cost of loading integer registers
1758 in QImode, HImode and SImode.
1759 Relative to reg-reg move (2). */
1760 {4, 4, 4}, /* cost of storing integer registers */
1761 4, /* cost of reg,reg fld/fst */
1762 {12, 12, 12}, /* cost of loading fp registers
1763 in SFmode, DFmode and XFmode */
1764 {6, 6, 8}, /* cost of storing fp registers
1765 in SFmode, DFmode and XFmode */
1766 2, /* cost of moving MMX register */
1767 {8, 8}, /* cost of loading MMX registers
1768 in SImode and DImode */
1769 {8, 8}, /* cost of storing MMX registers
1770 in SImode and DImode */
1771 2, /* cost of moving SSE register */
1772 {8, 8, 8}, /* cost of loading SSE registers
1773 in SImode, DImode and TImode */
1774 {8, 8, 8}, /* cost of storing SSE registers
1775 in SImode, DImode and TImode */
1776 5, /* MMX or SSE register to integer */
1777 32, /* size of l1 cache. */
1778 256, /* size of l2 cache. */
1779 64, /* size of prefetch block */
1780 6, /* number of parallel prefetches */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1789 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1790 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 {{libcall, {{8, loop}, {15, unrolled_loop},
1792 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1793 {libcall, {{24, loop}, {32, unrolled_loop},
1794 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1795 1, /* scalar_stmt_cost. */
1796 1, /* scalar load_cost. */
1797 1, /* scalar_store_cost. */
1798 1, /* vec_stmt_cost. */
1799 1, /* vec_to_scalar_cost. */
1800 1, /* scalar_to_vec_cost. */
1801 1, /* vec_align_load_cost. */
1802 2, /* vec_unalign_load_cost. */
1803 1, /* vec_store_cost. */
1804 3, /* cond_taken_branch_cost. */
1805 1, /* cond_not_taken_branch_cost. */
1806 };
1807
1808 /* Generic64 should produce code tuned for Nocona and K8. */
1809 static const
1810 struct processor_costs generic64_cost = {
1811 COSTS_N_INSNS (1), /* cost of an add instruction */
1812 /* On all chips taken into consideration lea is 2 cycles or more. With
1813 this cost, however, our current implementation of synth_mult results in
1814 the use of unnecessary temporary registers, causing regressions on several
1815 SPECfp benchmarks. */
1816 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1817 COSTS_N_INSNS (1), /* variable shift costs */
1818 COSTS_N_INSNS (1), /* constant shift costs */
1819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1820 COSTS_N_INSNS (4), /* HI */
1821 COSTS_N_INSNS (3), /* SI */
1822 COSTS_N_INSNS (4), /* DI */
1823 COSTS_N_INSNS (2)}, /* other */
1824 0, /* cost of multiply per each bit set */
1825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1826 COSTS_N_INSNS (26), /* HI */
1827 COSTS_N_INSNS (42), /* SI */
1828 COSTS_N_INSNS (74), /* DI */
1829 COSTS_N_INSNS (74)}, /* other */
1830 COSTS_N_INSNS (1), /* cost of movsx */
1831 COSTS_N_INSNS (1), /* cost of movzx */
1832 8, /* "large" insn */
1833 17, /* MOVE_RATIO */
1834 4, /* cost for loading QImode using movzbl */
1835 {4, 4, 4}, /* cost of loading integer registers
1836 in QImode, HImode and SImode.
1837 Relative to reg-reg move (2). */
1838 {4, 4, 4}, /* cost of storing integer registers */
1839 4, /* cost of reg,reg fld/fst */
1840 {12, 12, 12}, /* cost of loading fp registers
1841 in SFmode, DFmode and XFmode */
1842 {6, 6, 8}, /* cost of storing fp registers
1843 in SFmode, DFmode and XFmode */
1844 2, /* cost of moving MMX register */
1845 {8, 8}, /* cost of loading MMX registers
1846 in SImode and DImode */
1847 {8, 8}, /* cost of storing MMX registers
1848 in SImode and DImode */
1849 2, /* cost of moving SSE register */
1850 {8, 8, 8}, /* cost of loading SSE registers
1851 in SImode, DImode and TImode */
1852 {8, 8, 8}, /* cost of storing SSE registers
1853 in SImode, DImode and TImode */
1854 5, /* MMX or SSE register to integer */
1855 32, /* size of l1 cache. */
1856 512, /* size of l2 cache. */
1857 64, /* size of prefetch block */
1858 6, /* number of parallel prefetches */
1859 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1860 value is increased to the perhaps more appropriate value of 5. */
1861 3, /* Branch cost */
1862 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1863 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1864 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1865 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1866 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1867 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1868 {DUMMY_STRINGOP_ALGS,
1869 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1870 {DUMMY_STRINGOP_ALGS,
1871 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1872 1, /* scalar_stmt_cost. */
1873 1, /* scalar load_cost. */
1874 1, /* scalar_store_cost. */
1875 1, /* vec_stmt_cost. */
1876 1, /* vec_to_scalar_cost. */
1877 1, /* scalar_to_vec_cost. */
1878 1, /* vec_align_load_cost. */
1879 2, /* vec_unalign_load_cost. */
1880 1, /* vec_store_cost. */
1881 3, /* cond_taken_branch_cost. */
1882 1, /* cond_not_taken_branch_cost. */
1883 };
1884
1885 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1886 Athlon and K8. */
1887 static const
1888 struct processor_costs generic32_cost = {
1889 COSTS_N_INSNS (1), /* cost of an add instruction */
1890 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1891 COSTS_N_INSNS (1), /* variable shift costs */
1892 COSTS_N_INSNS (1), /* constant shift costs */
1893 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1894 COSTS_N_INSNS (4), /* HI */
1895 COSTS_N_INSNS (3), /* SI */
1896 COSTS_N_INSNS (4), /* DI */
1897 COSTS_N_INSNS (2)}, /* other */
1898 0, /* cost of multiply per each bit set */
1899 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1900 COSTS_N_INSNS (26), /* HI */
1901 COSTS_N_INSNS (42), /* SI */
1902 COSTS_N_INSNS (74), /* DI */
1903 COSTS_N_INSNS (74)}, /* other */
1904 COSTS_N_INSNS (1), /* cost of movsx */
1905 COSTS_N_INSNS (1), /* cost of movzx */
1906 8, /* "large" insn */
1907 17, /* MOVE_RATIO */
1908 4, /* cost for loading QImode using movzbl */
1909 {4, 4, 4}, /* cost of loading integer registers
1910 in QImode, HImode and SImode.
1911 Relative to reg-reg move (2). */
1912 {4, 4, 4}, /* cost of storing integer registers */
1913 4, /* cost of reg,reg fld/fst */
1914 {12, 12, 12}, /* cost of loading fp registers
1915 in SFmode, DFmode and XFmode */
1916 {6, 6, 8}, /* cost of storing fp registers
1917 in SFmode, DFmode and XFmode */
1918 2, /* cost of moving MMX register */
1919 {8, 8}, /* cost of loading MMX registers
1920 in SImode and DImode */
1921 {8, 8}, /* cost of storing MMX registers
1922 in SImode and DImode */
1923 2, /* cost of moving SSE register */
1924 {8, 8, 8}, /* cost of loading SSE registers
1925 in SImode, DImode and TImode */
1926 {8, 8, 8}, /* cost of storing SSE registers
1927 in SImode, DImode and TImode */
1928 5, /* MMX or SSE register to integer */
1929 32, /* size of l1 cache. */
1930 256, /* size of l2 cache. */
1931 64, /* size of prefetch block */
1932 6, /* number of parallel prefetches */
1933 3, /* Branch cost */
1934 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1935 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1936 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1937 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1938 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1939 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1941 DUMMY_STRINGOP_ALGS},
1942 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1943 DUMMY_STRINGOP_ALGS},
1944 1, /* scalar_stmt_cost. */
1945 1, /* scalar load_cost. */
1946 1, /* scalar_store_cost. */
1947 1, /* vec_stmt_cost. */
1948 1, /* vec_to_scalar_cost. */
1949 1, /* scalar_to_vec_cost. */
1950 1, /* vec_align_load_cost. */
1951 2, /* vec_unalign_load_cost. */
1952 1, /* vec_store_cost. */
1953 3, /* cond_taken_branch_cost. */
1954 1, /* cond_not_taken_branch_cost. */
1955 };
1956
1957 /* Set by -mtune. */
1958 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1959
1960 /* Set by -mtune or -Os. */
1961 const struct processor_costs *ix86_cost = &pentium_cost;
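
/* A rough illustration (not compiled) of how the cost tables above are
   consumed: the rtx-cost hooks later in this file read fields such as
   add, shift_var or shift_const through ix86_cost, and every value
   wrapped in COSTS_N_INSNS is expressed in units of a "simple"
   instruction as defined in rtl.h.  The helper below is only a sketch
   of that pattern, not code used anywhere.  */
#if 0
static int
example_shift_cost (rtx count)
{
  /* Constant and variable shifts are costed separately.  */
  return CONST_INT_P (count) ? ix86_cost->shift_const : ix86_cost->shift_var;
}
#endif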
1962
1963 /* Processor feature/optimization bitmasks. */
1964 #define m_386 (1<<PROCESSOR_I386)
1965 #define m_486 (1<<PROCESSOR_I486)
1966 #define m_PENT (1<<PROCESSOR_PENTIUM)
1967 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1968 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1969 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1970 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1971 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1972 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1973 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1974 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1975 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1976 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1977 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1978 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1979 #define m_ATOM (1<<PROCESSOR_ATOM)
1980
1981 #define m_GEODE (1<<PROCESSOR_GEODE)
1982 #define m_K6 (1<<PROCESSOR_K6)
1983 #define m_K6_GEODE (m_K6 | m_GEODE)
1984 #define m_K8 (1<<PROCESSOR_K8)
1985 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1986 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1987 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1988 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1989 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1990 #define m_BDVER (m_BDVER1 | m_BDVER2)
1991 #define m_BTVER (m_BTVER1 | m_BTVER2)
1992 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1993 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1994 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1995
1996 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1997 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1998
1999 /* Generic instruction choice should be a common subset of the supported CPUs
2000 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2001 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
2002
2003 /* Feature tests against the various tunings. */
2004 unsigned char ix86_tune_features[X86_TUNE_LAST];
2005
2006 /* Feature tests against the various tunings used to create ix86_tune_features
2007 based on the processor mask. */
2008 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2009 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
2010 negatively, so enabling it for Generic64 seems like a good code-size
2011 tradeoff. We can't enable it for 32bit generic because it does not
2012 work well with PPro based chips. */
2013 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2014
2015 /* X86_TUNE_PUSH_MEMORY */
2016 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2017
2018 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2019 m_486 | m_PENT,
2020
2021 /* X86_TUNE_UNROLL_STRLEN */
2022 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
2025 on simulation result. But after P4 was made, no performance benefit
2026 was observed with branch hints. It also increases the code size.
2027 As a result, icc never generates branch hints. */
2028 0,
2029
2030 /* X86_TUNE_DOUBLE_WITH_ADD */
2031 ~m_386,
2032
2033 /* X86_TUNE_USE_SAHF */
2034 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
2035
2036 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2037 partial dependencies. */
2038 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2039
2040 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2041 register stalls on the Generic32 compilation setting as well. However,
2042 in the current implementation the partial register stalls are not eliminated
2043 very well - they can be introduced via subregs synthesized by combine
2044 and can happen in caller/callee saving sequences. Because this option
2045 pays back little on PPro based chips and is in conflict with partial reg
2046 dependencies used by Athlon/P4 based chips, it is better to leave it off
2047 for generic32 for now. */
2048 m_PPRO,
2049
2050 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2051 m_CORE2I7 | m_GENERIC,
2052
2053 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
2054 on 16-bit immediate moves into memory on Core2 and Corei7. */
2055 m_CORE2I7 | m_GENERIC,
2056
2057 /* X86_TUNE_USE_HIMODE_FIOP */
2058 m_386 | m_486 | m_K6_GEODE,
2059
2060 /* X86_TUNE_USE_SIMODE_FIOP */
2061 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2062
2063 /* X86_TUNE_USE_MOV0 */
2064 m_K6,
2065
2066 /* X86_TUNE_USE_CLTD */
2067 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2068
2069 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2070 m_PENT4,
2071
2072 /* X86_TUNE_SPLIT_LONG_MOVES */
2073 m_PPRO,
2074
2075 /* X86_TUNE_READ_MODIFY_WRITE */
2076 ~m_PENT,
2077
2078 /* X86_TUNE_READ_MODIFY */
2079 ~(m_PENT | m_PPRO),
2080
2081 /* X86_TUNE_PROMOTE_QIMODE */
2082 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2083
2084 /* X86_TUNE_FAST_PREFIX */
2085 ~(m_386 | m_486 | m_PENT),
2086
2087 /* X86_TUNE_SINGLE_STRINGOP */
2088 m_386 | m_P4_NOCONA,
2089
2090 /* X86_TUNE_QIMODE_MATH */
2091 ~0,
2092
2093 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2094 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2095 might be considered for Generic32 if our scheme for avoiding partial
2096 stalls was more effective. */
2097 ~m_PPRO,
2098
2099 /* X86_TUNE_PROMOTE_QI_REGS */
2100 0,
2101
2102 /* X86_TUNE_PROMOTE_HI_REGS */
2103 m_PPRO,
2104
2105 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2106 over esp addition. */
2107 m_386 | m_486 | m_PENT | m_PPRO,
2108
2109 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2110 over esp addition. */
2111 m_PENT,
2112
2113 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2114 over esp subtraction. */
2115 m_386 | m_486 | m_PENT | m_K6_GEODE,
2116
2117 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2118 over esp subtraction. */
2119 m_PENT | m_K6_GEODE,
2120
2121 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2122 for DFmode copies */
2123 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2124
2125 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2126 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2127
2128 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2129 conflict here in between PPro/Pentium4 based chips that treat 128bit
2130 SSE registers as single units versus K8 based chips that divide SSE
2131 registers into two 64bit halves. This knob promotes all store destinations
2132 to be 128bit to allow register renaming on 128bit SSE units, but usually
2133 results in one extra microop on 64bit SSE units. Experimental results
2134 show that disabling this option on P4 brings over 20% SPECfp regression,
2135 while enabling it on K8 brings roughly 2.4% regression that can be partly
2136 masked by careful scheduling of moves. */
2137 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2138
2139 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2140 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
2141
2142 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2143 m_COREI7 | m_BDVER,
2144
2145 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2146 m_BDVER,
2147
2148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2149 are resolved on SSE register parts instead of whole registers, so we may
2150 maintain just lower part of scalar values in proper format leaving the
2151 upper part undefined. */
2152 m_ATHLON_K8,
2153
2154 /* X86_TUNE_SSE_TYPELESS_STORES */
2155 m_AMD_MULTIPLE,
2156
2157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2158 m_PPRO | m_P4_NOCONA,
2159
2160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2161 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2162
2163 /* X86_TUNE_PROLOGUE_USING_MOVE */
2164 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2165
2166 /* X86_TUNE_EPILOGUE_USING_MOVE */
2167 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2168
2169 /* X86_TUNE_SHIFT1 */
2170 ~m_486,
2171
2172 /* X86_TUNE_USE_FFREEP */
2173 m_AMD_MULTIPLE,
2174
2175 /* X86_TUNE_INTER_UNIT_MOVES */
2176 ~(m_AMD_MULTIPLE | m_GENERIC),
2177
2178 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2179 ~(m_AMDFAM10 | m_BDVER),
2180
2181 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2182 than 4 branch instructions in the 16 byte window. */
2183 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2184
2185 /* X86_TUNE_SCHEDULE */
2186 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2187
2188 /* X86_TUNE_USE_BT */
2189 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2190
2191 /* X86_TUNE_USE_INCDEC */
2192 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2193
2194 /* X86_TUNE_PAD_RETURNS */
2195 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2196
2197 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2198 m_ATOM,
2199
2200 /* X86_TUNE_EXT_80387_CONSTANTS */
2201 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2202
2203 /* X86_TUNE_SHORTEN_X87_SSE */
2204 ~m_K8,
2205
2206 /* X86_TUNE_AVOID_VECTOR_DECODE */
2207 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2208
2209 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
2210 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2211 ~(m_386 | m_486),
2212
2213 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2214 vector path on AMD machines. */
2215 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2216
2217 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2218 machines. */
2219 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2220
2221 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2222 than a MOV. */
2223 m_PENT,
2224
2225 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2226 but one byte longer. */
2227 m_PENT,
2228
2229 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2230 operand that cannot be represented using a modRM byte. The XOR
2231 replacement is long decoded, so this split helps here as well. */
2232 m_K6,
2233
2234 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2235 from FP to FP. */
2236 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2237
2238 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2239 from integer to FP. */
2240 m_AMDFAM10,
2241
2242 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2243 with a subsequent conditional jump instruction into a single
2244 compare-and-branch uop. */
2245 m_BDVER,
2246
2247 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2248 will impact LEA instruction selection. */
2249 m_ATOM,
2250
2251 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2252 instructions. */
2253 ~m_ATOM,
2254
2255 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2256 at -O3. For the moment, the prefetching seems badly tuned for Intel
2257 chips. */
2258 m_K6_GEODE | m_AMD_MULTIPLE,
2259
2260 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2261 the auto-vectorizer. */
2262 m_BDVER,
2263
2264 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2265 during reassociation of integer computation. */
2266 m_ATOM,
2267
2268 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2269 during reassociation of fp computation. */
2270 m_ATOM
2271 };
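
/* How this table is used, roughly: once the tuning model is chosen,
   the option override code folds each entry against the bit for the
   selected processor, along the lines of the sketch below (an
   illustration only, not the code that actually runs here).  */
#if 0
static void
example_init_tune_features (void)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
}
#endif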
2272
2273 /* Feature tests against the various architecture variations. */
2274 unsigned char ix86_arch_features[X86_ARCH_LAST];
2275
2276 /* Feature tests against the various architecture variations, used to create
2277 ix86_arch_features based on the processor mask. */
2278 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2279 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2280 ~(m_386 | m_486 | m_PENT | m_K6),
2281
2282 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2283 ~m_386,
2284
2285 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2286 ~(m_386 | m_486),
2287
2288 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2289 ~m_386,
2290
2291 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2292 ~m_386,
2293 };
2294
2295 static const unsigned int x86_accumulate_outgoing_args
2296 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2297
2298 static const unsigned int x86_arch_always_fancy_math_387
2299 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2300
2301 static const unsigned int x86_avx256_split_unaligned_load
2302 = m_COREI7 | m_GENERIC;
2303
2304 static const unsigned int x86_avx256_split_unaligned_store
2305 = m_COREI7 | m_BDVER | m_GENERIC;
2306
2307 /* In case the average insn count for a single function invocation is
2308 lower than this constant, emit fast (but longer) prologue and
2309 epilogue code. */
2310 #define FAST_PROLOGUE_INSN_COUNT 20
2311
2312 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2313 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2314 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2315 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2316
2317 /* Array of the smallest class containing reg number REGNO, indexed by
2318 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2319
2320 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2321 {
2322 /* ax, dx, cx, bx */
2323 AREG, DREG, CREG, BREG,
2324 /* si, di, bp, sp */
2325 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2326 /* FP registers */
2327 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2328 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2329 /* arg pointer */
2330 NON_Q_REGS,
2331 /* flags, fpsr, fpcr, frame */
2332 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2333 /* SSE registers */
2334 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2335 SSE_REGS, SSE_REGS,
2336 /* MMX registers */
2337 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2338 MMX_REGS, MMX_REGS,
2339 /* REX registers */
2340 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2341 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2342 /* SSE REX registers */
2343 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2344 SSE_REGS, SSE_REGS,
2345 };
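
/* For example, regclass_map[0] is AREG, so REGNO_REG_CLASS (AX_REG)
   yields AREG, and only the first SSE entry is SSE_FIRST_REG, so only
   the first XMM register gets that single-register class.  */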
2346
2347 /* The "default" register map used in 32bit mode. */
2348
2349 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2350 {
2351 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2352 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2353 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2354 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2355 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2356 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2357 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2358 };
2359
2360 /* The "default" register map used in 64bit mode. */
2361
2362 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2363 {
2364 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2365 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2366 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2367 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2368 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2369 8,9,10,11,12,13,14,15, /* extended integer registers */
2370 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2371 };
2372
2373 /* Define the register numbers to be used in Dwarf debugging information.
2374 The SVR4 reference port C compiler uses the following register numbers
2375 in its Dwarf output code:
2376 0 for %eax (gcc regno = 0)
2377 1 for %ecx (gcc regno = 2)
2378 2 for %edx (gcc regno = 1)
2379 3 for %ebx (gcc regno = 3)
2380 4 for %esp (gcc regno = 7)
2381 5 for %ebp (gcc regno = 6)
2382 6 for %esi (gcc regno = 4)
2383 7 for %edi (gcc regno = 5)
2384 The following three DWARF register numbers are never generated by
2385 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2386 believes these numbers have these meanings.
2387 8 for %eip (no gcc equivalent)
2388 9 for %eflags (gcc regno = 17)
2389 10 for %trapno (no gcc equivalent)
2390 It is not at all clear how we should number the FP stack registers
2391 for the x86 architecture. If the version of SDB on x86/svr4 were
2392 a bit less brain dead with respect to floating-point then we would
2393 have a precedent to follow with respect to DWARF register numbers
2394 for x86 FP registers, but the SDB on x86/svr4 is so completely
2395 broken with respect to FP registers that it is hardly worth thinking
2396 of it as something to strive for compatibility with.
2397 The version of x86/svr4 SDB I have at the moment does (partially)
2398 seem to believe that DWARF register number 11 is associated with
2399 the x86 register %st(0), but that's about all. Higher DWARF
2400 register numbers don't seem to be associated with anything in
2401 particular, and even for DWARF regno 11, SDB only seems to under-
2402 stand that it should say that a variable lives in %st(0) (when
2403 asked via an `=' command) if we said it was in DWARF regno 11,
2404 but SDB still prints garbage when asked for the value of the
2405 variable in question (via a `/' command).
2406 (Also note that the labels SDB prints for various FP stack regs
2407 when doing an `x' command are all wrong.)
2408 Note that these problems generally don't affect the native SVR4
2409 C compiler because it doesn't allow the use of -O with -g and
2410 because when it is *not* optimizing, it allocates a memory
2411 location for each floating-point variable, and the memory
2412 location is what gets described in the DWARF AT_location
2413 attribute for the variable in question.
2414 Regardless of the severe mental illness of the x86/svr4 SDB, we
2415 do something sensible here and we use the following DWARF
2416 register numbers. Note that these are all stack-top-relative
2417 numbers.
2418 11 for %st(0) (gcc regno = 8)
2419 12 for %st(1) (gcc regno = 9)
2420 13 for %st(2) (gcc regno = 10)
2421 14 for %st(3) (gcc regno = 11)
2422 15 for %st(4) (gcc regno = 12)
2423 16 for %st(5) (gcc regno = 13)
2424 17 for %st(6) (gcc regno = 14)
2425 18 for %st(7) (gcc regno = 15)
2426 */
2427 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2428 {
2429 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2430 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2431 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2432 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2433 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2434 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2435 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2436 };
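
/* Reading the table above against the numbering in the big comment:
   %esi is gcc regno 4 and svr4_dbx_register_map[4] is 6, matching
   "6 for %esi", while the FP stack regs (gcc regnos 8-15) map to
   DWARF numbers 11-18.  */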
2437
2438 /* Define parameter passing and return registers. */
2439
2440 static int const x86_64_int_parameter_registers[6] =
2441 {
2442 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2443 };
2444
2445 static int const x86_64_ms_abi_int_parameter_registers[4] =
2446 {
2447 CX_REG, DX_REG, R8_REG, R9_REG
2448 };
2449
2450 static int const x86_64_int_return_registers[4] =
2451 {
2452 AX_REG, DX_REG, DI_REG, SI_REG
2453 };
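
/* Under the SysV x86-64 ABI this means, for example, that in
   "long f (long a, long b)" the argument a arrives in %rdi and b in
   %rsi, whereas the MS ABI variant uses %rcx and %rdx instead; the
   first integer return value comes back in %rax in either case.  */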
2454
2455 /* Define the structure for the machine field in struct function. */
2456
2457 struct GTY(()) stack_local_entry {
2458 unsigned short mode;
2459 unsigned short n;
2460 rtx rtl;
2461 struct stack_local_entry *next;
2462 };
2463
2464 /* Structure describing stack frame layout.
2465 Stack grows downward:
2466
2467 [arguments]
2468 <- ARG_POINTER
2469 saved pc
2470
2471 saved static chain if ix86_static_chain_on_stack
2472
2473 saved frame pointer if frame_pointer_needed
2474 <- HARD_FRAME_POINTER
2475 [saved regs]
2476 <- regs_save_offset
2477 [padding0]
2478
2479 [saved SSE regs]
2480 <- sse_regs_save_offset
2481 [padding1] |
2482 | <- FRAME_POINTER
2483 [va_arg registers] |
2484 |
2485 [frame] |
2486 |
2487 [padding2] | = to_allocate
2488 <- STACK_POINTER
2489 */
2490 struct ix86_frame
2491 {
2492 int nsseregs;
2493 int nregs;
2494 int va_arg_size;
2495 int red_zone_size;
2496 int outgoing_arguments_size;
2497
2498 /* The offsets relative to ARG_POINTER. */
2499 HOST_WIDE_INT frame_pointer_offset;
2500 HOST_WIDE_INT hard_frame_pointer_offset;
2501 HOST_WIDE_INT stack_pointer_offset;
2502 HOST_WIDE_INT hfp_save_offset;
2503 HOST_WIDE_INT reg_save_offset;
2504 HOST_WIDE_INT sse_reg_save_offset;
2505
2506 /* When save_regs_using_mov is set, emit prologue using
2507 move instead of push instructions. */
2508 bool save_regs_using_mov;
2509 };
2510
2511 /* Which cpu are we scheduling for. */
2512 enum attr_cpu ix86_schedule;
2513
2514 /* Which cpu are we optimizing for. */
2515 enum processor_type ix86_tune;
2516
2517 /* Which instruction set architecture to use. */
2518 enum processor_type ix86_arch;
2519
2520 /* True if the SSE prefetch instruction is not a NOP. */
2521 int x86_prefetch_sse;
2522
2523 /* -mstackrealign option */
2524 static const char ix86_force_align_arg_pointer_string[]
2525 = "force_align_arg_pointer";
2526
2527 static rtx (*ix86_gen_leave) (void);
2528 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2529 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2530 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2531 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2532 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2533 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2534 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2535 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2536 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2537 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2538 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2539
2540 /* Preferred alignment for stack boundary in bits. */
2541 unsigned int ix86_preferred_stack_boundary;
2542
2543 /* Alignment for incoming stack boundary in bits specified at
2544 command line. */
2545 static unsigned int ix86_user_incoming_stack_boundary;
2546
2547 /* Default alignment for incoming stack boundary in bits. */
2548 static unsigned int ix86_default_incoming_stack_boundary;
2549
2550 /* Alignment for incoming stack boundary in bits. */
2551 unsigned int ix86_incoming_stack_boundary;
2552
2553 /* Calling abi specific va_list type nodes. */
2554 static GTY(()) tree sysv_va_list_type_node;
2555 static GTY(()) tree ms_va_list_type_node;
2556
2557 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2558 char internal_label_prefix[16];
2559 int internal_label_prefix_len;
2560
2561 /* Fence to use after loop using movnt. */
2562 tree x86_mfence;
2563
2564 /* Register class used for passing a given 64bit part of the argument.
2565 These represent classes as documented by the psABI, with the exception
2566 of the SSESF and SSEDF classes, which are basically the SSE class, except
2567 that gcc will use an SF or DFmode move instead of DImode to avoid
2568 reformatting penalties.
2569 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2570 whenever possible (the upper half does contain padding). */
2571 enum x86_64_reg_class
2572 {
2573 X86_64_NO_CLASS,
2574 X86_64_INTEGER_CLASS,
2575 X86_64_INTEGERSI_CLASS,
2576 X86_64_SSE_CLASS,
2577 X86_64_SSESF_CLASS,
2578 X86_64_SSEDF_CLASS,
2579 X86_64_SSEUP_CLASS,
2580 X86_64_X87_CLASS,
2581 X86_64_X87UP_CLASS,
2582 X86_64_COMPLEX_X87_CLASS,
2583 X86_64_MEMORY_CLASS
2584 };
2585
2586 #define MAX_CLASSES 4
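
/* As an illustration of the psABI classification these enumerators
   implement: a 16-byte "struct { double d; long l; }" passed by value
   is split into two eightbytes, the first getting an SSE class (SSEDF
   in this scheme, so it travels in an XMM register) and the second
   X86_64_INTEGER_CLASS (a GPR).  MAX_CLASSES is 4 because the largest
   object classified in registers occupies four eightbytes.  */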
2587
2588 /* Table of constants used by fldpi, fldln2, etc.... */
2589 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2590 static bool ext_80387_constants_init = 0;
2591
2592 \f
2593 static struct machine_function * ix86_init_machine_status (void);
2594 static rtx ix86_function_value (const_tree, const_tree, bool);
2595 static bool ix86_function_value_regno_p (const unsigned int);
2596 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2597 const_tree);
2598 static rtx ix86_static_chain (const_tree, bool);
2599 static int ix86_function_regparm (const_tree, const_tree);
2600 static void ix86_compute_frame_layout (struct ix86_frame *);
2601 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2602 rtx, rtx, int);
2603 static void ix86_add_new_builtins (HOST_WIDE_INT);
2604 static tree ix86_canonical_va_list_type (tree);
2605 static void predict_jump (int);
2606 static unsigned int split_stack_prologue_scratch_regno (void);
2607 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2608
2609 enum ix86_function_specific_strings
2610 {
2611 IX86_FUNCTION_SPECIFIC_ARCH,
2612 IX86_FUNCTION_SPECIFIC_TUNE,
2613 IX86_FUNCTION_SPECIFIC_MAX
2614 };
2615
2616 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2617 const char *, enum fpmath_unit, bool);
2618 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2619 static void ix86_function_specific_save (struct cl_target_option *);
2620 static void ix86_function_specific_restore (struct cl_target_option *);
2621 static void ix86_function_specific_print (FILE *, int,
2622 struct cl_target_option *);
2623 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2624 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2625 struct gcc_options *);
2626 static bool ix86_can_inline_p (tree, tree);
2627 static void ix86_set_current_function (tree);
2628 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2629
2630 static enum calling_abi ix86_function_abi (const_tree);
2631
2632 \f
2633 #ifndef SUBTARGET32_DEFAULT_CPU
2634 #define SUBTARGET32_DEFAULT_CPU "i386"
2635 #endif
2636
2637 /* The svr4 ABI for the i386 says that records and unions are returned
2638 in memory. */
2639 #ifndef DEFAULT_PCC_STRUCT_RETURN
2640 #define DEFAULT_PCC_STRUCT_RETURN 1
2641 #endif
2642
2643 /* Whether -mtune= or -march= were specified */
2644 static int ix86_tune_defaulted;
2645 static int ix86_arch_specified;
2646
2647 /* Vectorization library interface and handlers. */
2648 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2649
2650 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2651 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2652
2653 /* Processor target table, indexed by processor number */
2654 struct ptt
2655 {
2656 const struct processor_costs *cost; /* Processor costs */
2657 const int align_loop; /* Default alignments. */
2658 const int align_loop_max_skip;
2659 const int align_jump;
2660 const int align_jump_max_skip;
2661 const int align_func;
2662 };
2663
2664 static const struct ptt processor_target_table[PROCESSOR_max] =
2665 {
2666 {&i386_cost, 4, 3, 4, 3, 4},
2667 {&i486_cost, 16, 15, 16, 15, 16},
2668 {&pentium_cost, 16, 7, 16, 7, 16},
2669 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2670 {&geode_cost, 0, 0, 0, 0, 0},
2671 {&k6_cost, 32, 7, 32, 7, 32},
2672 {&athlon_cost, 16, 7, 16, 7, 16},
2673 {&pentium4_cost, 0, 0, 0, 0, 0},
2674 {&k8_cost, 16, 7, 16, 7, 16},
2675 {&nocona_cost, 0, 0, 0, 0, 0},
2676 /* Core 2 32-bit. */
2677 {&generic32_cost, 16, 10, 16, 10, 16},
2678 /* Core 2 64-bit. */
2679 {&generic64_cost, 16, 10, 16, 10, 16},
2680 /* Core i7 32-bit. */
2681 {&generic32_cost, 16, 10, 16, 10, 16},
2682 /* Core i7 64-bit. */
2683 {&generic64_cost, 16, 10, 16, 10, 16},
2684 {&generic32_cost, 16, 7, 16, 7, 16},
2685 {&generic64_cost, 16, 10, 16, 10, 16},
2686 {&amdfam10_cost, 32, 24, 32, 7, 32},
2687 {&bdver1_cost, 32, 24, 32, 7, 32},
2688 {&bdver2_cost, 32, 24, 32, 7, 32},
2689 {&btver1_cost, 32, 24, 32, 7, 32},
2690 {&btver2_cost, 32, 24, 32, 7, 32},
2691 {&atom_cost, 16, 15, 16, 7, 16}
2692 };
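
/* These per-processor alignment values are, roughly, what the option
   override code later in this file uses to seed the -falign-loops,
   -falign-jumps and -falign-functions defaults (and their max-skip
   variants) when the user has not set them explicitly; the table is
   indexed by the PROCESSOR_* value chosen for -mtune.  */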
2693
2694 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2695 {
2696 "generic",
2697 "i386",
2698 "i486",
2699 "pentium",
2700 "pentium-mmx",
2701 "pentiumpro",
2702 "pentium2",
2703 "pentium3",
2704 "pentium4",
2705 "pentium-m",
2706 "prescott",
2707 "nocona",
2708 "core2",
2709 "corei7",
2710 "atom",
2711 "geode",
2712 "k6",
2713 "k6-2",
2714 "k6-3",
2715 "athlon",
2716 "athlon-4",
2717 "k8",
2718 "amdfam10",
2719 "bdver1",
2720 "bdver2",
2721 "btver1",
2722 "btver2"
2723 };
2724 \f
2725 /* Return true if a red-zone is in use. */
2726
2727 static inline bool
2728 ix86_using_red_zone (void)
2729 {
2730 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2731 }
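
/* The red zone is the 128-byte area below the stack pointer that the
   SysV x86-64 ABI guarantees will not be clobbered by signal or
   interrupt handlers, so leaf functions may use it without adjusting
   %rsp; the MS ABI makes no such guarantee, hence the check above.  */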
2732 \f
2733 /* Return a string that documents the current -m options. The caller is
2734 responsible for freeing the string. */
2735
2736 static char *
2737 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2738 const char *tune, enum fpmath_unit fpmath,
2739 bool add_nl_p)
2740 {
2741 struct ix86_target_opts
2742 {
2743 const char *option; /* option string */
2744 HOST_WIDE_INT mask; /* isa mask options */
2745 };
2746
2747 /* This table is ordered so that options like -msse4.2 that imply
2748 preceding options are matched first. */
2749 static struct ix86_target_opts isa_opts[] =
2750 {
2751 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2752 { "-mfma", OPTION_MASK_ISA_FMA },
2753 { "-mxop", OPTION_MASK_ISA_XOP },
2754 { "-mlwp", OPTION_MASK_ISA_LWP },
2755 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2756 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2757 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2758 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2759 { "-msse3", OPTION_MASK_ISA_SSE3 },
2760 { "-msse2", OPTION_MASK_ISA_SSE2 },
2761 { "-msse", OPTION_MASK_ISA_SSE },
2762 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2763 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2764 { "-mmmx", OPTION_MASK_ISA_MMX },
2765 { "-mabm", OPTION_MASK_ISA_ABM },
2766 { "-mbmi", OPTION_MASK_ISA_BMI },
2767 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2768 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2769 { "-mhle", OPTION_MASK_ISA_HLE },
2770 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2771 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2772 { "-madx", OPTION_MASK_ISA_ADX },
2773 { "-mtbm", OPTION_MASK_ISA_TBM },
2774 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2775 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2776 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2777 { "-maes", OPTION_MASK_ISA_AES },
2778 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2779 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2780 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2781 { "-mf16c", OPTION_MASK_ISA_F16C },
2782 { "-mrtm", OPTION_MASK_ISA_RTM },
2783 };
2784
2785 /* Flag options. */
2786 static struct ix86_target_opts flag_opts[] =
2787 {
2788 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2789 { "-m80387", MASK_80387 },
2790 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2791 { "-malign-double", MASK_ALIGN_DOUBLE },
2792 { "-mcld", MASK_CLD },
2793 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2794 { "-mieee-fp", MASK_IEEE_FP },
2795 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2796 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2797 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2798 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2799 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2800 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2801 { "-mno-red-zone", MASK_NO_RED_ZONE },
2802 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2803 { "-mrecip", MASK_RECIP },
2804 { "-mrtd", MASK_RTD },
2805 { "-msseregparm", MASK_SSEREGPARM },
2806 { "-mstack-arg-probe", MASK_STACK_PROBE },
2807 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2808 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2809 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2810 { "-mvzeroupper", MASK_VZEROUPPER },
2811 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2812 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2813 { "-mprefer-avx128", MASK_PREFER_AVX128},
2814 };
2815
2816 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2817
2818 char isa_other[40];
2819 char target_other[40];
2820 unsigned num = 0;
2821 unsigned i, j;
2822 char *ret;
2823 char *ptr;
2824 size_t len;
2825 size_t line_len;
2826 size_t sep_len;
2827 const char *abi;
2828
2829 memset (opts, '\0', sizeof (opts));
2830
2831 /* Add -march= option. */
2832 if (arch)
2833 {
2834 opts[num][0] = "-march=";
2835 opts[num++][1] = arch;
2836 }
2837
2838 /* Add -mtune= option. */
2839 if (tune)
2840 {
2841 opts[num][0] = "-mtune=";
2842 opts[num++][1] = tune;
2843 }
2844
2845 /* Add -m32/-m64/-mx32. */
2846 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2847 {
2848 if ((isa & OPTION_MASK_ABI_64) != 0)
2849 abi = "-m64";
2850 else
2851 abi = "-mx32";
2852 isa &= ~ (OPTION_MASK_ISA_64BIT
2853 | OPTION_MASK_ABI_64
2854 | OPTION_MASK_ABI_X32);
2855 }
2856 else
2857 abi = "-m32";
2858 opts[num++][0] = abi;
2859
2860 /* Pick out the options in isa options. */
2861 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2862 {
2863 if ((isa & isa_opts[i].mask) != 0)
2864 {
2865 opts[num++][0] = isa_opts[i].option;
2866 isa &= ~ isa_opts[i].mask;
2867 }
2868 }
2869
2870 if (isa && add_nl_p)
2871 {
2872 opts[num++][0] = isa_other;
2873 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2874 isa);
2875 }
2876
2877 /* Add flag options. */
2878 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2879 {
2880 if ((flags & flag_opts[i].mask) != 0)
2881 {
2882 opts[num++][0] = flag_opts[i].option;
2883 flags &= ~ flag_opts[i].mask;
2884 }
2885 }
2886
2887 if (flags && add_nl_p)
2888 {
2889 opts[num++][0] = target_other;
2890 sprintf (target_other, "(other flags: %#x)", flags);
2891 }
2892
2893 /* Add -fpmath= option. */
2894 if (fpmath)
2895 {
2896 opts[num][0] = "-mfpmath=";
2897 switch ((int) fpmath)
2898 {
2899 case FPMATH_387:
2900 opts[num++][1] = "387";
2901 break;
2902
2903 case FPMATH_SSE:
2904 opts[num++][1] = "sse";
2905 break;
2906
2907 case FPMATH_387 | FPMATH_SSE:
2908 opts[num++][1] = "sse+387";
2909 break;
2910
2911 default:
2912 gcc_unreachable ();
2913 }
2914 }
2915
2916 /* Any options? */
2917 if (num == 0)
2918 return NULL;
2919
2920 gcc_assert (num < ARRAY_SIZE (opts));
2921
2922 /* Size the string. */
2923 len = 0;
2924 sep_len = (add_nl_p) ? 3 : 1;
2925 for (i = 0; i < num; i++)
2926 {
2927 len += sep_len;
2928 for (j = 0; j < 2; j++)
2929 if (opts[i][j])
2930 len += strlen (opts[i][j]);
2931 }
2932
2933 /* Build the string. */
2934 ret = ptr = (char *) xmalloc (len);
2935 line_len = 0;
2936
2937 for (i = 0; i < num; i++)
2938 {
2939 size_t len2[2];
2940
2941 for (j = 0; j < 2; j++)
2942 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2943
2944 if (i != 0)
2945 {
2946 *ptr++ = ' ';
2947 line_len++;
2948
2949 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2950 {
2951 *ptr++ = '\\';
2952 *ptr++ = '\n';
2953 line_len = 0;
2954 }
2955 }
2956
2957 for (j = 0; j < 2; j++)
2958 if (opts[i][j])
2959 {
2960 memcpy (ptr, opts[i][j], len2[j]);
2961 ptr += len2[j];
2962 line_len += len2[j];
2963 }
2964 }
2965
2966 *ptr = '\0';
2967 gcc_assert (ret + len >= ptr);
2968
2969 return ret;
2970 }
2971
2972 /* Return true if profiling code should be emitted before the
2973 prologue, which on x86 is the case only when -mfentry is in
2974 effect; otherwise return false. */
2975 static bool
2976 ix86_profile_before_prologue (void)
2977 {
2978 return flag_fentry != 0;
2979 }
2980
2981 /* Function that is callable from the debugger to print the current
2982 options. */
2983 void
2984 ix86_debug_options (void)
2985 {
2986 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2987 ix86_arch_string, ix86_tune_string,
2988 ix86_fpmath, true);
2989
2990 if (opts)
2991 {
2992 fprintf (stderr, "%s\n\n", opts);
2993 free (opts);
2994 }
2995 else
2996 fputs ("<no options>\n\n", stderr);
2997
2998 return;
2999 }
3000 \f
3001 /* Override various settings based on options. If MAIN_ARGS_P, the
3002 options are from the command line, otherwise they are from
3003 attributes. */
3004
3005 static void
3006 ix86_option_override_internal (bool main_args_p)
3007 {
3008 int i;
3009 unsigned int ix86_arch_mask, ix86_tune_mask;
3010 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3011 const char *prefix;
3012 const char *suffix;
3013 const char *sw;
3014
3015 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3016 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3017 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3018 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3019 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3020 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3021 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3022 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3023 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3024 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3025 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3026 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3027 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3028 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3029 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3030 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3031 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3032 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3033 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3034 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3035 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3036 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3037 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3038 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3039 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3040 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3041 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3042 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3043 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3044 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3045 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3046 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3047 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3048 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3049 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3050 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3051 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3052 /* if this reaches 64, need to widen struct pta flags below */
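
/* A sketch (not compiled) of how these PTA_* bits are typically
   consumed once -march= has selected an entry from the alias table
   below: each PTA flag the chosen processor advertises turns on the
   matching ISA option unless the user set that option explicitly.  The
   names mirror the real handling further down in this function but the
   fragment is only illustrative.  */
#if 0
	if (processor_alias_table[i].flags & PTA_SSSE3
	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
	  ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
#endif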
3053
3054 static struct pta
3055 {
3056 const char *const name; /* processor name or nickname. */
3057 const enum processor_type processor;
3058 const enum attr_cpu schedule;
3059 const unsigned HOST_WIDE_INT flags;
3060 }
3061 const processor_alias_table[] =
3062 {
3063 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3064 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3065 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3066 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3067 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3068 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3069 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3070 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3071 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3072 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3073 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3074 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
3075 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3076 PTA_MMX | PTA_SSE},
3077 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3078 PTA_MMX | PTA_SSE},
3079 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3080 PTA_MMX | PTA_SSE | PTA_SSE2},
3081 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3082 PTA_MMX |PTA_SSE | PTA_SSE2},
3083 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3084 PTA_MMX | PTA_SSE | PTA_SSE2},
3085 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3086 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
3087 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3088 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3089 | PTA_CX16 | PTA_NO_SAHF},
3090 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3091 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3092 | PTA_SSSE3 | PTA_CX16},
3093 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3094 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3095 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3096 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3097 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3098 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3099 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3100 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3101 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3102 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3103 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3104 | PTA_RDRND | PTA_F16C},
3105 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3108 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3109 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3110 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3111 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3112 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3113 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3114 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3115 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3116 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3117 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3118 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3119 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3120 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3121 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3122 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3123 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3124 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3125 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3126 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3127 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3128 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3129 {"x86-64", PROCESSOR_K8, CPU_K8,
3130 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3131 {"k8", PROCESSOR_K8, CPU_K8,
3132 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3133 | PTA_SSE2 | PTA_NO_SAHF},
3134 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3135 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3136 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3137 {"opteron", PROCESSOR_K8, CPU_K8,
3138 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3139 | PTA_SSE2 | PTA_NO_SAHF},
3140 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3141 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3142 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3143 {"athlon64", PROCESSOR_K8, CPU_K8,
3144 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3145 | PTA_SSE2 | PTA_NO_SAHF},
3146 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3147 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3148 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3149 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3150 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3151 | PTA_SSE2 | PTA_NO_SAHF},
3152 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3153 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3154 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3155 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3156 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3157 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3158 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3159 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3160 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3161 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3162 | PTA_XOP | PTA_LWP},
3163 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3164 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3165 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3166 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3167 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3168 | PTA_FMA},
3169 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3170 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3171 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3172 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3173 PTA_HLE /* flags are only used for -march switch. */ },
3174 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
3175 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3176 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3177 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3178 | PTA_BMI | PTA_F16C | PTA_MOVBE},
3179 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3180 PTA_64BIT
3181 | PTA_HLE /* flags are only used for -march switch. */ },
3182 };
3183
3184 /* -mrecip options. */
3185 static struct
3186 {
3187 const char *string; /* option name */
3188 unsigned int mask; /* mask bits to set */
3189 }
3190 const recip_options[] =
3191 {
3192 { "all", RECIP_MASK_ALL },
3193 { "none", RECIP_MASK_NONE },
3194 { "div", RECIP_MASK_DIV },
3195 { "sqrt", RECIP_MASK_SQRT },
3196 { "vec-div", RECIP_MASK_VEC_DIV },
3197 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3198 };
3199
3200 int const pta_size = ARRAY_SIZE (processor_alias_table);
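/* For illustration only: a hypothetical -march=corei7 matches the "corei7"
   entry in processor_alias_table above; the lookup loop further below then
   copies that entry's schedule and processor fields and ORs each PTA_* flag
   it carries into ix86_isa_flags unless the user set the flag explicitly. */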
3201
3202 /* Set up prefix/suffix so the error messages refer to either the command
3203 line argument, or the attribute(target). */
3204 if (main_args_p)
3205 {
3206 prefix = "-m";
3207 suffix = "";
3208 sw = "switch";
3209 }
3210 else
3211 {
3212 prefix = "option(\"";
3213 suffix = "\")";
3214 sw = "attribute";
3215 }
3216
3217 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3218 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3219 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3220 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3221 #ifdef TARGET_BI_ARCH
3222 else
3223 {
3224 #if TARGET_BI_ARCH == 1
3225 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3226 is on and OPTION_MASK_ABI_X32 is off. We turn off
3227 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3228 -mx32. */
3229 if (TARGET_X32)
3230 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3231 #else
3232 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3233 on and OPTION_MASK_ABI_64 is off. We turn off
3234 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3235 -m64. */
3236 if (TARGET_LP64)
3237 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3238 #endif
3239 }
3240 #endif
3241
3242 if (TARGET_X32)
3243 {
3244 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3245 OPTION_MASK_ABI_64 for TARGET_X32. */
3246 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3247 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3248 }
3249 else if (TARGET_LP64)
3250 {
3251 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3252 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3253 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3254 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3255 }
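/* Summary of the ABI mask handling above, for illustration only, assuming a
   biarch x86-64 compiler: -m64 keeps OPTION_MASK_ABI_64 and clears
   OPTION_MASK_ABI_X32, -mx32 keeps OPTION_MASK_ABI_X32 and clears
   OPTION_MASK_ABI_64, and -m32 clears both; OPTION_MASK_ISA_64BIT is
   turned on for both -m64 and -mx32. */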
3256
3257 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3258 SUBTARGET_OVERRIDE_OPTIONS;
3259 #endif
3260
3261 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3262 SUBSUBTARGET_OVERRIDE_OPTIONS;
3263 #endif
3264
3265 /* -fPIC is the default for x86_64. */
3266 if (TARGET_MACHO && TARGET_64BIT)
3267 flag_pic = 2;
3268
3269 /* Need to check -mtune=generic first. */
3270 if (ix86_tune_string)
3271 {
3272 if (!strcmp (ix86_tune_string, "generic")
3273 || !strcmp (ix86_tune_string, "i686")
3274 /* As special support for cross compilers we read -mtune=native
3275 as -mtune=generic. With native compilers we won't see
3276 -mtune=native, as it was already changed by the driver. */
3277 || !strcmp (ix86_tune_string, "native"))
3278 {
3279 if (TARGET_64BIT)
3280 ix86_tune_string = "generic64";
3281 else
3282 ix86_tune_string = "generic32";
3283 }
3284 /* If this call is for setting the option attribute, allow the
3285 generic32/generic64 that was previously set. */
3286 else if (!main_args_p
3287 && (!strcmp (ix86_tune_string, "generic32")
3288 || !strcmp (ix86_tune_string, "generic64")))
3289 ;
3290 else if (!strncmp (ix86_tune_string, "generic", 7))
3291 error ("bad value (%s) for %stune=%s %s",
3292 ix86_tune_string, prefix, suffix, sw);
3293 else if (!strcmp (ix86_tune_string, "x86-64"))
3294 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3295 "%stune=k8%s or %stune=generic%s instead as appropriate",
3296 prefix, suffix, prefix, suffix, prefix, suffix);
3297 }
3298 else
3299 {
3300 if (ix86_arch_string)
3301 ix86_tune_string = ix86_arch_string;
3302 if (!ix86_tune_string)
3303 {
3304 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3305 ix86_tune_defaulted = 1;
3306 }
3307
3308 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3309 need to use a sensible tune option. */
3310 if (!strcmp (ix86_tune_string, "generic")
3311 || !strcmp (ix86_tune_string, "x86-64")
3312 || !strcmp (ix86_tune_string, "i686"))
3313 {
3314 if (TARGET_64BIT)
3315 ix86_tune_string = "generic64";
3316 else
3317 ix86_tune_string = "generic32";
3318 }
3319 }
3320
3321 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3322 {
3323 /* rep; movq isn't available in 32-bit code. */
3324 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3325 ix86_stringop_alg = no_stringop;
3326 }
3327
3328 if (!ix86_arch_string)
3329 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3330 else
3331 ix86_arch_specified = 1;
3332
3333 if (global_options_set.x_ix86_pmode)
3334 {
3335 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3336 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3337 error ("address mode %qs not supported in the %s bit mode",
3338 TARGET_64BIT ? "short" : "long",
3339 TARGET_64BIT ? "64" : "32");
3340 }
3341 else
3342 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3343
3344 if (!global_options_set.x_ix86_abi)
3345 ix86_abi = DEFAULT_ABI;
3346
3347 if (global_options_set.x_ix86_cmodel)
3348 {
3349 switch (ix86_cmodel)
3350 {
3351 case CM_SMALL:
3352 case CM_SMALL_PIC:
3353 if (flag_pic)
3354 ix86_cmodel = CM_SMALL_PIC;
3355 if (!TARGET_64BIT)
3356 error ("code model %qs not supported in the %s bit mode",
3357 "small", "32");
3358 break;
3359
3360 case CM_MEDIUM:
3361 case CM_MEDIUM_PIC:
3362 if (flag_pic)
3363 ix86_cmodel = CM_MEDIUM_PIC;
3364 if (!TARGET_64BIT)
3365 error ("code model %qs not supported in the %s bit mode",
3366 "medium", "32");
3367 else if (TARGET_X32)
3368 error ("code model %qs not supported in x32 mode",
3369 "medium");
3370 break;
3371
3372 case CM_LARGE:
3373 case CM_LARGE_PIC:
3374 if (flag_pic)
3375 ix86_cmodel = CM_LARGE_PIC;
3376 if (!TARGET_64BIT)
3377 error ("code model %qs not supported in the %s bit mode",
3378 "large", "32");
3379 else if (TARGET_X32)
3380 error ("code model %qs not supported in x32 mode",
3381 "large");
3382 break;
3383
3384 case CM_32:
3385 if (flag_pic)
3386 error ("code model %s does not support PIC mode", "32");
3387 if (TARGET_64BIT)
3388 error ("code model %qs not supported in the %s bit mode",
3389 "32", "64");
3390 break;
3391
3392 case CM_KERNEL:
3393 if (flag_pic)
3394 {
3395 error ("code model %s does not support PIC mode", "kernel");
3396 ix86_cmodel = CM_32;
3397 }
3398 if (!TARGET_64BIT)
3399 error ("code model %qs not supported in the %s bit mode",
3400 "kernel", "32");
3401 break;
3402
3403 default:
3404 gcc_unreachable ();
3405 }
3406 }
3407 else
3408 {
3409 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3410 use of rip-relative addressing. This eliminates fixups that
3411 would otherwise be needed if this object is to be placed in a
3412 DLL, and is essentially just as efficient as direct addressing. */
3413 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3414 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3415 else if (TARGET_64BIT)
3416 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3417 else
3418 ix86_cmodel = CM_32;
3419 }
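/* Worked example of the code model defaulting above, for illustration only:
   a 64-bit compilation without -mcmodel uses CM_SMALL (CM_SMALL_PIC under
   -fpic), -mcmodel=medium together with -fpic yields CM_MEDIUM_PIC, and a
   32-bit compilation defaults to CM_32. */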
3420 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3421 {
3422 error ("-masm=intel not supported in this configuration");
3423 ix86_asm_dialect = ASM_ATT;
3424 }
3425 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3426 sorry ("%i-bit mode not compiled in",
3427 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3428
3429 for (i = 0; i < pta_size; i++)
3430 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3431 {
3432 ix86_schedule = processor_alias_table[i].schedule;
3433 ix86_arch = processor_alias_table[i].processor;
3434 /* Default cpu tuning to the architecture. */
3435 ix86_tune = ix86_arch;
3436
3437 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3438 error ("CPU you selected does not support x86-64 "
3439 "instruction set");
3440
3441 if (processor_alias_table[i].flags & PTA_MMX
3442 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3443 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3444 if (processor_alias_table[i].flags & PTA_3DNOW
3445 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3446 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3447 if (processor_alias_table[i].flags & PTA_3DNOW_A
3448 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3449 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3450 if (processor_alias_table[i].flags & PTA_SSE
3451 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3452 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3453 if (processor_alias_table[i].flags & PTA_SSE2
3454 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3455 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3456 if (processor_alias_table[i].flags & PTA_SSE3
3457 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3458 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3459 if (processor_alias_table[i].flags & PTA_SSSE3
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3461 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3462 if (processor_alias_table[i].flags & PTA_SSE4_1
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3464 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3465 if (processor_alias_table[i].flags & PTA_SSE4_2
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3467 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3468 if (processor_alias_table[i].flags & PTA_AVX
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3470 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3471 if (processor_alias_table[i].flags & PTA_AVX2
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3473 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3474 if (processor_alias_table[i].flags & PTA_FMA
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3476 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3477 if (processor_alias_table[i].flags & PTA_SSE4A
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3479 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3480 if (processor_alias_table[i].flags & PTA_FMA4
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3482 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3483 if (processor_alias_table[i].flags & PTA_XOP
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3485 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3486 if (processor_alias_table[i].flags & PTA_LWP
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3488 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3489 if (processor_alias_table[i].flags & PTA_ABM
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3491 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3492 if (processor_alias_table[i].flags & PTA_BMI
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3494 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3495 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3497 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3498 if (processor_alias_table[i].flags & PTA_TBM
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3500 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3501 if (processor_alias_table[i].flags & PTA_BMI2
3502 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3503 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3504 if (processor_alias_table[i].flags & PTA_CX16
3505 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3506 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3507 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3508 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3509 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3510 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3511 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3512 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3513 if (processor_alias_table[i].flags & PTA_MOVBE
3514 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3515 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3516 if (processor_alias_table[i].flags & PTA_AES
3517 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3518 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3519 if (processor_alias_table[i].flags & PTA_PCLMUL
3520 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3521 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3522 if (processor_alias_table[i].flags & PTA_FSGSBASE
3523 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3524 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3525 if (processor_alias_table[i].flags & PTA_RDRND
3526 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3527 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3528 if (processor_alias_table[i].flags & PTA_F16C
3529 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3530 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3531 if (processor_alias_table[i].flags & PTA_RTM
3532 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3533 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3534 if (processor_alias_table[i].flags & PTA_HLE
3535 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3536 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3537 if (processor_alias_table[i].flags & PTA_PRFCHW
3538 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3539 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3540 if (processor_alias_table[i].flags & PTA_RDSEED
3541 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3542 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3543 if (processor_alias_table[i].flags & PTA_ADX
3544 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3545 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3546 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3547 x86_prefetch_sse = true;
3548
3549 break;
3550 }
3551
3552 if (!strcmp (ix86_arch_string, "generic"))
3553 error ("generic CPU can be used only for %stune=%s %s",
3554 prefix, suffix, sw);
3555 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3556 error ("bad value (%s) for %sarch=%s %s",
3557 ix86_arch_string, prefix, suffix, sw);
3558
3559 ix86_arch_mask = 1u << ix86_arch;
3560 for (i = 0; i < X86_ARCH_LAST; ++i)
3561 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3562
3563 for (i = 0; i < pta_size; i++)
3564 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3565 {
3566 ix86_schedule = processor_alias_table[i].schedule;
3567 ix86_tune = processor_alias_table[i].processor;
3568 if (TARGET_64BIT)
3569 {
3570 if (!(processor_alias_table[i].flags & PTA_64BIT))
3571 {
3572 if (ix86_tune_defaulted)
3573 {
3574 ix86_tune_string = "x86-64";
3575 for (i = 0; i < pta_size; i++)
3576 if (! strcmp (ix86_tune_string,
3577 processor_alias_table[i].name))
3578 break;
3579 ix86_schedule = processor_alias_table[i].schedule;
3580 ix86_tune = processor_alias_table[i].processor;
3581 }
3582 else
3583 error ("CPU you selected does not support x86-64 "
3584 "instruction set");
3585 }
3586 }
3587 else
3588 {
3589 /* Adjust tuning when compiling for 32-bit ABI. */
3590 switch (ix86_tune)
3591 {
3592 case PROCESSOR_GENERIC64:
3593 ix86_tune = PROCESSOR_GENERIC32;
3594 ix86_schedule = CPU_PENTIUMPRO;
3595 break;
3596
3597 case PROCESSOR_CORE2_64:
3598 ix86_tune = PROCESSOR_CORE2_32;
3599 break;
3600
3601 case PROCESSOR_COREI7_64:
3602 ix86_tune = PROCESSOR_COREI7_32;
3603 break;
3604
3605 default:
3606 break;
3607 }
3608 }
3609 /* Intel CPUs have always interpreted SSE prefetch instructions as
3610 NOPs; so, we can enable SSE prefetch instructions even when
3611 -mtune (rather than -march) points us to a processor that has them.
3612 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3613 higher processors. */
3614 if (TARGET_CMOV
3615 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3616 x86_prefetch_sse = true;
3617 break;
3618 }
3619
3620 if (ix86_tune_specified && i == pta_size)
3621 error ("bad value (%s) for %stune=%s %s",
3622 ix86_tune_string, prefix, suffix, sw);
3623
3624 ix86_tune_mask = 1u << ix86_tune;
3625 for (i = 0; i < X86_TUNE_LAST; ++i)
3626 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3627
3628 #ifndef USE_IX86_FRAME_POINTER
3629 #define USE_IX86_FRAME_POINTER 0
3630 #endif
3631
3632 #ifndef USE_X86_64_FRAME_POINTER
3633 #define USE_X86_64_FRAME_POINTER 0
3634 #endif
3635
3636 /* Set the default values for switches whose default depends on TARGET_64BIT
3637 in case they weren't overwritten by command line options. */
3638 if (TARGET_64BIT)
3639 {
3640 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3641 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3642 if (flag_asynchronous_unwind_tables == 2)
3643 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3644 if (flag_pcc_struct_return == 2)
3645 flag_pcc_struct_return = 0;
3646 }
3647 else
3648 {
3649 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3650 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3651 if (flag_asynchronous_unwind_tables == 2)
3652 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3653 if (flag_pcc_struct_return == 2)
3654 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3655 }
3656
3657 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3658 if (optimize_size)
3659 ix86_cost = &ix86_size_cost;
3660 else
3661 ix86_cost = ix86_tune_cost;
3662
3663 /* Arrange to set up i386_stack_locals for all functions. */
3664 init_machine_status = ix86_init_machine_status;
3665
3666 /* Validate -mregparm= value. */
3667 if (global_options_set.x_ix86_regparm)
3668 {
3669 if (TARGET_64BIT)
3670 warning (0, "-mregparm is ignored in 64-bit mode");
3671 if (ix86_regparm > REGPARM_MAX)
3672 {
3673 error ("-mregparm=%d is not between 0 and %d",
3674 ix86_regparm, REGPARM_MAX);
3675 ix86_regparm = 0;
3676 }
3677 }
3678 if (TARGET_64BIT)
3679 ix86_regparm = REGPARM_MAX;
3680
3681 /* Default align_* from the processor table. */
3682 if (align_loops == 0)
3683 {
3684 align_loops = processor_target_table[ix86_tune].align_loop;
3685 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3686 }
3687 if (align_jumps == 0)
3688 {
3689 align_jumps = processor_target_table[ix86_tune].align_jump;
3690 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3691 }
3692 if (align_functions == 0)
3693 {
3694 align_functions = processor_target_table[ix86_tune].align_func;
3695 }
3696
3697 /* Provide default for -mbranch-cost= value. */
3698 if (!global_options_set.x_ix86_branch_cost)
3699 ix86_branch_cost = ix86_cost->branch_cost;
3700
3701 if (TARGET_64BIT)
3702 {
3703 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3704
3705 /* Enable by default the SSE and MMX builtins. Do allow the user to
3706 explicitly disable any of these. In particular, disabling SSE and
3707 MMX for kernel code is extremely useful. */
3708 if (!ix86_arch_specified)
3709 ix86_isa_flags
3710 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3711 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3712
3713 if (TARGET_RTD)
3714 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3715 }
3716 else
3717 {
3718 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3719
3720 if (!ix86_arch_specified)
3721 ix86_isa_flags
3722 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3723
3724 /* The i386 ABI does not specify a red zone. It still makes sense to use one
3725 when the programmer takes care to keep the stack from being destroyed. */
3726 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3727 target_flags |= MASK_NO_RED_ZONE;
3728 }
3729
3730 /* Keep nonleaf frame pointers. */
3731 if (flag_omit_frame_pointer)
3732 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3733 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3734 flag_omit_frame_pointer = 1;
3735
3736 /* If we're doing fast math, we don't care about comparison order
3737 wrt NaNs. This lets us use a shorter comparison sequence. */
3738 if (flag_finite_math_only)
3739 target_flags &= ~MASK_IEEE_FP;
3740
3741 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3742 since the insns won't need emulation. */
3743 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3744 target_flags &= ~MASK_NO_FANCY_MATH_387;
3745
3746 /* Likewise, if the target doesn't have a 387, or we've specified
3747 software floating point, don't use 387 inline intrinsics. */
3748 if (!TARGET_80387)
3749 target_flags |= MASK_NO_FANCY_MATH_387;
3750
3751 /* Turn on MMX builtins for -msse. */
3752 if (TARGET_SSE)
3753 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3754
3755 /* Enable SSE prefetch. */
3756 if (TARGET_SSE || TARGET_PRFCHW)
3757 x86_prefetch_sse = true;
3758
3759 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3760 if (TARGET_SSE4_2 || TARGET_ABM)
3761 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3762
3763 /* Turn on lzcnt instruction for -mabm. */
3764 if (TARGET_ABM)
3765 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3766
3767 /* Validate -mpreferred-stack-boundary= value or default it to
3768 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3769 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3770 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3771 {
3772 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3773 int max = (TARGET_SEH ? 4 : 12);
3774
3775 if (ix86_preferred_stack_boundary_arg < min
3776 || ix86_preferred_stack_boundary_arg > max)
3777 {
3778 if (min == max)
3779 error ("-mpreferred-stack-boundary is not supported "
3780 "for this target");
3781 else
3782 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3783 ix86_preferred_stack_boundary_arg, min, max);
3784 }
3785 else
3786 ix86_preferred_stack_boundary
3787 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3788 }
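/* Worked example, for illustration only: -mpreferred-stack-boundary=4 stores
   (1 << 4) * BITS_PER_UNIT == 128 bits, i.e. a 16-byte stack alignment. */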
3789
3790 /* Set the default value for -mstackrealign. */
3791 if (ix86_force_align_arg_pointer == -1)
3792 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3793
3794 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3795
3796 /* Validate -mincoming-stack-boundary= value or default it to
3797 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3798 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3799 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3800 {
3801 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3802 || ix86_incoming_stack_boundary_arg > 12)
3803 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3804 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3805 else
3806 {
3807 ix86_user_incoming_stack_boundary
3808 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3809 ix86_incoming_stack_boundary
3810 = ix86_user_incoming_stack_boundary;
3811 }
3812 }
3813
3814 /* Accept -msseregparm only if at least SSE support is enabled. */
3815 if (TARGET_SSEREGPARM
3816 && ! TARGET_SSE)
3817 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3818
3819 if (global_options_set.x_ix86_fpmath)
3820 {
3821 if (ix86_fpmath & FPMATH_SSE)
3822 {
3823 if (!TARGET_SSE)
3824 {
3825 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3826 ix86_fpmath = FPMATH_387;
3827 }
3828 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3829 {
3830 warning (0, "387 instruction set disabled, using SSE arithmetics");
3831 ix86_fpmath = FPMATH_SSE;
3832 }
3833 }
3834 }
3835 else
3836 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3837
3838 /* If the i387 is disabled, then do not return values in it. */
3839 if (!TARGET_80387)
3840 target_flags &= ~MASK_FLOAT_RETURNS;
3841
3842 /* Use external vectorized library in vectorizing intrinsics. */
3843 if (global_options_set.x_ix86_veclibabi_type)
3844 switch (ix86_veclibabi_type)
3845 {
3846 case ix86_veclibabi_type_svml:
3847 ix86_veclib_handler = ix86_veclibabi_svml;
3848 break;
3849
3850 case ix86_veclibabi_type_acml:
3851 ix86_veclib_handler = ix86_veclibabi_acml;
3852 break;
3853
3854 default:
3855 gcc_unreachable ();
3856 }
3857
3858 if ((!USE_IX86_FRAME_POINTER
3859 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3860 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3861 && !optimize_size)
3862 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3863
3864 /* ??? Unwind info is not correct around the CFG unless either a frame
3865 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3866 unwind info generation to be aware of the CFG and propagating states
3867 around edges. */
3868 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3869 || flag_exceptions || flag_non_call_exceptions)
3870 && flag_omit_frame_pointer
3871 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3872 {
3873 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3874 warning (0, "unwind tables currently require either a frame pointer "
3875 "or %saccumulate-outgoing-args%s for correctness",
3876 prefix, suffix);
3877 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3878 }
3879
3880 /* If stack probes are required, the space used for large function
3881 arguments on the stack must also be probed, so enable
3882 -maccumulate-outgoing-args so this happens in the prologue. */
3883 if (TARGET_STACK_PROBE
3884 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3885 {
3886 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3887 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3888 "for correctness", prefix, suffix);
3889 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3890 }
3891
3892 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3893 {
3894 char *p;
3895 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3896 p = strchr (internal_label_prefix, 'X');
3897 internal_label_prefix_len = p - internal_label_prefix;
3898 *p = '\0';
3899 }
3900
3901 /* When a scheduling description is not available, disable the scheduler pass
3902 so it won't slow down the compilation and make x87 code slower. */
3903 if (!TARGET_SCHEDULE)
3904 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3905
3906 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3907 ix86_tune_cost->simultaneous_prefetches,
3908 global_options.x_param_values,
3909 global_options_set.x_param_values);
3910 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3911 ix86_tune_cost->prefetch_block,
3912 global_options.x_param_values,
3913 global_options_set.x_param_values);
3914 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3915 ix86_tune_cost->l1_cache_size,
3916 global_options.x_param_values,
3917 global_options_set.x_param_values);
3918 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3919 ix86_tune_cost->l2_cache_size,
3920 global_options.x_param_values,
3921 global_options_set.x_param_values);
3922
3923 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3924 if (flag_prefetch_loop_arrays < 0
3925 && HAVE_prefetch
3926 && optimize >= 3
3927 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3928 flag_prefetch_loop_arrays = 1;
3929
3930 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3931 can be optimized to ap = __builtin_next_arg (0). */
3932 if (!TARGET_64BIT && !flag_split_stack)
3933 targetm.expand_builtin_va_start = NULL;
3934
3935 if (TARGET_64BIT)
3936 {
3937 ix86_gen_leave = gen_leave_rex64;
3938 if (Pmode == DImode)
3939 {
3940 ix86_gen_monitor = gen_sse3_monitor64_di;
3941 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3942 ix86_gen_tls_local_dynamic_base_64
3943 = gen_tls_local_dynamic_base_64_di;
3944 }
3945 else
3946 {
3947 ix86_gen_monitor = gen_sse3_monitor64_si;
3948 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3949 ix86_gen_tls_local_dynamic_base_64
3950 = gen_tls_local_dynamic_base_64_si;
3951 }
3952 }
3953 else
3954 {
3955 ix86_gen_leave = gen_leave;
3956 ix86_gen_monitor = gen_sse3_monitor;
3957 }
3958
3959 if (Pmode == DImode)
3960 {
3961 ix86_gen_add3 = gen_adddi3;
3962 ix86_gen_sub3 = gen_subdi3;
3963 ix86_gen_sub3_carry = gen_subdi3_carry;
3964 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3965 ix86_gen_andsp = gen_anddi3;
3966 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3967 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3968 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3969 }
3970 else
3971 {
3972 ix86_gen_add3 = gen_addsi3;
3973 ix86_gen_sub3 = gen_subsi3;
3974 ix86_gen_sub3_carry = gen_subsi3_carry;
3975 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3976 ix86_gen_andsp = gen_andsi3;
3977 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3978 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3979 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3980 }
3981
3982 #ifdef USE_IX86_CLD
3983 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3984 if (!TARGET_64BIT)
3985 target_flags |= MASK_CLD & ~target_flags_explicit;
3986 #endif
3987
3988 if (!TARGET_64BIT && flag_pic)
3989 {
3990 if (flag_fentry > 0)
3991 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3992 "with -fpic");
3993 flag_fentry = 0;
3994 }
3995 else if (TARGET_SEH)
3996 {
3997 if (flag_fentry == 0)
3998 sorry ("-mno-fentry isn%'t compatible with SEH");
3999 flag_fentry = 1;
4000 }
4001 else if (flag_fentry < 0)
4002 {
4003 #if defined(PROFILE_BEFORE_PROLOGUE)
4004 flag_fentry = 1;
4005 #else
4006 flag_fentry = 0;
4007 #endif
4008 }
4009
4010 if (TARGET_AVX)
4011 {
4012 /* When not optimizing for size, enable vzeroupper optimization for
4013 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4014 AVX unaligned load/store. */
4015 if (!optimize_size)
4016 {
4017 if (flag_expensive_optimizations
4018 && !(target_flags_explicit & MASK_VZEROUPPER))
4019 target_flags |= MASK_VZEROUPPER;
4020 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4021 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4022 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4023 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4024 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4025 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4026 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4027 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4028 target_flags |= MASK_PREFER_AVX128;
4029 }
4030 }
4031 else
4032 {
4033 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4034 target_flags &= ~MASK_VZEROUPPER;
4035 }
4036
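/* The block below parses -mrecip= as a comma-separated list. For
   illustration only, a hypothetical -mrecip=vec-sqrt,!div would OR
   RECIP_MASK_VEC_SQRT into recip_mask, clear RECIP_MASK_DIV from it, and
   record both bits in recip_mask_explicit. */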
4037 if (ix86_recip_name)
4038 {
4039 char *p = ASTRDUP (ix86_recip_name);
4040 char *q;
4041 unsigned int mask, i;
4042 bool invert;
4043
4044 while ((q = strtok (p, ",")) != NULL)
4045 {
4046 p = NULL;
4047 if (*q == '!')
4048 {
4049 invert = true;
4050 q++;
4051 }
4052 else
4053 invert = false;
4054
4055 if (!strcmp (q, "default"))
4056 mask = RECIP_MASK_ALL;
4057 else
4058 {
4059 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4060 if (!strcmp (q, recip_options[i].string))
4061 {
4062 mask = recip_options[i].mask;
4063 break;
4064 }
4065
4066 if (i == ARRAY_SIZE (recip_options))
4067 {
4068 error ("unknown option for -mrecip=%s", q);
4069 invert = false;
4070 mask = RECIP_MASK_NONE;
4071 }
4072 }
4073
4074 recip_mask_explicit |= mask;
4075 if (invert)
4076 recip_mask &= ~mask;
4077 else
4078 recip_mask |= mask;
4079 }
4080 }
4081
4082 if (TARGET_RECIP)
4083 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4084 else if (target_flags_explicit & MASK_RECIP)
4085 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4086
4087 /* Save the initial options in case the user does function specific
4088 options. */
4089 if (main_args_p)
4090 target_option_default_node = target_option_current_node
4091 = build_target_option_node ();
4092 }
4093
4094 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
4095
4096 static bool
4097 function_pass_avx256_p (const_rtx val)
4098 {
4099 if (!val)
4100 return false;
4101
4102 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4103 return true;
4104
4105 if (GET_CODE (val) == PARALLEL)
4106 {
4107 int i;
4108 rtx r;
4109
4110 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4111 {
4112 r = XVECEXP (val, 0, i);
4113 if (GET_CODE (r) == EXPR_LIST
4114 && XEXP (r, 0)
4115 && REG_P (XEXP (r, 0))
4116 && (GET_MODE (XEXP (r, 0)) == OImode
4117 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4118 return true;
4119 }
4120 }
4121
4122 return false;
4123 }
4124
4125 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4126
4127 static void
4128 ix86_option_override (void)
4129 {
4130 ix86_option_override_internal (true);
4131 }
4132
4133 /* Update register usage after having seen the compiler flags. */
4134
4135 static void
4136 ix86_conditional_register_usage (void)
4137 {
4138 int i, c_mask;
4139 unsigned int j;
4140
4141 /* The PIC register, if it exists, is fixed. */
4142 j = PIC_OFFSET_TABLE_REGNUM;
4143 if (j != INVALID_REGNUM)
4144 fixed_regs[j] = call_used_regs[j] = 1;
4145
4146 /* For 32-bit targets, squash the REX registers. */
4147 if (! TARGET_64BIT)
4148 {
4149 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4150 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4151 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4152 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4153 }
4154
4155 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4156 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4157 : TARGET_64BIT ? (1 << 2)
4158 : (1 << 1));
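/* For illustration only: CALL_USED_REGISTERS entries greater than 1 encode
   ABI-dependent call-clobberedness as a bit mask, so in the 64-bit non-MS
   case (c_mask == 1 << 2) such a register becomes call-used iff bit 2 of
   its initializer value is set. */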
4159
4160 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4161
4162 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4163 {
4164 /* Set/reset conditionally defined registers from
4165 CALL_USED_REGISTERS initializer. */
4166 if (call_used_regs[i] > 1)
4167 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4168
4169 /* Calculate registers of CLOBBERED_REGS register set
4170 as call used registers from GENERAL_REGS register set. */
4171 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4172 && call_used_regs[i])
4173 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4174 }
4175
4176 /* If MMX is disabled, squash the registers. */
4177 if (! TARGET_MMX)
4178 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4179 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4180 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4181
4182 /* If SSE is disabled, squash the registers. */
4183 if (! TARGET_SSE)
4184 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4185 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4186 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4187
4188 /* If the FPU is disabled, squash the registers. */
4189 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4190 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4191 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4192 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4193 }
4194
4195 \f
4196 /* Save the current options */
4197
4198 static void
4199 ix86_function_specific_save (struct cl_target_option *ptr)
4200 {
4201 ptr->arch = ix86_arch;
4202 ptr->schedule = ix86_schedule;
4203 ptr->tune = ix86_tune;
4204 ptr->branch_cost = ix86_branch_cost;
4205 ptr->tune_defaulted = ix86_tune_defaulted;
4206 ptr->arch_specified = ix86_arch_specified;
4207 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4208 ptr->ix86_target_flags_explicit = target_flags_explicit;
4209 ptr->x_recip_mask_explicit = recip_mask_explicit;
4210
4211 /* The fields are char but the variables are not; make sure the
4212 values fit in the fields. */
4213 gcc_assert (ptr->arch == ix86_arch);
4214 gcc_assert (ptr->schedule == ix86_schedule);
4215 gcc_assert (ptr->tune == ix86_tune);
4216 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4217 }
4218
4219 /* Restore the current options */
4220
4221 static void
4222 ix86_function_specific_restore (struct cl_target_option *ptr)
4223 {
4224 enum processor_type old_tune = ix86_tune;
4225 enum processor_type old_arch = ix86_arch;
4226 unsigned int ix86_arch_mask, ix86_tune_mask;
4227 int i;
4228
4229 ix86_arch = (enum processor_type) ptr->arch;
4230 ix86_schedule = (enum attr_cpu) ptr->schedule;
4231 ix86_tune = (enum processor_type) ptr->tune;
4232 ix86_branch_cost = ptr->branch_cost;
4233 ix86_tune_defaulted = ptr->tune_defaulted;
4234 ix86_arch_specified = ptr->arch_specified;
4235 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4236 target_flags_explicit = ptr->ix86_target_flags_explicit;
4237 recip_mask_explicit = ptr->x_recip_mask_explicit;
4238
4239 /* Recreate the arch feature tests if the arch changed */
4240 if (old_arch != ix86_arch)
4241 {
4242 ix86_arch_mask = 1u << ix86_arch;
4243 for (i = 0; i < X86_ARCH_LAST; ++i)
4244 ix86_arch_features[i]
4245 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4246 }
4247
4248 /* Recreate the tune optimization tests */
4249 if (old_tune != ix86_tune)
4250 {
4251 ix86_tune_mask = 1u << ix86_tune;
4252 for (i = 0; i < X86_TUNE_LAST; ++i)
4253 ix86_tune_features[i]
4254 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4255 }
4256 }
4257
4258 /* Print the current options */
4259
4260 static void
4261 ix86_function_specific_print (FILE *file, int indent,
4262 struct cl_target_option *ptr)
4263 {
4264 char *target_string
4265 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4266 NULL, NULL, ptr->x_ix86_fpmath, false);
4267
4268 fprintf (file, "%*sarch = %d (%s)\n",
4269 indent, "",
4270 ptr->arch,
4271 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4272 ? cpu_names[ptr->arch]
4273 : "<unknown>"));
4274
4275 fprintf (file, "%*stune = %d (%s)\n",
4276 indent, "",
4277 ptr->tune,
4278 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4279 ? cpu_names[ptr->tune]
4280 : "<unknown>"));
4281
4282 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4283
4284 if (target_string)
4285 {
4286 fprintf (file, "%*s%s\n", indent, "", target_string);
4287 free (target_string);
4288 }
4289 }
4290
4291 \f
4292 /* Inner function to process the attribute((target(...))): take an argument
4293 and set the current options from it. If we have a list, recursively go
4294 over the list. */
4295
4296 static bool
4297 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4298 struct gcc_options *enum_opts_set)
4299 {
4300 char *next_optstr;
4301 bool ret = true;
4302
4303 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4304 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4305 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4306 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4307 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4308
4309 enum ix86_opt_type
4310 {
4311 ix86_opt_unknown,
4312 ix86_opt_yes,
4313 ix86_opt_no,
4314 ix86_opt_str,
4315 ix86_opt_enum,
4316 ix86_opt_isa
4317 };
4318
4319 static const struct
4320 {
4321 const char *string;
4322 size_t len;
4323 enum ix86_opt_type type;
4324 int opt;
4325 int mask;
4326 } attrs[] = {
4327 /* isa options */
4328 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4329 IX86_ATTR_ISA ("abm", OPT_mabm),
4330 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4331 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4332 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4333 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4334 IX86_ATTR_ISA ("aes", OPT_maes),
4335 IX86_ATTR_ISA ("avx", OPT_mavx),
4336 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4337 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4338 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4339 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4340 IX86_ATTR_ISA ("sse", OPT_msse),
4341 IX86_ATTR_ISA ("sse2", OPT_msse2),
4342 IX86_ATTR_ISA ("sse3", OPT_msse3),
4343 IX86_ATTR_ISA ("sse4", OPT_msse4),
4344 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4345 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4346 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4347 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4348 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4349 IX86_ATTR_ISA ("fma", OPT_mfma),
4350 IX86_ATTR_ISA ("xop", OPT_mxop),
4351 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4352 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4353 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4354 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4355 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4356 IX86_ATTR_ISA ("hle", OPT_mhle),
4357 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4358 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4359 IX86_ATTR_ISA ("adx", OPT_madx),
4360
4361 /* enum options */
4362 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4363
4364 /* string options */
4365 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4366 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4367
4368 /* flag options */
4369 IX86_ATTR_YES ("cld",
4370 OPT_mcld,
4371 MASK_CLD),
4372
4373 IX86_ATTR_NO ("fancy-math-387",
4374 OPT_mfancy_math_387,
4375 MASK_NO_FANCY_MATH_387),
4376
4377 IX86_ATTR_YES ("ieee-fp",
4378 OPT_mieee_fp,
4379 MASK_IEEE_FP),
4380
4381 IX86_ATTR_YES ("inline-all-stringops",
4382 OPT_minline_all_stringops,
4383 MASK_INLINE_ALL_STRINGOPS),
4384
4385 IX86_ATTR_YES ("inline-stringops-dynamically",
4386 OPT_minline_stringops_dynamically,
4387 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4388
4389 IX86_ATTR_NO ("align-stringops",
4390 OPT_mno_align_stringops,
4391 MASK_NO_ALIGN_STRINGOPS),
4392
4393 IX86_ATTR_YES ("recip",
4394 OPT_mrecip,
4395 MASK_RECIP),
4396
4397 };
4398
4399 /* If this is a list, recurse to get the options. */
4400 if (TREE_CODE (args) == TREE_LIST)
4401 {
4402 bool ret = true;
4403
4404 for (; args; args = TREE_CHAIN (args))
4405 if (TREE_VALUE (args)
4406 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4407 p_strings, enum_opts_set))
4408 ret = false;
4409
4410 return ret;
4411 }
4412
4413 else if (TREE_CODE (args) != STRING_CST)
4414 gcc_unreachable ();
4415
4416 /* Handle multiple arguments separated by commas. */
4417 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4418
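/* For illustration only, a hypothetical attribute((target("no-sse3,arch=core2")))
   is split at the comma below: "no-sse3" matches the "sse3" isa entry with
   opt_set_p cleared by the "no-" prefix, while "arch=core2" is a string
   option whose value is saved in p_strings for later processing. */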
4419 while (next_optstr && *next_optstr != '\0')
4420 {
4421 char *p = next_optstr;
4422 char *orig_p = p;
4423 char *comma = strchr (next_optstr, ',');
4424 const char *opt_string;
4425 size_t len, opt_len;
4426 int opt;
4427 bool opt_set_p;
4428 char ch;
4429 unsigned i;
4430 enum ix86_opt_type type = ix86_opt_unknown;
4431 int mask = 0;
4432
4433 if (comma)
4434 {
4435 *comma = '\0';
4436 len = comma - next_optstr;
4437 next_optstr = comma + 1;
4438 }
4439 else
4440 {
4441 len = strlen (p);
4442 next_optstr = NULL;
4443 }
4444
4445 /* Recognize no-xxx. */
4446 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4447 {
4448 opt_set_p = false;
4449 p += 3;
4450 len -= 3;
4451 }
4452 else
4453 opt_set_p = true;
4454
4455 /* Find the option. */
4456 ch = *p;
4457 opt = N_OPTS;
4458 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4459 {
4460 type = attrs[i].type;
4461 opt_len = attrs[i].len;
4462 if (ch == attrs[i].string[0]
4463 && ((type != ix86_opt_str && type != ix86_opt_enum)
4464 ? len == opt_len
4465 : len > opt_len)
4466 && memcmp (p, attrs[i].string, opt_len) == 0)
4467 {
4468 opt = attrs[i].opt;
4469 mask = attrs[i].mask;
4470 opt_string = attrs[i].string;
4471 break;
4472 }
4473 }
4474
4475 /* Process the option. */
4476 if (opt == N_OPTS)
4477 {
4478 error ("attribute(target(\"%s\")) is unknown", orig_p);
4479 ret = false;
4480 }
4481
4482 else if (type == ix86_opt_isa)
4483 {
4484 struct cl_decoded_option decoded;
4485
4486 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4487 ix86_handle_option (&global_options, &global_options_set,
4488 &decoded, input_location);
4489 }
4490
4491 else if (type == ix86_opt_yes || type == ix86_opt_no)
4492 {
4493 if (type == ix86_opt_no)
4494 opt_set_p = !opt_set_p;
4495
4496 if (opt_set_p)
4497 target_flags |= mask;
4498 else
4499 target_flags &= ~mask;
4500 }
4501
4502 else if (type == ix86_opt_str)
4503 {
4504 if (p_strings[opt])
4505 {
4506 error ("option(\"%s\") was already specified", opt_string);
4507 ret = false;
4508 }
4509 else
4510 p_strings[opt] = xstrdup (p + opt_len);
4511 }
4512
4513 else if (type == ix86_opt_enum)
4514 {
4515 bool arg_ok;
4516 int value;
4517
4518 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4519 if (arg_ok)
4520 set_option (&global_options, enum_opts_set, opt, value,
4521 p + opt_len, DK_UNSPECIFIED, input_location,
4522 global_dc);
4523 else
4524 {
4525 error ("attribute(target(\"%s\")) is unknown", orig_p);
4526 ret = false;
4527 }
4528 }
4529
4530 else
4531 gcc_unreachable ();
4532 }
4533
4534 return ret;
4535 }
4536
4537 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4538
4539 tree
4540 ix86_valid_target_attribute_tree (tree args)
4541 {
4542 const char *orig_arch_string = ix86_arch_string;
4543 const char *orig_tune_string = ix86_tune_string;
4544 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4545 int orig_tune_defaulted = ix86_tune_defaulted;
4546 int orig_arch_specified = ix86_arch_specified;
4547 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4548 tree t = NULL_TREE;
4549 int i;
4550 struct cl_target_option *def
4551 = TREE_TARGET_OPTION (target_option_default_node);
4552 struct gcc_options enum_opts_set;
4553
4554 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4555
4556 /* Process each of the options on the chain. */
4557 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4558 &enum_opts_set))
4559 return NULL_TREE;
4560
4561 /* If the changed options are different from the default, rerun
4562 ix86_option_override_internal, and then save the options away.
4563 The string options are attribute options, and will be undone
4564 when we copy the save structure. */
4565 if (ix86_isa_flags != def->x_ix86_isa_flags
4566 || target_flags != def->x_target_flags
4567 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4568 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4569 || enum_opts_set.x_ix86_fpmath)
4570 {
4571 /* If we are using the default tune= or arch=, undo the string assigned,
4572 and use the default. */
4573 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4574 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4575 else if (!orig_arch_specified)
4576 ix86_arch_string = NULL;
4577
4578 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4579 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4580 else if (orig_tune_defaulted)
4581 ix86_tune_string = NULL;
4582
4583 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4584 if (enum_opts_set.x_ix86_fpmath)
4585 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4586 else if (!TARGET_64BIT && TARGET_SSE)
4587 {
4588 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4589 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4590 }
4591
4592 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4593 ix86_option_override_internal (false);
4594
4595 /* Add any builtin functions with the new isa if any. */
4596 ix86_add_new_builtins (ix86_isa_flags);
4597
4598 /* Save the current options unless we are validating options for
4599 #pragma. */
4600 t = build_target_option_node ();
4601
4602 ix86_arch_string = orig_arch_string;
4603 ix86_tune_string = orig_tune_string;
4604 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4605
4606 /* Free up memory allocated to hold the strings */
4607 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4608 free (option_strings[i]);
4609 }
4610
4611 return t;
4612 }
4613
4614 /* Hook to validate attribute((target("string"))). */
4615
4616 static bool
4617 ix86_valid_target_attribute_p (tree fndecl,
4618 tree ARG_UNUSED (name),
4619 tree args,
4620 int ARG_UNUSED (flags))
4621 {
4622 struct cl_target_option cur_target;
4623 bool ret = true;
4624 tree old_optimize = build_optimization_node ();
4625 tree new_target, new_optimize;
4626 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4627
4628 /* If the function changed the optimization levels as well as setting target
4629 options, start with the optimizations specified. */
4630 if (func_optimize && func_optimize != old_optimize)
4631 cl_optimization_restore (&global_options,
4632 TREE_OPTIMIZATION (func_optimize));
4633
4634 /* The target attributes may also change some optimization flags, so update
4635 the optimization options if necessary. */
4636 cl_target_option_save (&cur_target, &global_options);
4637 new_target = ix86_valid_target_attribute_tree (args);
4638 new_optimize = build_optimization_node ();
4639
4640 if (!new_target)
4641 ret = false;
4642
4643 else if (fndecl)
4644 {
4645 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4646
4647 if (old_optimize != new_optimize)
4648 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4649 }
4650
4651 cl_target_option_restore (&global_options, &cur_target);
4652
4653 if (old_optimize != new_optimize)
4654 cl_optimization_restore (&global_options,
4655 TREE_OPTIMIZATION (old_optimize));
4656
4657 return ret;
4658 }
4659
4660 \f
4661 /* Hook to determine if one function can safely inline another. */
4662
4663 static bool
4664 ix86_can_inline_p (tree caller, tree callee)
4665 {
4666 bool ret = false;
4667 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4668 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4669
4670 /* If callee has no option attributes, then it is ok to inline. */
4671 if (!callee_tree)
4672 ret = true;
4673
4674 /* If caller has no option attributes, but callee does then it is not ok to
4675 inline. */
4676 else if (!caller_tree)
4677 ret = false;
4678
4679 else
4680 {
4681 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4682 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4683
4684 /* The callee's isa options should be a subset of the caller's, i.e. an SSE4
4685 function can inline an SSE2 function, but an SSE2 function can't inline
4686 an SSE4 function. */
4687 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4688 != callee_opts->x_ix86_isa_flags)
4689 ret = false;
4690
4691 /* See if we have the same non-isa options. */
4692 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4693 ret = false;
4694
4695 /* See if arch, tune, etc. are the same. */
4696 else if (caller_opts->arch != callee_opts->arch)
4697 ret = false;
4698
4699 else if (caller_opts->tune != callee_opts->tune)
4700 ret = false;
4701
4702 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4703 ret = false;
4704
4705 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4706 ret = false;
4707
4708 else
4709 ret = true;
4710 }
4711
4712 return ret;
4713 }
4714
4715 \f
4716 /* Remember the last target of ix86_set_current_function. */
4717 static GTY(()) tree ix86_previous_fndecl;
4718
4719 /* Establish appropriate back-end context for processing the function
4720 FNDECL. The argument might be NULL to indicate processing at top
4721 level, outside of any function scope. */
4722 static void
4723 ix86_set_current_function (tree fndecl)
4724 {
4725 /* Only change the context if the function changes. This hook is called
4726 several times in the course of compiling a function, and we don't want to
4727 slow things down too much or call target_reinit when it isn't safe. */
4728 if (fndecl && fndecl != ix86_previous_fndecl)
4729 {
4730 tree old_tree = (ix86_previous_fndecl
4731 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4732 : NULL_TREE);
4733
4734 tree new_tree = (fndecl
4735 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4736 : NULL_TREE);
4737
4738 ix86_previous_fndecl = fndecl;
4739 if (old_tree == new_tree)
4740 ;
4741
4742 else if (new_tree)
4743 {
4744 cl_target_option_restore (&global_options,
4745 TREE_TARGET_OPTION (new_tree));
4746 target_reinit ();
4747 }
4748
4749 else if (old_tree)
4750 {
4751 struct cl_target_option *def
4752 = TREE_TARGET_OPTION (target_option_current_node);
4753
4754 cl_target_option_restore (&global_options, def);
4755 target_reinit ();
4756 }
4757 }
4758 }
4759
4760 \f
4761 /* Return true if this goes in large data/bss. */
4762
4763 static bool
4764 ix86_in_large_data_p (tree exp)
4765 {
4766 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4767 return false;
4768
4769 /* Functions are never large data. */
4770 if (TREE_CODE (exp) == FUNCTION_DECL)
4771 return false;
4772
4773 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4774 {
4775 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4776 if (strcmp (section, ".ldata") == 0
4777 || strcmp (section, ".lbss") == 0)
4778 return true;
4779 return false;
4780 }
4781 else
4782 {
4783 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4784
4785 /* If this is an incomplete type with size 0, then we can't put it
4786 in data because it might be too big when completed. */
4787 if (!size || size > ix86_section_threshold)
4788 return true;
4789 }
4790
4791 return false;
4792 }
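/* For illustration only: with -mcmodel=medium, a global object larger than
   ix86_section_threshold (adjustable via -mlarge-data-threshold) is treated
   as large data by the function above and is placed in one of the .ldata /
   .lbss style sections chosen by the hooks below. */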
4793
4794 /* Switch to the appropriate section for output of DECL.
4795 DECL is either a `VAR_DECL' node or a constant of some sort.
4796 RELOC indicates whether forming the initial value of DECL requires
4797 link-time relocations. */
4798
4799 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4800 ATTRIBUTE_UNUSED;
4801
4802 static section *
4803 x86_64_elf_select_section (tree decl, int reloc,
4804 unsigned HOST_WIDE_INT align)
4805 {
4806 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4807 && ix86_in_large_data_p (decl))
4808 {
4809 const char *sname = NULL;
4810 unsigned int flags = SECTION_WRITE;
4811 switch (categorize_decl_for_section (decl, reloc))
4812 {
4813 case SECCAT_DATA:
4814 sname = ".ldata";
4815 break;
4816 case SECCAT_DATA_REL:
4817 sname = ".ldata.rel";
4818 break;
4819 case SECCAT_DATA_REL_LOCAL:
4820 sname = ".ldata.rel.local";
4821 break;
4822 case SECCAT_DATA_REL_RO:
4823 sname = ".ldata.rel.ro";
4824 break;
4825 case SECCAT_DATA_REL_RO_LOCAL:
4826 sname = ".ldata.rel.ro.local";
4827 break;
4828 case SECCAT_BSS:
4829 sname = ".lbss";
4830 flags |= SECTION_BSS;
4831 break;
4832 case SECCAT_RODATA:
4833 case SECCAT_RODATA_MERGE_STR:
4834 case SECCAT_RODATA_MERGE_STR_INIT:
4835 case SECCAT_RODATA_MERGE_CONST:
4836 sname = ".lrodata";
4837 flags = 0;
4838 break;
4839 case SECCAT_SRODATA:
4840 case SECCAT_SDATA:
4841 case SECCAT_SBSS:
4842 gcc_unreachable ();
4843 case SECCAT_TEXT:
4844 case SECCAT_TDATA:
4845 case SECCAT_TBSS:
4846 /* We don't split these for the medium model.  Place them into
4847 default sections and hope for the best. */
4848 break;
4849 }
4850 if (sname)
4851 {
4852 /* We might get called with string constants, but get_named_section
4853 doesn't like them as they are not DECLs. Also, we need to set
4854 flags in that case. */
4855 if (!DECL_P (decl))
4856 return get_section (sname, flags, NULL);
4857 return get_named_section (decl, sname, reloc);
4858 }
4859 }
4860 return default_elf_select_section (decl, reloc, align);
4861 }
4862
4863 /* Build up a unique section name, expressed as a
4864 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4865 RELOC indicates whether the initial value of EXP requires
4866 link-time relocations. */
4867
4868 static void ATTRIBUTE_UNUSED
4869 x86_64_elf_unique_section (tree decl, int reloc)
4870 {
4871 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4872 && ix86_in_large_data_p (decl))
4873 {
4874 const char *prefix = NULL;
4875 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4876 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4877
4878 switch (categorize_decl_for_section (decl, reloc))
4879 {
4880 case SECCAT_DATA:
4881 case SECCAT_DATA_REL:
4882 case SECCAT_DATA_REL_LOCAL:
4883 case SECCAT_DATA_REL_RO:
4884 case SECCAT_DATA_REL_RO_LOCAL:
4885 prefix = one_only ? ".ld" : ".ldata";
4886 break;
4887 case SECCAT_BSS:
4888 prefix = one_only ? ".lb" : ".lbss";
4889 break;
4890 case SECCAT_RODATA:
4891 case SECCAT_RODATA_MERGE_STR:
4892 case SECCAT_RODATA_MERGE_STR_INIT:
4893 case SECCAT_RODATA_MERGE_CONST:
4894 prefix = one_only ? ".lr" : ".lrodata";
4895 break;
4896 case SECCAT_SRODATA:
4897 case SECCAT_SDATA:
4898 case SECCAT_SBSS:
4899 gcc_unreachable ();
4900 case SECCAT_TEXT:
4901 case SECCAT_TDATA:
4902 case SECCAT_TBSS:
4903 /* We don't split these for the medium model.  Place them into
4904 default sections and hope for the best. */
4905 break;
4906 }
4907 if (prefix)
4908 {
4909 const char *name, *linkonce;
4910 char *string;
4911
4912 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4913 name = targetm.strip_name_encoding (name);
4914
4915 /* If we're using one_only, then there needs to be a .gnu.linkonce
4916 prefix to the section name. */
4917 linkonce = one_only ? ".gnu.linkonce" : "";
4918
4919 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4920
4921 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4922 return;
4923 }
4924 }
4925 default_unique_section (decl, reloc);
4926 }
4927
4928 #ifdef COMMON_ASM_OP
4929 /* This says how to output assembler code to declare an
4930 uninitialized external linkage data object.
4931
4932 For medium-model x86-64 we need to use the .largecomm directive for
4933 large objects. */
4934 void
4935 x86_elf_aligned_common (FILE *file,
4936 const char *name, unsigned HOST_WIDE_INT size,
4937 int align)
4938 {
4939 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4940 && size > (unsigned int)ix86_section_threshold)
4941 fputs (".largecomm\t", file);
4942 else
4943 fputs (COMMON_ASM_OP, file);
4944 assemble_name (file, name);
4945 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4946 size, align / BITS_PER_UNIT);
4947 }
4948 #endif
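/* Illustrative sketch of the assembler output produced above, with assumed
   symbol names and sizes (not from the original source):

       .largecomm  huge_buf,1048576,32    # size above the threshold
       .comm       tiny_buf,16,16         # size below the threshold
*/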
4949
4950 /* Utility function for targets to use in implementing
4951 ASM_OUTPUT_ALIGNED_BSS. */
4952
4953 void
4954 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4955 const char *name, unsigned HOST_WIDE_INT size,
4956 int align)
4957 {
4958 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4959 && size > (unsigned int)ix86_section_threshold)
4960 switch_to_section (get_named_section (decl, ".lbss", 0));
4961 else
4962 switch_to_section (bss_section);
4963 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4964 #ifdef ASM_DECLARE_OBJECT_NAME
4965 last_assemble_variable_decl = decl;
4966 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4967 #else
4968 /* The standard thing is just to output a label for the object. */
4969 ASM_OUTPUT_LABEL (file, name);
4970 #endif /* ASM_DECLARE_OBJECT_NAME */
4971 ASM_OUTPUT_SKIP (file, size ? size : 1);
4972 }
4973 \f
4974 /* Decide whether we must probe the stack before any space allocation
4975 on this target. It's essentially TARGET_STACK_PROBE except when
4976 -fstack-check causes the stack to be already probed differently. */
4977
4978 bool
4979 ix86_target_stack_probe (void)
4980 {
4981 /* Do not probe the stack twice if static stack checking is enabled. */
4982 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4983 return false;
4984
4985 return TARGET_STACK_PROBE;
4986 }
4987 \f
4988 /* Decide whether we can make a sibling call to a function. DECL is the
4989 declaration of the function being targeted by the call and EXP is the
4990 CALL_EXPR representing the call. */
4991
4992 static bool
4993 ix86_function_ok_for_sibcall (tree decl, tree exp)
4994 {
4995 tree type, decl_or_type;
4996 rtx a, b;
4997
4998 /* If we are generating position-independent code, we cannot sibcall
4999 optimize any indirect call, or a direct call to a global function,
5000 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5001 if (!TARGET_MACHO
5002 && !TARGET_64BIT
5003 && flag_pic
5004 && (!decl || !targetm.binds_local_p (decl)))
5005 return false;
5006
5007 /* If we need to align the outgoing stack, then sibcalling would
5008 unalign the stack, which may break the called function. */
5009 if (ix86_minimum_incoming_stack_boundary (true)
5010 < PREFERRED_STACK_BOUNDARY)
5011 return false;
5012
5013 if (decl)
5014 {
5015 decl_or_type = decl;
5016 type = TREE_TYPE (decl);
5017 }
5018 else
5019 {
5020 /* We're looking at the CALL_EXPR, we need the type of the function. */
5021 type = CALL_EXPR_FN (exp); /* pointer expression */
5022 type = TREE_TYPE (type); /* pointer type */
5023 type = TREE_TYPE (type); /* function type */
5024 decl_or_type = type;
5025 }
5026
5027 /* Check that the return value locations are the same. Like
5028 if we are returning floats on the 80387 register stack, we cannot
5029 make a sibcall from a function that doesn't return a float to a
5030 function that does or, conversely, from a function that does return
5031 a float to a function that doesn't; the necessary stack adjustment
5032 would not be executed. This is also the place we notice
5033 differences in the return value ABI. Note that it is ok for one
5034 of the functions to have void return type as long as the return
5035 value of the other is passed in a register. */
5036 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5037 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5038 cfun->decl, false);
5039 if (STACK_REG_P (a) || STACK_REG_P (b))
5040 {
5041 if (!rtx_equal_p (a, b))
5042 return false;
5043 }
5044 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5045 {
5046 /* Disable sibcall if we need to generate vzeroupper after
5047 callee returns. */
5048 if (TARGET_VZEROUPPER
5049 && cfun->machine->callee_return_avx256_p
5050 && !cfun->machine->caller_return_avx256_p)
5051 return false;
5052 }
5053 else if (!rtx_equal_p (a, b))
5054 return false;
5055
5056 if (TARGET_64BIT)
5057 {
5058 /* The SYSV ABI has more call-clobbered registers;
5059 disallow sibcalls from MS to SYSV. */
5060 if (cfun->machine->call_abi == MS_ABI
5061 && ix86_function_type_abi (type) == SYSV_ABI)
5062 return false;
5063 }
5064 else
5065 {
5066 /* If this call is indirect, we'll need to be able to use a
5067 call-clobbered register for the address of the target function.
5068 Make sure that all such registers are not used for passing
5069 parameters. Note that DLLIMPORT functions are indirect. */
5070 if (!decl
5071 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5072 {
5073 if (ix86_function_regparm (type, NULL) >= 3)
5074 {
5075 /* ??? Need to count the actual number of registers to be used,
5076 not the possible number of registers. Fix later. */
5077 return false;
5078 }
5079 }
5080 }
5081
5082 /* Otherwise okay. That also includes certain types of indirect calls. */
5083 return true;
5084 }
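/* Illustrative sketch, not part of the original source (assumed flags): on
   32-bit x86 with -fPIC, a tail call to a global function is not converted
   into a sibcall, because the call goes through the PLT and %ebx must stay
   live for the GOT pointer:

       extern int g (int);
       int f (int x) { return g (x); }   // emitted as call + ret, not jmp
*/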
5085
5086 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5087 and "sseregparm" calling convention attributes;
5088 arguments as in struct attribute_spec.handler. */
5089
5090 static tree
5091 ix86_handle_cconv_attribute (tree *node, tree name,
5092 tree args,
5093 int flags ATTRIBUTE_UNUSED,
5094 bool *no_add_attrs)
5095 {
5096 if (TREE_CODE (*node) != FUNCTION_TYPE
5097 && TREE_CODE (*node) != METHOD_TYPE
5098 && TREE_CODE (*node) != FIELD_DECL
5099 && TREE_CODE (*node) != TYPE_DECL)
5100 {
5101 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5102 name);
5103 *no_add_attrs = true;
5104 return NULL_TREE;
5105 }
5106
5107 /* Can combine regparm with all attributes but fastcall and thiscall. */
5108 if (is_attribute_p ("regparm", name))
5109 {
5110 tree cst;
5111
5112 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5113 {
5114 error ("fastcall and regparm attributes are not compatible");
5115 }
5116
5117 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5118 {
5119 error ("regparm and thiscall attributes are not compatible");
5120 }
5121
5122 cst = TREE_VALUE (args);
5123 if (TREE_CODE (cst) != INTEGER_CST)
5124 {
5125 warning (OPT_Wattributes,
5126 "%qE attribute requires an integer constant argument",
5127 name);
5128 *no_add_attrs = true;
5129 }
5130 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5131 {
5132 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5133 name, REGPARM_MAX);
5134 *no_add_attrs = true;
5135 }
5136
5137 return NULL_TREE;
5138 }
5139
5140 if (TARGET_64BIT)
5141 {
5142 /* Do not warn when emulating the MS ABI. */
5143 if ((TREE_CODE (*node) != FUNCTION_TYPE
5144 && TREE_CODE (*node) != METHOD_TYPE)
5145 || ix86_function_type_abi (*node) != MS_ABI)
5146 warning (OPT_Wattributes, "%qE attribute ignored",
5147 name);
5148 *no_add_attrs = true;
5149 return NULL_TREE;
5150 }
5151
5152 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5153 if (is_attribute_p ("fastcall", name))
5154 {
5155 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5156 {
5157 error ("fastcall and cdecl attributes are not compatible");
5158 }
5159 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5160 {
5161 error ("fastcall and stdcall attributes are not compatible");
5162 }
5163 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5164 {
5165 error ("fastcall and regparm attributes are not compatible");
5166 }
5167 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5168 {
5169 error ("fastcall and thiscall attributes are not compatible");
5170 }
5171 }
5172
5173 /* Can combine stdcall with fastcall (redundant), regparm and
5174 sseregparm. */
5175 else if (is_attribute_p ("stdcall", name))
5176 {
5177 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5178 {
5179 error ("stdcall and cdecl attributes are not compatible");
5180 }
5181 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5182 {
5183 error ("stdcall and fastcall attributes are not compatible");
5184 }
5185 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5186 {
5187 error ("stdcall and thiscall attributes are not compatible");
5188 }
5189 }
5190
5191 /* Can combine cdecl with regparm and sseregparm. */
5192 else if (is_attribute_p ("cdecl", name))
5193 {
5194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5195 {
5196 error ("stdcall and cdecl attributes are not compatible");
5197 }
5198 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5199 {
5200 error ("fastcall and cdecl attributes are not compatible");
5201 }
5202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5203 {
5204 error ("cdecl and thiscall attributes are not compatible");
5205 }
5206 }
5207 else if (is_attribute_p ("thiscall", name))
5208 {
5209 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5210 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5211 name);
5212 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5213 {
5214 error ("stdcall and thiscall attributes are not compatible");
5215 }
5216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5217 {
5218 error ("fastcall and thiscall attributes are not compatible");
5219 }
5220 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5221 {
5222 error ("cdecl and thiscall attributes are not compatible");
5223 }
5224 }
5225
5226 /* Can combine sseregparm with all attributes. */
5227
5228 return NULL_TREE;
5229 }
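/* Illustrative sketch of the calling-convention attributes handled above
   (hypothetical declarations, not from the original source):

       int __attribute__((regparm (3))) f (int a, int b, int c);
       int __attribute__((fastcall))    g (int a, int b);
       int __attribute__((stdcall))     h (int a);

   Combining, say, fastcall with regparm on one declaration is rejected
   with the "not compatible" errors above.  */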
5230
5231 /* The transactional memory builtins are implicitly regparm or fastcall
5232 depending on the ABI. Override the generic do-nothing attribute that
5233 these builtins were declared with, and replace it with one of the two
5234 attributes that we expect elsewhere. */
5235
5236 static tree
5237 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5238 tree args ATTRIBUTE_UNUSED,
5239 int flags ATTRIBUTE_UNUSED,
5240 bool *no_add_attrs)
5241 {
5242 tree alt;
5243
5244 /* In no case do we want to add the placeholder attribute. */
5245 *no_add_attrs = true;
5246
5247 /* The 64-bit ABI is unchanged for transactional memory. */
5248 if (TARGET_64BIT)
5249 return NULL_TREE;
5250
5251 /* ??? Is there a better way to validate 32-bit windows? We have
5252 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5253 if (CHECK_STACK_LIMIT > 0)
5254 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5255 else
5256 {
5257 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5258 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5259 }
5260 decl_attributes (node, alt, flags);
5261
5262 return NULL_TREE;
5263 }
5264
5265 /* This function determines from TYPE the calling-convention. */
5266
5267 unsigned int
5268 ix86_get_callcvt (const_tree type)
5269 {
5270 unsigned int ret = 0;
5271 bool is_stdarg;
5272 tree attrs;
5273
5274 if (TARGET_64BIT)
5275 return IX86_CALLCVT_CDECL;
5276
5277 attrs = TYPE_ATTRIBUTES (type);
5278 if (attrs != NULL_TREE)
5279 {
5280 if (lookup_attribute ("cdecl", attrs))
5281 ret |= IX86_CALLCVT_CDECL;
5282 else if (lookup_attribute ("stdcall", attrs))
5283 ret |= IX86_CALLCVT_STDCALL;
5284 else if (lookup_attribute ("fastcall", attrs))
5285 ret |= IX86_CALLCVT_FASTCALL;
5286 else if (lookup_attribute ("thiscall", attrs))
5287 ret |= IX86_CALLCVT_THISCALL;
5288
5289 /* Regparm isn't allowed for thiscall and fastcall. */
5290 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5291 {
5292 if (lookup_attribute ("regparm", attrs))
5293 ret |= IX86_CALLCVT_REGPARM;
5294 if (lookup_attribute ("sseregparm", attrs))
5295 ret |= IX86_CALLCVT_SSEREGPARM;
5296 }
5297
5298 if (IX86_BASE_CALLCVT(ret) != 0)
5299 return ret;
5300 }
5301
5302 is_stdarg = stdarg_p (type);
5303 if (TARGET_RTD && !is_stdarg)
5304 return IX86_CALLCVT_STDCALL | ret;
5305
5306 if (ret != 0
5307 || is_stdarg
5308 || TREE_CODE (type) != METHOD_TYPE
5309 || ix86_function_type_abi (type) != MS_ABI)
5310 return IX86_CALLCVT_CDECL | ret;
5311
5312 return IX86_CALLCVT_THISCALL;
5313 }
5314
5315 /* Return 0 if the attributes for two types are incompatible, 1 if they
5316 are compatible, and 2 if they are nearly compatible (which causes a
5317 warning to be generated). */
5318
5319 static int
5320 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5321 {
5322 unsigned int ccvt1, ccvt2;
5323
5324 if (TREE_CODE (type1) != FUNCTION_TYPE
5325 && TREE_CODE (type1) != METHOD_TYPE)
5326 return 1;
5327
5328 ccvt1 = ix86_get_callcvt (type1);
5329 ccvt2 = ix86_get_callcvt (type2);
5330 if (ccvt1 != ccvt2)
5331 return 0;
5332 if (ix86_function_regparm (type1, NULL)
5333 != ix86_function_regparm (type2, NULL))
5334 return 0;
5335
5336 return 1;
5337 }
5338 \f
5339 /* Return the regparm value for a function with the indicated TYPE and DECL.
5340 DECL may be NULL when calling function indirectly
5341 or considering a libcall. */
5342
5343 static int
5344 ix86_function_regparm (const_tree type, const_tree decl)
5345 {
5346 tree attr;
5347 int regparm;
5348 unsigned int ccvt;
5349
5350 if (TARGET_64BIT)
5351 return (ix86_function_type_abi (type) == SYSV_ABI
5352 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5353 ccvt = ix86_get_callcvt (type);
5354 regparm = ix86_regparm;
5355
5356 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5357 {
5358 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5359 if (attr)
5360 {
5361 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5362 return regparm;
5363 }
5364 }
5365 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5366 return 2;
5367 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5368 return 1;
5369
5370 /* Use register calling convention for local functions when possible. */
5371 if (decl
5372 && TREE_CODE (decl) == FUNCTION_DECL
5373 && optimize
5374 && !(profile_flag && !flag_fentry))
5375 {
5376 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5377 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5378 if (i && i->local && i->can_change_signature)
5379 {
5380 int local_regparm, globals = 0, regno;
5381
5382 /* Make sure no regparm register is taken by a
5383 fixed register variable. */
5384 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5385 if (fixed_regs[local_regparm])
5386 break;
5387
5388 /* We don't want to use regparm(3) for nested functions as
5389 these use a static chain pointer in the third argument. */
5390 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5391 local_regparm = 2;
5392
5393 /* In 32-bit mode save a register for the split stack. */
5394 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5395 local_regparm = 2;
5396
5397 /* Each fixed register usage increases register pressure,
5398 so fewer registers should be used for argument passing.
5399 This functionality can be overridden by an explicit
5400 regparm value. */
5401 for (regno = AX_REG; regno <= DI_REG; regno++)
5402 if (fixed_regs[regno])
5403 globals++;
5404
5405 local_regparm
5406 = globals < local_regparm ? local_regparm - globals : 0;
5407
5408 if (local_regparm > regparm)
5409 regparm = local_regparm;
5410 }
5411 }
5412
5413 return regparm;
5414 }
5415
5416 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5417 DFmode (2) arguments in SSE registers for a function with the
5418 indicated TYPE and DECL. DECL may be NULL when calling function
5419 indirectly or considering a libcall. Otherwise return 0. */
5420
5421 static int
5422 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5423 {
5424 gcc_assert (!TARGET_64BIT);
5425
5426 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5427 by the sseregparm attribute. */
5428 if (TARGET_SSEREGPARM
5429 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5430 {
5431 if (!TARGET_SSE)
5432 {
5433 if (warn)
5434 {
5435 if (decl)
5436 error ("calling %qD with attribute sseregparm without "
5437 "SSE/SSE2 enabled", decl);
5438 else
5439 error ("calling %qT with attribute sseregparm without "
5440 "SSE/SSE2 enabled", type);
5441 }
5442 return 0;
5443 }
5444
5445 return 2;
5446 }
5447
5448 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5449 (and DFmode for SSE2) arguments in SSE registers. */
5450 if (decl && TARGET_SSE_MATH && optimize
5451 && !(profile_flag && !flag_fentry))
5452 {
5453 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5454 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5455 if (i && i->local && i->can_change_signature)
5456 return TARGET_SSE2 ? 2 : 1;
5457 }
5458
5459 return 0;
5460 }
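/* Illustrative sketch, not part of the original source (hypothetical
   declaration): with SSE2 enabled, the sseregparm attribute moves float
   and double arguments into SSE registers, e.g.

       double __attribute__((sseregparm)) f (double x, double y);
       // x and y are expected in %xmm0/%xmm1 rather than on the stack;
       // without SSE the errors above are emitted instead.
*/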
5461
5462 /* Return true if EAX is live at the start of the function. Used by
5463 ix86_expand_prologue to determine if we need special help before
5464 calling allocate_stack_worker. */
5465
5466 static bool
5467 ix86_eax_live_at_start_p (void)
5468 {
5469 /* Cheat. Don't bother working forward from ix86_function_regparm
5470 to the function type to whether an actual argument is located in
5471 eax. Instead just look at cfg info, which is still close enough
5472 to correct at this point. This gives false positives for broken
5473 functions that might use uninitialized data that happens to be
5474 allocated in eax, but who cares? */
5475 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5476 }
5477
5478 static bool
5479 ix86_keep_aggregate_return_pointer (tree fntype)
5480 {
5481 tree attr;
5482
5483 if (!TARGET_64BIT)
5484 {
5485 attr = lookup_attribute ("callee_pop_aggregate_return",
5486 TYPE_ATTRIBUTES (fntype));
5487 if (attr)
5488 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5489
5490 /* For 32-bit MS-ABI the default is to keep aggregate
5491 return pointer. */
5492 if (ix86_function_type_abi (fntype) == MS_ABI)
5493 return true;
5494 }
5495 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5496 }
5497
5498 /* Value is the number of bytes of arguments automatically
5499 popped when returning from a subroutine call.
5500 FUNDECL is the declaration node of the function (as a tree),
5501 FUNTYPE is the data type of the function (as a tree),
5502 or for a library call it is an identifier node for the subroutine name.
5503 SIZE is the number of bytes of arguments passed on the stack.
5504
5505 On the 80386, the RTD insn may be used to pop them if the number
5506 of args is fixed, but if the number is variable then the caller
5507 must pop them all. RTD can't be used for library calls now
5508 because the library is compiled with the Unix compiler.
5509 Use of RTD is a selectable option, since it is incompatible with
5510 standard Unix calling sequences. If the option is not selected,
5511 the caller must always pop the args.
5512
5513 The attribute stdcall is equivalent to RTD on a per module basis. */
5514
5515 static int
5516 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5517 {
5518 unsigned int ccvt;
5519
5520 /* None of the 64-bit ABIs pop arguments. */
5521 if (TARGET_64BIT)
5522 return 0;
5523
5524 ccvt = ix86_get_callcvt (funtype);
5525
5526 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5527 | IX86_CALLCVT_THISCALL)) != 0
5528 && ! stdarg_p (funtype))
5529 return size;
5530
5531 /* Lose any fake structure return argument if it is passed on the stack. */
5532 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5533 && !ix86_keep_aggregate_return_pointer (funtype))
5534 {
5535 int nregs = ix86_function_regparm (funtype, fundecl);
5536 if (nregs == 0)
5537 return GET_MODE_SIZE (Pmode);
5538 }
5539
5540 return 0;
5541 }
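/* Illustrative sketch, not part of the original source (hypothetical
   declaration): for a 32-bit stdcall function the callee pops its own
   stack arguments, so the hook above returns SIZE:

       void __attribute__((stdcall)) f (int a, int b);   // returns with
                                                          // "ret $8"
*/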
5542 \f
5543 /* Argument support functions. */
5544
5545 /* Return true when REGNO may be used to pass function parameters. */
5546 bool
5547 ix86_function_arg_regno_p (int regno)
5548 {
5549 int i;
5550 const int *parm_regs;
5551
5552 if (!TARGET_64BIT)
5553 {
5554 if (TARGET_MACHO)
5555 return (regno < REGPARM_MAX
5556 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5557 else
5558 return (regno < REGPARM_MAX
5559 || (TARGET_MMX && MMX_REGNO_P (regno)
5560 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5561 || (TARGET_SSE && SSE_REGNO_P (regno)
5562 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5563 }
5564
5565 if (TARGET_MACHO)
5566 {
5567 if (SSE_REGNO_P (regno) && TARGET_SSE)
5568 return true;
5569 }
5570 else
5571 {
5572 if (TARGET_SSE && SSE_REGNO_P (regno)
5573 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5574 return true;
5575 }
5576
5577 /* TODO: The function should depend on the current function's ABI, but
5578 builtins.c would need updating then.  Therefore we use the
5579 default ABI. */
5580
5581 /* RAX is used as hidden argument to va_arg functions. */
5582 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5583 return true;
5584
5585 if (ix86_abi == MS_ABI)
5586 parm_regs = x86_64_ms_abi_int_parameter_registers;
5587 else
5588 parm_regs = x86_64_int_parameter_registers;
5589 for (i = 0; i < (ix86_abi == MS_ABI
5590 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5591 if (regno == parm_regs[i])
5592 return true;
5593 return false;
5594 }
5595
5596 /* Return true if we do not know how to pass TYPE solely in registers. */
5597
5598 static bool
5599 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5600 {
5601 if (must_pass_in_stack_var_size_or_pad (mode, type))
5602 return true;
5603
5604 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5605 The layout_type routine is crafty and tries to trick us into passing
5606 currently unsupported vector types on the stack by using TImode. */
5607 return (!TARGET_64BIT && mode == TImode
5608 && type && TREE_CODE (type) != VECTOR_TYPE);
5609 }
5610
5611 /* Return the size, in bytes, of the area reserved for arguments passed
5612 in registers for the function represented by FNDECL, depending on the
5613 ABI used. */
5614 int
5615 ix86_reg_parm_stack_space (const_tree fndecl)
5616 {
5617 enum calling_abi call_abi = SYSV_ABI;
5618 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5619 call_abi = ix86_function_abi (fndecl);
5620 else
5621 call_abi = ix86_function_type_abi (fndecl);
5622 if (TARGET_64BIT && call_abi == MS_ABI)
5623 return 32;
5624 return 0;
5625 }
5626
5627 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5628 call abi used. */
5629 enum calling_abi
5630 ix86_function_type_abi (const_tree fntype)
5631 {
5632 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5633 {
5634 enum calling_abi abi = ix86_abi;
5635 if (abi == SYSV_ABI)
5636 {
5637 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5638 abi = MS_ABI;
5639 }
5640 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5641 abi = SYSV_ABI;
5642 return abi;
5643 }
5644 return ix86_abi;
5645 }
5646
5647 static bool
5648 ix86_function_ms_hook_prologue (const_tree fn)
5649 {
5650 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5651 {
5652 if (decl_function_context (fn) != NULL_TREE)
5653 error_at (DECL_SOURCE_LOCATION (fn),
5654 "ms_hook_prologue is not compatible with nested function");
5655 else
5656 return true;
5657 }
5658 return false;
5659 }
5660
5661 static enum calling_abi
5662 ix86_function_abi (const_tree fndecl)
5663 {
5664 if (! fndecl)
5665 return ix86_abi;
5666 return ix86_function_type_abi (TREE_TYPE (fndecl));
5667 }
5668
5669 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5670 call abi used. */
5671 enum calling_abi
5672 ix86_cfun_abi (void)
5673 {
5674 if (! cfun)
5675 return ix86_abi;
5676 return cfun->machine->call_abi;
5677 }
5678
5679 /* Write the extra assembler code needed to declare a function properly. */
5680
5681 void
5682 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5683 tree decl)
5684 {
5685 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5686
5687 if (is_ms_hook)
5688 {
5689 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5690 unsigned int filler_cc = 0xcccccccc;
5691
5692 for (i = 0; i < filler_count; i += 4)
5693 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5694 }
5695
5696 #ifdef SUBTARGET_ASM_UNWIND_INIT
5697 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5698 #endif
5699
5700 ASM_OUTPUT_LABEL (asm_out_file, fname);
5701
5702 /* Output magic byte marker, if hot-patch attribute is set. */
5703 if (is_ms_hook)
5704 {
5705 if (TARGET_64BIT)
5706 {
5707 /* leaq [%rsp + 0], %rsp */
5708 asm_fprintf (asm_out_file, ASM_BYTE
5709 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5710 }
5711 else
5712 {
5713 /* movl.s %edi, %edi
5714 push %ebp
5715 movl.s %esp, %ebp */
5716 asm_fprintf (asm_out_file, ASM_BYTE
5717 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5718 }
5719 }
5720 }
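/* Illustrative sketch, not part of the original source (hypothetical
   declaration): a hot-patchable function gets the filler and magic bytes
   emitted above, e.g. in 32-bit code

       void __attribute__((ms_hook_prologue)) hookable (void);
       // 16 bytes of 0xCC padding before the label, then
       // movl.s %edi, %edi; push %ebp; movl.s %esp, %ebp
*/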
5721
5722 /* regclass.c */
5723 extern void init_regs (void);
5724
5725 /* Implementation of the call abi switching target hook.  The call
5726 register sets specific to FNDECL are selected.  See also
5727 ix86_conditional_register_usage for more details. */
5728 void
5729 ix86_call_abi_override (const_tree fndecl)
5730 {
5731 if (fndecl == NULL_TREE)
5732 cfun->machine->call_abi = ix86_abi;
5733 else
5734 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5735 }
5736
5737 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.  Avoid
5738 expensive re-initialization of init_regs each time we switch function context
5739 since this is needed only during RTL expansion. */
5740 static void
5741 ix86_maybe_switch_abi (void)
5742 {
5743 if (TARGET_64BIT &&
5744 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5745 reinit_regs ();
5746 }
5747
5748 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5749 for a call to a function whose data type is FNTYPE.
5750 For a library call, FNTYPE is 0. */
5751
5752 void
5753 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5754 tree fntype, /* tree ptr for function decl */
5755 rtx libname, /* SYMBOL_REF of library name or 0 */
5756 tree fndecl,
5757 int caller)
5758 {
5759 struct cgraph_local_info *i;
5760 tree fnret_type;
5761
5762 memset (cum, 0, sizeof (*cum));
5763
5764 /* Initialize for the current callee. */
5765 if (caller)
5766 {
5767 cfun->machine->callee_pass_avx256_p = false;
5768 cfun->machine->callee_return_avx256_p = false;
5769 }
5770
5771 if (fndecl)
5772 {
5773 i = cgraph_local_info (fndecl);
5774 cum->call_abi = ix86_function_abi (fndecl);
5775 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5776 }
5777 else
5778 {
5779 i = NULL;
5780 cum->call_abi = ix86_function_type_abi (fntype);
5781 if (fntype)
5782 fnret_type = TREE_TYPE (fntype);
5783 else
5784 fnret_type = NULL;
5785 }
5786
5787 if (TARGET_VZEROUPPER && fnret_type)
5788 {
5789 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5790 false);
5791 if (function_pass_avx256_p (fnret_value))
5792 {
5793 /* The return value of this function uses 256bit AVX modes. */
5794 if (caller)
5795 cfun->machine->callee_return_avx256_p = true;
5796 else
5797 cfun->machine->caller_return_avx256_p = true;
5798 }
5799 }
5800
5801 cum->caller = caller;
5802
5803 /* Set up the number of registers to use for passing arguments. */
5804
5805 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5806 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5807 "or subtarget optimization implying it");
5808 cum->nregs = ix86_regparm;
5809 if (TARGET_64BIT)
5810 {
5811 cum->nregs = (cum->call_abi == SYSV_ABI
5812 ? X86_64_REGPARM_MAX
5813 : X86_64_MS_REGPARM_MAX);
5814 }
5815 if (TARGET_SSE)
5816 {
5817 cum->sse_nregs = SSE_REGPARM_MAX;
5818 if (TARGET_64BIT)
5819 {
5820 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5821 ? X86_64_SSE_REGPARM_MAX
5822 : X86_64_MS_SSE_REGPARM_MAX);
5823 }
5824 }
5825 if (TARGET_MMX)
5826 cum->mmx_nregs = MMX_REGPARM_MAX;
5827 cum->warn_avx = true;
5828 cum->warn_sse = true;
5829 cum->warn_mmx = true;
5830
5831 /* Because the type might mismatch between caller and callee, we need to
5832 use the actual type of the function for local calls.
5833 FIXME: cgraph_analyze can be told to actually record if a function uses
5834 va_start, so for local functions maybe_vaarg can be made aggressive,
5835 helping K&R code.
5836 FIXME: once the type system is fixed, we won't need this code anymore. */
5837 if (i && i->local && i->can_change_signature)
5838 fntype = TREE_TYPE (fndecl);
5839 cum->maybe_vaarg = (fntype
5840 ? (!prototype_p (fntype) || stdarg_p (fntype))
5841 : !libname);
5842
5843 if (!TARGET_64BIT)
5844 {
5845 /* If there are variable arguments, then we won't pass anything
5846 in registers in 32-bit mode. */
5847 if (stdarg_p (fntype))
5848 {
5849 cum->nregs = 0;
5850 cum->sse_nregs = 0;
5851 cum->mmx_nregs = 0;
5852 cum->warn_avx = 0;
5853 cum->warn_sse = 0;
5854 cum->warn_mmx = 0;
5855 return;
5856 }
5857
5858 /* Use ecx and edx registers if function has fastcall attribute,
5859 else look for regparm information. */
5860 if (fntype)
5861 {
5862 unsigned int ccvt = ix86_get_callcvt (fntype);
5863 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5864 {
5865 cum->nregs = 1;
5866 cum->fastcall = 1; /* Same first register as in fastcall. */
5867 }
5868 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5869 {
5870 cum->nregs = 2;
5871 cum->fastcall = 1;
5872 }
5873 else
5874 cum->nregs = ix86_function_regparm (fntype, fndecl);
5875 }
5876
5877 /* Set up the number of SSE registers used for passing SFmode
5878 and DFmode arguments. Warn for mismatching ABI. */
5879 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5880 }
5881 }
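/* Illustrative sketch of the 32-bit register counts set up above
   (hypothetical declarations, not from the original source):

       void __attribute__((fastcall)) f (int a, int b, int c);
       // a -> %ecx, b -> %edx, c -> stack  (cum->nregs == 2)

       void g (int a, ...);
       // variadic: nothing is passed in registers (cum->nregs == 0)
*/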
5882
5883 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5884 But in the case of vector types, it is some vector mode.
5885
5886 When we have only some of our vector isa extensions enabled, then there
5887 are some modes for which vector_mode_supported_p is false. For these
5888 modes, the generic vector support in gcc will choose some non-vector mode
5889 in order to implement the type. By computing the natural mode, we'll
5890 select the proper ABI location for the operand and not depend on whatever
5891 the middle-end decides to do with these vector types.
5892
5893 The middle-end can't deal with vector types larger than 16 bytes.  In
5894 this case, we return the original mode and warn about the ABI change if
5895 CUM isn't NULL. */
5896
5897 static enum machine_mode
5898 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5899 {
5900 enum machine_mode mode = TYPE_MODE (type);
5901
5902 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5903 {
5904 HOST_WIDE_INT size = int_size_in_bytes (type);
5905 if ((size == 8 || size == 16 || size == 32)
5906 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5907 && TYPE_VECTOR_SUBPARTS (type) > 1)
5908 {
5909 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5910
5911 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5912 mode = MIN_MODE_VECTOR_FLOAT;
5913 else
5914 mode = MIN_MODE_VECTOR_INT;
5915
5916 /* Get the mode which has this inner mode and number of units. */
5917 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5918 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5919 && GET_MODE_INNER (mode) == innermode)
5920 {
5921 if (size == 32 && !TARGET_AVX)
5922 {
5923 static bool warnedavx;
5924
5925 if (cum
5926 && !warnedavx
5927 && cum->warn_avx)
5928 {
5929 warnedavx = true;
5930 warning (0, "AVX vector argument without AVX "
5931 "enabled changes the ABI");
5932 }
5933 return TYPE_MODE (type);
5934 }
5935 else if ((size == 8 || size == 16) && !TARGET_SSE)
5936 {
5937 static bool warnedsse;
5938
5939 if (cum
5940 && !warnedsse
5941 && cum->warn_sse)
5942 {
5943 warnedsse = true;
5944 warning (0, "SSE vector argument without SSE "
5945 "enabled changes the ABI");
5946 }
5947 return mode;
5948 }
5949 else
5950 return mode;
5951 }
5952
5953 gcc_unreachable ();
5954 }
5955 }
5956
5957 return mode;
5958 }
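/* Illustrative sketch, not part of the original source (hypothetical type,
   assumed flags): a 256-bit vector argument compiled without -mavx keeps
   its non-vector TYPE_MODE and triggers the one-time "changes the ABI"
   warning above, so the resulting ABI differs from AVX-enabled code:

       typedef float v8sf __attribute__((vector_size (32)));
       void f (v8sf x);
*/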
5959
5960 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5961 this may not agree with the mode that the type system has chosen for the
5962 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5963 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5964
5965 static rtx
5966 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5967 unsigned int regno)
5968 {
5969 rtx tmp;
5970
5971 if (orig_mode != BLKmode)
5972 tmp = gen_rtx_REG (orig_mode, regno);
5973 else
5974 {
5975 tmp = gen_rtx_REG (mode, regno);
5976 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5977 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5978 }
5979
5980 return tmp;
5981 }
5982
5983 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5984 The goal of this code is to classify each 8 bytes of an incoming argument
5985 by register class and assign registers accordingly. */
5986
5987 /* Return the union class of CLASS1 and CLASS2.
5988 See the x86-64 PS ABI for details. */
5989
5990 static enum x86_64_reg_class
5991 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5992 {
5993 /* Rule #1: If both classes are equal, this is the resulting class. */
5994 if (class1 == class2)
5995 return class1;
5996
5997 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5998 the other class. */
5999 if (class1 == X86_64_NO_CLASS)
6000 return class2;
6001 if (class2 == X86_64_NO_CLASS)
6002 return class1;
6003
6004 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6005 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6006 return X86_64_MEMORY_CLASS;
6007
6008 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6009 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6010 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6011 return X86_64_INTEGERSI_CLASS;
6012 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6013 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6014 return X86_64_INTEGER_CLASS;
6015
6016 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6017 MEMORY is used. */
6018 if (class1 == X86_64_X87_CLASS
6019 || class1 == X86_64_X87UP_CLASS
6020 || class1 == X86_64_COMPLEX_X87_CLASS
6021 || class2 == X86_64_X87_CLASS
6022 || class2 == X86_64_X87UP_CLASS
6023 || class2 == X86_64_COMPLEX_X87_CLASS)
6024 return X86_64_MEMORY_CLASS;
6025
6026 /* Rule #6: Otherwise class SSE is used. */
6027 return X86_64_SSE_CLASS;
6028 }
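/* Illustrative worked example, not part of the original source: merging
   X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS yields
   X86_64_INTEGERSI_CLASS (rule #4), while merging X86_64_SSE_CLASS with
   X86_64_X87_CLASS yields X86_64_MEMORY_CLASS (rule #5).  */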
6029
6030 /* Classify the argument of type TYPE and mode MODE.
6031 CLASSES will be filled by the register class used to pass each word
6032 of the operand. The number of words is returned. In case the parameter
6033 should be passed in memory, 0 is returned. As a special case for zero
6034 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6035
6036 BIT_OFFSET is used internally for handling records and specifies the
6037 offset in bits modulo 256 to avoid overflow cases.
6038
6039 See the x86-64 PS ABI for details.
6040 */
6041
6042 static int
6043 classify_argument (enum machine_mode mode, const_tree type,
6044 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6045 {
6046 HOST_WIDE_INT bytes =
6047 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6048 int words
6049 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6050
6051 /* Variable sized entities are always passed/returned in memory. */
6052 if (bytes < 0)
6053 return 0;
6054
6055 if (mode != VOIDmode
6056 && targetm.calls.must_pass_in_stack (mode, type))
6057 return 0;
6058
6059 if (type && AGGREGATE_TYPE_P (type))
6060 {
6061 int i;
6062 tree field;
6063 enum x86_64_reg_class subclasses[MAX_CLASSES];
6064
6065 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6066 if (bytes > 32)
6067 return 0;
6068
6069 for (i = 0; i < words; i++)
6070 classes[i] = X86_64_NO_CLASS;
6071
6072 /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
6073 signal memory class, so handle it as a special case. */
6074 if (!words)
6075 {
6076 classes[0] = X86_64_NO_CLASS;
6077 return 1;
6078 }
6079
6080 /* Classify each field of record and merge classes. */
6081 switch (TREE_CODE (type))
6082 {
6083 case RECORD_TYPE:
6084 /* And now merge the fields of structure. */
6085 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6086 {
6087 if (TREE_CODE (field) == FIELD_DECL)
6088 {
6089 int num;
6090
6091 if (TREE_TYPE (field) == error_mark_node)
6092 continue;
6093
6094 /* Bitfields are always classified as integer. Handle them
6095 early, since later code would consider them to be
6096 misaligned integers. */
6097 if (DECL_BIT_FIELD (field))
6098 {
6099 for (i = (int_bit_position (field)
6100 + (bit_offset % 64)) / 8 / 8;
6101 i < ((int_bit_position (field) + (bit_offset % 64))
6102 + tree_low_cst (DECL_SIZE (field), 0)
6103 + 63) / 8 / 8; i++)
6104 classes[i] =
6105 merge_classes (X86_64_INTEGER_CLASS,
6106 classes[i]);
6107 }
6108 else
6109 {
6110 int pos;
6111
6112 type = TREE_TYPE (field);
6113
6114 /* Flexible array member is ignored. */
6115 if (TYPE_MODE (type) == BLKmode
6116 && TREE_CODE (type) == ARRAY_TYPE
6117 && TYPE_SIZE (type) == NULL_TREE
6118 && TYPE_DOMAIN (type) != NULL_TREE
6119 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6120 == NULL_TREE))
6121 {
6122 static bool warned;
6123
6124 if (!warned && warn_psabi)
6125 {
6126 warned = true;
6127 inform (input_location,
6128 "the ABI of passing struct with"
6129 " a flexible array member has"
6130 " changed in GCC 4.4");
6131 }
6132 continue;
6133 }
6134 num = classify_argument (TYPE_MODE (type), type,
6135 subclasses,
6136 (int_bit_position (field)
6137 + bit_offset) % 256);
6138 if (!num)
6139 return 0;
6140 pos = (int_bit_position (field)
6141 + (bit_offset % 64)) / 8 / 8;
6142 for (i = 0; i < num && (i + pos) < words; i++)
6143 classes[i + pos] =
6144 merge_classes (subclasses[i], classes[i + pos]);
6145 }
6146 }
6147 }
6148 break;
6149
6150 case ARRAY_TYPE:
6151 /* Arrays are handled as small records. */
6152 {
6153 int num;
6154 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6155 TREE_TYPE (type), subclasses, bit_offset);
6156 if (!num)
6157 return 0;
6158
6159 /* The partial classes are now full classes. */
6160 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6161 subclasses[0] = X86_64_SSE_CLASS;
6162 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6163 && !((bit_offset % 64) == 0 && bytes == 4))
6164 subclasses[0] = X86_64_INTEGER_CLASS;
6165
6166 for (i = 0; i < words; i++)
6167 classes[i] = subclasses[i % num];
6168
6169 break;
6170 }
6171 case UNION_TYPE:
6172 case QUAL_UNION_TYPE:
6173 /* Unions are similar to RECORD_TYPE but offset is always 0.
6174 */
6175 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6176 {
6177 if (TREE_CODE (field) == FIELD_DECL)
6178 {
6179 int num;
6180
6181 if (TREE_TYPE (field) == error_mark_node)
6182 continue;
6183
6184 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6185 TREE_TYPE (field), subclasses,
6186 bit_offset);
6187 if (!num)
6188 return 0;
6189 for (i = 0; i < num; i++)
6190 classes[i] = merge_classes (subclasses[i], classes[i]);
6191 }
6192 }
6193 break;
6194
6195 default:
6196 gcc_unreachable ();
6197 }
6198
6199 if (words > 2)
6200 {
6201 /* When the size is > 16 bytes, if the first class isn't
6202 X86_64_SSE_CLASS or any of the other classes aren't
6203 X86_64_SSEUP_CLASS, everything should be passed in
6204 memory. */
6205 if (classes[0] != X86_64_SSE_CLASS)
6206 return 0;
6207
6208 for (i = 1; i < words; i++)
6209 if (classes[i] != X86_64_SSEUP_CLASS)
6210 return 0;
6211 }
6212
6213 /* Final merger cleanup. */
6214 for (i = 0; i < words; i++)
6215 {
6216 /* If one class is MEMORY, everything should be passed in
6217 memory. */
6218 if (classes[i] == X86_64_MEMORY_CLASS)
6219 return 0;
6220
6221 /* The X86_64_SSEUP_CLASS should be always preceded by
6222 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6223 if (classes[i] == X86_64_SSEUP_CLASS
6224 && classes[i - 1] != X86_64_SSE_CLASS
6225 && classes[i - 1] != X86_64_SSEUP_CLASS)
6226 {
6227 /* The first one should never be X86_64_SSEUP_CLASS. */
6228 gcc_assert (i != 0);
6229 classes[i] = X86_64_SSE_CLASS;
6230 }
6231
6232 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6233 everything should be passed in memory. */
6234 if (classes[i] == X86_64_X87UP_CLASS
6235 && (classes[i - 1] != X86_64_X87_CLASS))
6236 {
6237 static bool warned;
6238
6239 /* The first one should never be X86_64_X87UP_CLASS. */
6240 gcc_assert (i != 0);
6241 if (!warned && warn_psabi)
6242 {
6243 warned = true;
6244 inform (input_location,
6245 "the ABI of passing union with long double"
6246 " has changed in GCC 4.4");
6247 }
6248 return 0;
6249 }
6250 }
6251 return words;
6252 }
6253
6254 /* Compute the alignment needed.  We align all types to natural boundaries,
6255 with the exception of XFmode, which is aligned to 64 bits. */
6256 if (mode != VOIDmode && mode != BLKmode)
6257 {
6258 int mode_alignment = GET_MODE_BITSIZE (mode);
6259
6260 if (mode == XFmode)
6261 mode_alignment = 128;
6262 else if (mode == XCmode)
6263 mode_alignment = 256;
6264 if (COMPLEX_MODE_P (mode))
6265 mode_alignment /= 2;
6266 /* Misaligned fields are always returned in memory. */
6267 if (bit_offset % mode_alignment)
6268 return 0;
6269 }
6270
6271 /* For V1xx modes, just use the base mode. */
6272 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6273 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6274 mode = GET_MODE_INNER (mode);
6275
6276 /* Classification of atomic types. */
6277 switch (mode)
6278 {
6279 case SDmode:
6280 case DDmode:
6281 classes[0] = X86_64_SSE_CLASS;
6282 return 1;
6283 case TDmode:
6284 classes[0] = X86_64_SSE_CLASS;
6285 classes[1] = X86_64_SSEUP_CLASS;
6286 return 2;
6287 case DImode:
6288 case SImode:
6289 case HImode:
6290 case QImode:
6291 case CSImode:
6292 case CHImode:
6293 case CQImode:
6294 {
6295 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6296
6297 if (size <= 32)
6298 {
6299 classes[0] = X86_64_INTEGERSI_CLASS;
6300 return 1;
6301 }
6302 else if (size <= 64)
6303 {
6304 classes[0] = X86_64_INTEGER_CLASS;
6305 return 1;
6306 }
6307 else if (size <= 64+32)
6308 {
6309 classes[0] = X86_64_INTEGER_CLASS;
6310 classes[1] = X86_64_INTEGERSI_CLASS;
6311 return 2;
6312 }
6313 else if (size <= 64+64)
6314 {
6315 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6316 return 2;
6317 }
6318 else
6319 gcc_unreachable ();
6320 }
6321 case CDImode:
6322 case TImode:
6323 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6324 return 2;
6325 case COImode:
6326 case OImode:
6327 /* OImode shouldn't be used directly. */
6328 gcc_unreachable ();
6329 case CTImode:
6330 return 0;
6331 case SFmode:
6332 if (!(bit_offset % 64))
6333 classes[0] = X86_64_SSESF_CLASS;
6334 else
6335 classes[0] = X86_64_SSE_CLASS;
6336 return 1;
6337 case DFmode:
6338 classes[0] = X86_64_SSEDF_CLASS;
6339 return 1;
6340 case XFmode:
6341 classes[0] = X86_64_X87_CLASS;
6342 classes[1] = X86_64_X87UP_CLASS;
6343 return 2;
6344 case TFmode:
6345 classes[0] = X86_64_SSE_CLASS;
6346 classes[1] = X86_64_SSEUP_CLASS;
6347 return 2;
6348 case SCmode:
6349 classes[0] = X86_64_SSE_CLASS;
6350 if (!(bit_offset % 64))
6351 return 1;
6352 else
6353 {
6354 static bool warned;
6355
6356 if (!warned && warn_psabi)
6357 {
6358 warned = true;
6359 inform (input_location,
6360 "the ABI of passing structure with complex float"
6361 " member has changed in GCC 4.4");
6362 }
6363 classes[1] = X86_64_SSESF_CLASS;
6364 return 2;
6365 }
6366 case DCmode:
6367 classes[0] = X86_64_SSEDF_CLASS;
6368 classes[1] = X86_64_SSEDF_CLASS;
6369 return 2;
6370 case XCmode:
6371 classes[0] = X86_64_COMPLEX_X87_CLASS;
6372 return 1;
6373 case TCmode:
6374 /* This mode is larger than 16 bytes. */
6375 return 0;
6376 case V8SFmode:
6377 case V8SImode:
6378 case V32QImode:
6379 case V16HImode:
6380 case V4DFmode:
6381 case V4DImode:
6382 classes[0] = X86_64_SSE_CLASS;
6383 classes[1] = X86_64_SSEUP_CLASS;
6384 classes[2] = X86_64_SSEUP_CLASS;
6385 classes[3] = X86_64_SSEUP_CLASS;
6386 return 4;
6387 case V4SFmode:
6388 case V4SImode:
6389 case V16QImode:
6390 case V8HImode:
6391 case V2DFmode:
6392 case V2DImode:
6393 classes[0] = X86_64_SSE_CLASS;
6394 classes[1] = X86_64_SSEUP_CLASS;
6395 return 2;
6396 case V1TImode:
6397 case V1DImode:
6398 case V2SFmode:
6399 case V2SImode:
6400 case V4HImode:
6401 case V8QImode:
6402 classes[0] = X86_64_SSE_CLASS;
6403 return 1;
6404 case BLKmode:
6405 case VOIDmode:
6406 return 0;
6407 default:
6408 gcc_assert (VECTOR_MODE_P (mode));
6409
6410 if (bytes > 16)
6411 return 0;
6412
6413 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6414
6415 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6416 classes[0] = X86_64_INTEGERSI_CLASS;
6417 else
6418 classes[0] = X86_64_INTEGER_CLASS;
6419 classes[1] = X86_64_INTEGER_CLASS;
6420 return 1 + (bytes > 8);
6421 }
6422 }
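/* Illustrative sketch of the classification above (hypothetical struct,
   not from the original source):

       struct s { double d; int i; };   // 16 bytes, two eightbytes
       // eightbyte 0 (d) -> X86_64_SSEDF_CLASS,
       // eightbyte 1 (i) -> X86_64_INTEGERSI_CLASS,
       // so the struct travels in one SSE and one integer register.
*/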
6423
6424 /* Examine the argument and set the number of registers required in each
6425 class.  Return 0 iff the parameter should be passed in memory. */
6426 static int
6427 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6428 int *int_nregs, int *sse_nregs)
6429 {
6430 enum x86_64_reg_class regclass[MAX_CLASSES];
6431 int n = classify_argument (mode, type, regclass, 0);
6432
6433 *int_nregs = 0;
6434 *sse_nregs = 0;
6435 if (!n)
6436 return 0;
6437 for (n--; n >= 0; n--)
6438 switch (regclass[n])
6439 {
6440 case X86_64_INTEGER_CLASS:
6441 case X86_64_INTEGERSI_CLASS:
6442 (*int_nregs)++;
6443 break;
6444 case X86_64_SSE_CLASS:
6445 case X86_64_SSESF_CLASS:
6446 case X86_64_SSEDF_CLASS:
6447 (*sse_nregs)++;
6448 break;
6449 case X86_64_NO_CLASS:
6450 case X86_64_SSEUP_CLASS:
6451 break;
6452 case X86_64_X87_CLASS:
6453 case X86_64_X87UP_CLASS:
6454 if (!in_return)
6455 return 0;
6456 break;
6457 case X86_64_COMPLEX_X87_CLASS:
6458 return in_return ? 2 : 0;
6459 case X86_64_MEMORY_CLASS:
6460 gcc_unreachable ();
6461 }
6462 return 1;
6463 }
6464
6465 /* Construct container for the argument used by GCC interface. See
6466 FUNCTION_ARG for the detailed description. */
6467
6468 static rtx
6469 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6470 const_tree type, int in_return, int nintregs, int nsseregs,
6471 const int *intreg, int sse_regno)
6472 {
6473 /* The following variables hold the static issued_error state. */
6474 static bool issued_sse_arg_error;
6475 static bool issued_sse_ret_error;
6476 static bool issued_x87_ret_error;
6477
6478 enum machine_mode tmpmode;
6479 int bytes =
6480 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6481 enum x86_64_reg_class regclass[MAX_CLASSES];
6482 int n;
6483 int i;
6484 int nexps = 0;
6485 int needed_sseregs, needed_intregs;
6486 rtx exp[MAX_CLASSES];
6487 rtx ret;
6488
6489 n = classify_argument (mode, type, regclass, 0);
6490 if (!n)
6491 return NULL;
6492 if (!examine_argument (mode, type, in_return, &needed_intregs,
6493 &needed_sseregs))
6494 return NULL;
6495 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6496 return NULL;
6497
6498 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6499 some less clueful developer tries to use floating-point anyway. */
6500 if (needed_sseregs && !TARGET_SSE)
6501 {
6502 if (in_return)
6503 {
6504 if (!issued_sse_ret_error)
6505 {
6506 error ("SSE register return with SSE disabled");
6507 issued_sse_ret_error = true;
6508 }
6509 }
6510 else if (!issued_sse_arg_error)
6511 {
6512 error ("SSE register argument with SSE disabled");
6513 issued_sse_arg_error = true;
6514 }
6515 return NULL;
6516 }
6517
6518 /* Likewise, error if the ABI requires us to return values in the
6519 x87 registers and the user specified -mno-80387. */
6520 if (!TARGET_80387 && in_return)
6521 for (i = 0; i < n; i++)
6522 if (regclass[i] == X86_64_X87_CLASS
6523 || regclass[i] == X86_64_X87UP_CLASS
6524 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6525 {
6526 if (!issued_x87_ret_error)
6527 {
6528 error ("x87 register return with x87 disabled");
6529 issued_x87_ret_error = true;
6530 }
6531 return NULL;
6532 }
6533
6534 /* First construct the simple cases.  Avoid SCmode, since we want to use
6535 a single register to pass this type. */
6536 if (n == 1 && mode != SCmode)
6537 switch (regclass[0])
6538 {
6539 case X86_64_INTEGER_CLASS:
6540 case X86_64_INTEGERSI_CLASS:
6541 return gen_rtx_REG (mode, intreg[0]);
6542 case X86_64_SSE_CLASS:
6543 case X86_64_SSESF_CLASS:
6544 case X86_64_SSEDF_CLASS:
6545 if (mode != BLKmode)
6546 return gen_reg_or_parallel (mode, orig_mode,
6547 SSE_REGNO (sse_regno));
6548 break;
6549 case X86_64_X87_CLASS:
6550 case X86_64_COMPLEX_X87_CLASS:
6551 return gen_rtx_REG (mode, FIRST_STACK_REG);
6552 case X86_64_NO_CLASS:
6553 /* Zero sized array, struct or class. */
6554 return NULL;
6555 default:
6556 gcc_unreachable ();
6557 }
6558 if (n == 2
6559 && regclass[0] == X86_64_SSE_CLASS
6560 && regclass[1] == X86_64_SSEUP_CLASS
6561 && mode != BLKmode)
6562 return gen_reg_or_parallel (mode, orig_mode,
6563 SSE_REGNO (sse_regno));
6564 if (n == 4
6565 && regclass[0] == X86_64_SSE_CLASS
6566 && regclass[1] == X86_64_SSEUP_CLASS
6567 && regclass[2] == X86_64_SSEUP_CLASS
6568 && regclass[3] == X86_64_SSEUP_CLASS
6569 && mode != BLKmode)
6570 return gen_reg_or_parallel (mode, orig_mode,
6571 SSE_REGNO (sse_regno));
6572 if (n == 2
6573 && regclass[0] == X86_64_X87_CLASS
6574 && regclass[1] == X86_64_X87UP_CLASS)
6575 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6576
6577 if (n == 2
6578 && regclass[0] == X86_64_INTEGER_CLASS
6579 && regclass[1] == X86_64_INTEGER_CLASS
6580 && (mode == CDImode || mode == TImode || mode == TFmode)
6581 && intreg[0] + 1 == intreg[1])
6582 return gen_rtx_REG (mode, intreg[0]);
6583
6584 /* Otherwise figure out the entries of the PARALLEL. */
6585 for (i = 0; i < n; i++)
6586 {
6587 int pos;
6588
6589 switch (regclass[i])
6590 {
6591 case X86_64_NO_CLASS:
6592 break;
6593 case X86_64_INTEGER_CLASS:
6594 case X86_64_INTEGERSI_CLASS:
6595 /* Merge TImodes on aligned occasions here too. */
6596 if (i * 8 + 8 > bytes)
6597 tmpmode
6598 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6599 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6600 tmpmode = SImode;
6601 else
6602 tmpmode = DImode;
6603 /* We've requested 24 bytes for which we
6604 don't have a mode.  Use DImode. */
6605 if (tmpmode == BLKmode)
6606 tmpmode = DImode;
6607 exp [nexps++]
6608 = gen_rtx_EXPR_LIST (VOIDmode,
6609 gen_rtx_REG (tmpmode, *intreg),
6610 GEN_INT (i*8));
6611 intreg++;
6612 break;
6613 case X86_64_SSESF_CLASS:
6614 exp [nexps++]
6615 = gen_rtx_EXPR_LIST (VOIDmode,
6616 gen_rtx_REG (SFmode,
6617 SSE_REGNO (sse_regno)),
6618 GEN_INT (i*8));
6619 sse_regno++;
6620 break;
6621 case X86_64_SSEDF_CLASS:
6622 exp [nexps++]
6623 = gen_rtx_EXPR_LIST (VOIDmode,
6624 gen_rtx_REG (DFmode,
6625 SSE_REGNO (sse_regno)),
6626 GEN_INT (i*8));
6627 sse_regno++;
6628 break;
6629 case X86_64_SSE_CLASS:
6630 pos = i;
6631 switch (n)
6632 {
6633 case 1:
6634 tmpmode = DImode;
6635 break;
6636 case 2:
6637 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6638 {
6639 tmpmode = TImode;
6640 i++;
6641 }
6642 else
6643 tmpmode = DImode;
6644 break;
6645 case 4:
6646 gcc_assert (i == 0
6647 && regclass[1] == X86_64_SSEUP_CLASS
6648 && regclass[2] == X86_64_SSEUP_CLASS
6649 && regclass[3] == X86_64_SSEUP_CLASS);
6650 tmpmode = OImode;
6651 i += 3;
6652 break;
6653 default:
6654 gcc_unreachable ();
6655 }
6656 exp [nexps++]
6657 = gen_rtx_EXPR_LIST (VOIDmode,
6658 gen_rtx_REG (tmpmode,
6659 SSE_REGNO (sse_regno)),
6660 GEN_INT (pos*8));
6661 sse_regno++;
6662 break;
6663 default:
6664 gcc_unreachable ();
6665 }
6666 }
6667
6668 /* Empty aligned struct, union or class. */
6669 if (nexps == 0)
6670 return NULL;
6671
6672 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6673 for (i = 0; i < nexps; i++)
6674 XVECEXP (ret, 0, i) = exp [i];
6675 return ret;
6676 }
6677
6678 /* Update the data in CUM to advance over an argument of mode MODE
6679 and data type TYPE. (TYPE is null for libcalls where that information
6680 may not be available.) */
6681
6682 static void
6683 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6684 const_tree type, HOST_WIDE_INT bytes,
6685 HOST_WIDE_INT words)
6686 {
6687 switch (mode)
6688 {
6689 default:
6690 break;
6691
6692 case BLKmode:
6693 if (bytes < 0)
6694 break;
6695 /* FALLTHRU */
6696
6697 case DImode:
6698 case SImode:
6699 case HImode:
6700 case QImode:
6701 cum->words += words;
6702 cum->nregs -= words;
6703 cum->regno += words;
6704
6705 if (cum->nregs <= 0)
6706 {
6707 cum->nregs = 0;
6708 cum->regno = 0;
6709 }
6710 break;
6711
6712 case OImode:
6713 /* OImode shouldn't be used directly. */
6714 gcc_unreachable ();
6715
6716 case DFmode:
6717 if (cum->float_in_sse < 2)
6718 break;
6719 case SFmode:
6720 if (cum->float_in_sse < 1)
6721 break;
6722 /* FALLTHRU */
6723
6724 case V8SFmode:
6725 case V8SImode:
6726 case V32QImode:
6727 case V16HImode:
6728 case V4DFmode:
6729 case V4DImode:
6730 case TImode:
6731 case V16QImode:
6732 case V8HImode:
6733 case V4SImode:
6734 case V2DImode:
6735 case V4SFmode:
6736 case V2DFmode:
6737 if (!type || !AGGREGATE_TYPE_P (type))
6738 {
6739 cum->sse_words += words;
6740 cum->sse_nregs -= 1;
6741 cum->sse_regno += 1;
6742 if (cum->sse_nregs <= 0)
6743 {
6744 cum->sse_nregs = 0;
6745 cum->sse_regno = 0;
6746 }
6747 }
6748 break;
6749
6750 case V8QImode:
6751 case V4HImode:
6752 case V2SImode:
6753 case V2SFmode:
6754 case V1TImode:
6755 case V1DImode:
6756 if (!type || !AGGREGATE_TYPE_P (type))
6757 {
6758 cum->mmx_words += words;
6759 cum->mmx_nregs -= 1;
6760 cum->mmx_regno += 1;
6761 if (cum->mmx_nregs <= 0)
6762 {
6763 cum->mmx_nregs = 0;
6764 cum->mmx_regno = 0;
6765 }
6766 }
6767 break;
6768 }
6769 }
6770
6771 static void
6772 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6773 const_tree type, HOST_WIDE_INT words, bool named)
6774 {
6775 int int_nregs, sse_nregs;
6776
6777 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6778 if (!named && VALID_AVX256_REG_MODE (mode))
6779 return;
6780
6781 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6782 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6783 {
6784 cum->nregs -= int_nregs;
6785 cum->sse_nregs -= sse_nregs;
6786 cum->regno += int_nregs;
6787 cum->sse_regno += sse_nregs;
6788 }
6789 else
6790 {
6791 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6792 cum->words = (cum->words + align - 1) & ~(align - 1);
6793 cum->words += words;
6794 }
6795 }
6796
6797 static void
6798 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6799 HOST_WIDE_INT words)
6800 {
6801 /* Otherwise, this should be passed indirectly. */
6802 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6803
6804 cum->words += words;
6805 if (cum->nregs > 0)
6806 {
6807 cum->nregs -= 1;
6808 cum->regno += 1;
6809 }
6810 }
6811
6812 /* Update the data in CUM to advance over an argument of mode MODE and
6813 data type TYPE. (TYPE is null for libcalls where that information
6814 may not be available.) */
6815
6816 static void
6817 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6818 const_tree type, bool named)
6819 {
6820 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6821 HOST_WIDE_INT bytes, words;
6822
6823 if (mode == BLKmode)
6824 bytes = int_size_in_bytes (type);
6825 else
6826 bytes = GET_MODE_SIZE (mode);
6827 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6828
6829 if (type)
6830 mode = type_natural_mode (type, NULL);
6831
6832 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6833 function_arg_advance_ms_64 (cum, bytes, words);
6834 else if (TARGET_64BIT)
6835 function_arg_advance_64 (cum, mode, type, words, named);
6836 else
6837 function_arg_advance_32 (cum, mode, type, bytes, words);
6838 }
6839
6840 /* Define where to put the arguments to a function.
6841 Value is zero to push the argument on the stack,
6842 or a hard register in which to store the argument.
6843
6844 MODE is the argument's machine mode.
6845 TYPE is the data type of the argument (as a tree).
6846 This is null for libcalls where that information may
6847 not be available.
6848 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6849 the preceding args and about the function being called.
6850 NAMED is nonzero if this argument is a named parameter
6851 (otherwise it is an extra parameter matching an ellipsis). */
6852
6853 static rtx
6854 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6855 enum machine_mode orig_mode, const_tree type,
6856 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6857 {
6858 static bool warnedsse, warnedmmx;
6859
6860 /* Avoid the AL settings for the Unix64 ABI. */
6861 if (mode == VOIDmode)
6862 return constm1_rtx;
6863
6864 switch (mode)
6865 {
6866 default:
6867 break;
6868
6869 case BLKmode:
6870 if (bytes < 0)
6871 break;
6872 /* FALLTHRU */
6873 case DImode:
6874 case SImode:
6875 case HImode:
6876 case QImode:
6877 if (words <= cum->nregs)
6878 {
6879 int regno = cum->regno;
6880
6881 /* Fastcall allocates the first two DWORD (SImode) or
6882 smaller arguments to ECX and EDX, provided the argument
6883 isn't an aggregate type.  */
6884 if (cum->fastcall)
6885 {
6886 if (mode == BLKmode
6887 || mode == DImode
6888 || (type && AGGREGATE_TYPE_P (type)))
6889 break;
6890
6891 /* ECX, not EAX, is the first allocated register. */
6892 if (regno == AX_REG)
6893 regno = CX_REG;
6894 }
6895 return gen_rtx_REG (mode, regno);
6896 }
6897 break;
6898
6899 case DFmode:
6900 if (cum->float_in_sse < 2)
6901 break;
6902 case SFmode:
6903 if (cum->float_in_sse < 1)
6904 break;
6905 /* FALLTHRU */
6906 case TImode:
6907 /* In 32bit, we pass TImode in xmm registers. */
6908 case V16QImode:
6909 case V8HImode:
6910 case V4SImode:
6911 case V2DImode:
6912 case V4SFmode:
6913 case V2DFmode:
6914 if (!type || !AGGREGATE_TYPE_P (type))
6915 {
6916 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6917 {
6918 warnedsse = true;
6919 warning (0, "SSE vector argument without SSE enabled "
6920 "changes the ABI");
6921 }
6922 if (cum->sse_nregs)
6923 return gen_reg_or_parallel (mode, orig_mode,
6924 cum->sse_regno + FIRST_SSE_REG);
6925 }
6926 break;
6927
6928 case OImode:
6929 /* OImode shouldn't be used directly. */
6930 gcc_unreachable ();
6931
6932 case V8SFmode:
6933 case V8SImode:
6934 case V32QImode:
6935 case V16HImode:
6936 case V4DFmode:
6937 case V4DImode:
6938 if (!type || !AGGREGATE_TYPE_P (type))
6939 {
6940 if (cum->sse_nregs)
6941 return gen_reg_or_parallel (mode, orig_mode,
6942 cum->sse_regno + FIRST_SSE_REG);
6943 }
6944 break;
6945
6946 case V8QImode:
6947 case V4HImode:
6948 case V2SImode:
6949 case V2SFmode:
6950 case V1TImode:
6951 case V1DImode:
6952 if (!type || !AGGREGATE_TYPE_P (type))
6953 {
6954 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6955 {
6956 warnedmmx = true;
6957 warning (0, "MMX vector argument without MMX enabled "
6958 "changes the ABI");
6959 }
6960 if (cum->mmx_nregs)
6961 return gen_reg_or_parallel (mode, orig_mode,
6962 cum->mmx_regno + FIRST_MMX_REG);
6963 }
6964 break;
6965 }
6966
6967 return NULL_RTX;
6968 }
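
/* As an illustrative example of the fastcall handling above (a sketch, not
   compiled here): for

     int __attribute__((fastcall)) f (int a, int b, int c);

   the first two integer arguments A and B go in %ecx and %edx, while C is
   pushed on the stack; DImode and aggregate arguments never use the
   fastcall registers.  */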
6969
6970 static rtx
6971 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6972 enum machine_mode orig_mode, const_tree type, bool named)
6973 {
6974 /* Handle a hidden AL argument containing the number of registers
6975 for varargs x86-64 functions. */
6976 if (mode == VOIDmode)
6977 return GEN_INT (cum->maybe_vaarg
6978 ? (cum->sse_nregs < 0
6979 ? X86_64_SSE_REGPARM_MAX
6980 : cum->sse_regno)
6981 : -1);
6982
6983 switch (mode)
6984 {
6985 default:
6986 break;
6987
6988 case V8SFmode:
6989 case V8SImode:
6990 case V32QImode:
6991 case V16HImode:
6992 case V4DFmode:
6993 case V4DImode:
6994 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6995 if (!named)
6996 return NULL;
6997 break;
6998 }
6999
7000 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7001 cum->sse_nregs,
7002 &x86_64_int_parameter_registers [cum->regno],
7003 cum->sse_regno);
7004 }
7005
7006 static rtx
7007 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7008 enum machine_mode orig_mode, bool named,
7009 HOST_WIDE_INT bytes)
7010 {
7011 unsigned int regno;
7012
7013 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7014 We use the value -2 to specify that the current function call uses the MS ABI. */
7015 if (mode == VOIDmode)
7016 return GEN_INT (-2);
7017
7018 /* If we've run out of registers, it goes on the stack. */
7019 if (cum->nregs == 0)
7020 return NULL_RTX;
7021
7022 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7023
7024 /* Only floating point modes are passed in anything but integer regs. */
7025 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7026 {
7027 if (named)
7028 regno = cum->regno + FIRST_SSE_REG;
7029 else
7030 {
7031 rtx t1, t2;
7032
7033 /* Unnamed floating parameters are passed in both the
7034 SSE and integer registers. */
7035 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7036 t2 = gen_rtx_REG (mode, regno);
7037 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7038 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7039 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7040 }
7041 }
7042 /* Handle aggregate types passed in registers. */
7043 if (orig_mode == BLKmode)
7044 {
7045 if (bytes > 0 && bytes <= 8)
7046 mode = (bytes > 4 ? DImode : SImode);
7047 if (mode == BLKmode)
7048 mode = DImode;
7049 }
7050
7051 return gen_reg_or_parallel (mode, orig_mode, regno);
7052 }
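
/* Illustrative example for the MS ABI handling above (a sketch under the
   usual Windows x64 convention): for a variadic callee such as

     int f (const char *fmt, ...);
     f ("%f", 1.5);

   the unnamed double 1.5 occupies the second argument slot, so it is passed
   both in %xmm1 and in %rdx, matching the two-element PARALLEL built above
   for unnamed floating arguments.  */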
7053
7054 /* Return where to put the arguments to a function.
7055 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7056
7057 MODE is the argument's machine mode. TYPE is the data type of the
7058 argument. It is null for libcalls where that information may not be
7059 available. CUM gives information about the preceding args and about
7060 the function being called. NAMED is nonzero if this argument is a
7061 named parameter (otherwise it is an extra parameter matching an
7062 ellipsis). */
7063
7064 static rtx
7065 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7066 const_tree type, bool named)
7067 {
7068 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7069 enum machine_mode mode = omode;
7070 HOST_WIDE_INT bytes, words;
7071 rtx arg;
7072
7073 if (mode == BLKmode)
7074 bytes = int_size_in_bytes (type);
7075 else
7076 bytes = GET_MODE_SIZE (mode);
7077 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7078
7079 /* To simplify the code below, represent vector types with a vector mode
7080 even if MMX/SSE are not active. */
7081 if (type && TREE_CODE (type) == VECTOR_TYPE)
7082 mode = type_natural_mode (type, cum);
7083
7084 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7085 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7086 else if (TARGET_64BIT)
7087 arg = function_arg_64 (cum, mode, omode, type, named);
7088 else
7089 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7090
7091 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7092 {
7093 /* This argument uses 256bit AVX modes. */
7094 if (cum->caller)
7095 cfun->machine->callee_pass_avx256_p = true;
7096 else
7097 cfun->machine->caller_pass_avx256_p = true;
7098 }
7099
7100 return arg;
7101 }
7102
7103 /* A C expression that indicates when an argument must be passed by
7104 reference. If nonzero for an argument, a copy of that argument is
7105 made in memory and a pointer to the argument is passed instead of
7106 the argument itself. The pointer is passed in whatever way is
7107 appropriate for passing a pointer to that type. */
7108
7109 static bool
7110 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7111 enum machine_mode mode ATTRIBUTE_UNUSED,
7112 const_tree type, bool named ATTRIBUTE_UNUSED)
7113 {
7114 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7115
7116 /* See Windows x64 Software Convention. */
7117 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7118 {
7119 int msize = (int) GET_MODE_SIZE (mode);
7120 if (type)
7121 {
7122 /* Arrays are passed by reference. */
7123 if (TREE_CODE (type) == ARRAY_TYPE)
7124 return true;
7125
7126 if (AGGREGATE_TYPE_P (type))
7127 {
7128 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7129 are passed by reference. */
7130 msize = int_size_in_bytes (type);
7131 }
7132 }
7133
7134 /* __m128 is passed by reference. */
7135 switch (msize) {
7136 case 1: case 2: case 4: case 8:
7137 break;
7138 default:
7139 return true;
7140 }
7141 }
7142 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7143 return true;
7144
7145 return false;
7146 }
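
/* To illustrate the Windows x64 rule implemented above (a sketch, not
   compiled here): only arguments whose size is exactly 1, 2, 4 or 8 bytes
   are passed by value; everything else goes by reference.

     struct s8  { int a, b;    };   //  8 bytes -> by value
     struct s12 { int a, b, c; };   // 12 bytes -> by reference
     __m128 v;                      // 16 bytes -> by reference

   Arrays, in languages that pass them by value, also go by reference.  */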
7147
7148 /* Return true when TYPE should be 128bit aligned for 32bit argument
7149 passing ABI. XXX: This function is obsolete and is only used for
7150 checking psABI compatibility with previous versions of GCC. */
7151
7152 static bool
7153 ix86_compat_aligned_value_p (const_tree type)
7154 {
7155 enum machine_mode mode = TYPE_MODE (type);
7156 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7157 || mode == TDmode
7158 || mode == TFmode
7159 || mode == TCmode)
7160 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7161 return true;
7162 if (TYPE_ALIGN (type) < 128)
7163 return false;
7164
7165 if (AGGREGATE_TYPE_P (type))
7166 {
7167 /* Walk the aggregates recursively. */
7168 switch (TREE_CODE (type))
7169 {
7170 case RECORD_TYPE:
7171 case UNION_TYPE:
7172 case QUAL_UNION_TYPE:
7173 {
7174 tree field;
7175
7176 /* Walk all the structure fields. */
7177 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7178 {
7179 if (TREE_CODE (field) == FIELD_DECL
7180 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7181 return true;
7182 }
7183 break;
7184 }
7185
7186 case ARRAY_TYPE:
7187 /* Just in case some languages pass arrays by value. */
7188 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7189 return true;
7190 break;
7191
7192 default:
7193 gcc_unreachable ();
7194 }
7195 }
7196 return false;
7197 }
7198
7199 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7200 XXX: This function is obsolete and is only used for checking psABI
7201 compatibility with previous versions of GCC. */
7202
7203 static unsigned int
7204 ix86_compat_function_arg_boundary (enum machine_mode mode,
7205 const_tree type, unsigned int align)
7206 {
7207 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7208 natural boundaries. */
7209 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7210 {
7211 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7212 make an exception for SSE modes since these require 128bit
7213 alignment.
7214
7215 The handling here differs from field_alignment. ICC aligns MMX
7216 arguments to 4 byte boundaries, while structure fields are aligned
7217 to 8 byte boundaries. */
7218 if (!type)
7219 {
7220 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7221 align = PARM_BOUNDARY;
7222 }
7223 else
7224 {
7225 if (!ix86_compat_aligned_value_p (type))
7226 align = PARM_BOUNDARY;
7227 }
7228 }
7229 if (align > BIGGEST_ALIGNMENT)
7230 align = BIGGEST_ALIGNMENT;
7231 return align;
7232 }
7233
7234 /* Return true when TYPE should be 128bit aligned for 32bit argument
7235 passing ABI. */
7236
7237 static bool
7238 ix86_contains_aligned_value_p (const_tree type)
7239 {
7240 enum machine_mode mode = TYPE_MODE (type);
7241
7242 if (mode == XFmode || mode == XCmode)
7243 return false;
7244
7245 if (TYPE_ALIGN (type) < 128)
7246 return false;
7247
7248 if (AGGREGATE_TYPE_P (type))
7249 {
7250 /* Walk the aggregates recursively. */
7251 switch (TREE_CODE (type))
7252 {
7253 case RECORD_TYPE:
7254 case UNION_TYPE:
7255 case QUAL_UNION_TYPE:
7256 {
7257 tree field;
7258
7259 /* Walk all the structure fields. */
7260 for (field = TYPE_FIELDS (type);
7261 field;
7262 field = DECL_CHAIN (field))
7263 {
7264 if (TREE_CODE (field) == FIELD_DECL
7265 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7266 return true;
7267 }
7268 break;
7269 }
7270
7271 case ARRAY_TYPE:
7272 /* Just in case some languages pass arrays by value. */
7273 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7274 return true;
7275 break;
7276
7277 default:
7278 gcc_unreachable ();
7279 }
7280 }
7281 else
7282 return TYPE_ALIGN (type) >= 128;
7283
7284 return false;
7285 }
7286
7287 /* Gives the alignment boundary, in bits, of an argument with the
7288 specified mode and type. */
7289
7290 static unsigned int
7291 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7292 {
7293 unsigned int align;
7294 if (type)
7295 {
7296 /* Since the main variant type is used for the call, convert the type
7297 to its main variant. */
7298 type = TYPE_MAIN_VARIANT (type);
7299 align = TYPE_ALIGN (type);
7300 }
7301 else
7302 align = GET_MODE_ALIGNMENT (mode);
7303 if (align < PARM_BOUNDARY)
7304 align = PARM_BOUNDARY;
7305 else
7306 {
7307 static bool warned;
7308 unsigned int saved_align = align;
7309
7310 if (!TARGET_64BIT)
7311 {
7312 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7313 if (!type)
7314 {
7315 if (mode == XFmode || mode == XCmode)
7316 align = PARM_BOUNDARY;
7317 }
7318 else if (!ix86_contains_aligned_value_p (type))
7319 align = PARM_BOUNDARY;
7320
7321 if (align < 128)
7322 align = PARM_BOUNDARY;
7323 }
7324
7325 if (warn_psabi
7326 && !warned
7327 && align != ix86_compat_function_arg_boundary (mode, type,
7328 saved_align))
7329 {
7330 warned = true;
7331 inform (input_location,
7332 "The ABI for passing parameters with %d-byte"
7333 " alignment has changed in GCC 4.6",
7334 align / BITS_PER_UNIT);
7335 }
7336 }
7337
7338 return align;
7339 }
7340
7341 /* Return true if N is a possible register number of function value. */
7342
7343 static bool
7344 ix86_function_value_regno_p (const unsigned int regno)
7345 {
7346 switch (regno)
7347 {
7348 case AX_REG:
7349 return true;
7350
7351 case FIRST_FLOAT_REG:
7352 /* TODO: The function should depend on current function ABI but
7353 builtins.c would need updating then. Therefore we use the
7354 default ABI. */
7355 if (TARGET_64BIT && ix86_abi == MS_ABI)
7356 return false;
7357 return TARGET_FLOAT_RETURNS_IN_80387;
7358
7359 case FIRST_SSE_REG:
7360 return TARGET_SSE;
7361
7362 case FIRST_MMX_REG:
7363 if (TARGET_MACHO || TARGET_64BIT)
7364 return false;
7365 return TARGET_MMX;
7366 }
7367
7368 return false;
7369 }
7370
7371 /* Define how to find the value returned by a function.
7372 VALTYPE is the data type of the value (as a tree).
7373 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7374 otherwise, FUNC is 0. */
7375
7376 static rtx
7377 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7378 const_tree fntype, const_tree fn)
7379 {
7380 unsigned int regno;
7381
7382 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7383 we normally prevent this case when mmx is not available.  However,
7384 some ABIs may require the result to be returned as if it were DImode.
7385 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7386 regno = FIRST_MMX_REG;
7387
7388 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7389 we prevent this case when sse is not available.  However, some ABIs
7390 may require the result to be returned as if it were integer TImode.
7391 else if (mode == TImode
7392 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7393 regno = FIRST_SSE_REG;
7394
7395 /* 32-byte vector modes in %ymm0. */
7396 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7397 regno = FIRST_SSE_REG;
7398
7399 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7400 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7401 regno = FIRST_FLOAT_REG;
7402 else
7403 /* Most things go in %eax. */
7404 regno = AX_REG;
7405
7406 /* Override FP return register with %xmm0 for local functions when
7407 SSE math is enabled or for functions with sseregparm attribute. */
7408 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7409 {
7410 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7411 if ((sse_level >= 1 && mode == SFmode)
7412 || (sse_level == 2 && mode == DFmode))
7413 regno = FIRST_SSE_REG;
7414 }
7415
7416 /* OImode shouldn't be used directly. */
7417 gcc_assert (mode != OImode);
7418
7419 return gen_rtx_REG (orig_mode, regno);
7420 }
7421
7422 static rtx
7423 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7424 const_tree valtype)
7425 {
7426 rtx ret;
7427
7428 /* Handle libcalls, which don't provide a type node. */
7429 if (valtype == NULL)
7430 {
7431 unsigned int regno;
7432
7433 switch (mode)
7434 {
7435 case SFmode:
7436 case SCmode:
7437 case DFmode:
7438 case DCmode:
7439 case TFmode:
7440 case SDmode:
7441 case DDmode:
7442 case TDmode:
7443 regno = FIRST_SSE_REG;
7444 break;
7445 case XFmode:
7446 case XCmode:
7447 regno = FIRST_FLOAT_REG;
7448 break;
7449 case TCmode:
7450 return NULL;
7451 default:
7452 regno = AX_REG;
7453 }
7454
7455 return gen_rtx_REG (mode, regno);
7456 }
7457 else if (POINTER_TYPE_P (valtype))
7458 {
7459 /* Pointers are always returned in word_mode. */
7460 mode = word_mode;
7461 }
7462
7463 ret = construct_container (mode, orig_mode, valtype, 1,
7464 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7465 x86_64_int_return_registers, 0);
7466
7467 /* For zero-sized structures, construct_container returns NULL, but we
7468 need to keep the rest of the compiler happy by returning a meaningful value. */
7469 if (!ret)
7470 ret = gen_rtx_REG (orig_mode, AX_REG);
7471
7472 return ret;
7473 }
7474
7475 static rtx
7476 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7477 {
7478 unsigned int regno = AX_REG;
7479
7480 if (TARGET_SSE)
7481 {
7482 switch (GET_MODE_SIZE (mode))
7483 {
7484 case 16:
7485 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7486 && !COMPLEX_MODE_P (mode))
7487 regno = FIRST_SSE_REG;
7488 break;
7489 case 8:
7490 case 4:
7491 if (mode == SFmode || mode == DFmode)
7492 regno = FIRST_SSE_REG;
7493 break;
7494 default:
7495 break;
7496 }
7497 }
7498 return gen_rtx_REG (orig_mode, regno);
7499 }
7500
7501 static rtx
7502 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7503 enum machine_mode orig_mode, enum machine_mode mode)
7504 {
7505 const_tree fn, fntype;
7506
7507 fn = NULL_TREE;
7508 if (fntype_or_decl && DECL_P (fntype_or_decl))
7509 fn = fntype_or_decl;
7510 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7511
7512 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7513 return function_value_ms_64 (orig_mode, mode);
7514 else if (TARGET_64BIT)
7515 return function_value_64 (orig_mode, mode, valtype);
7516 else
7517 return function_value_32 (orig_mode, mode, fntype, fn);
7518 }
7519
7520 static rtx
7521 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7522 bool outgoing ATTRIBUTE_UNUSED)
7523 {
7524 enum machine_mode mode, orig_mode;
7525
7526 orig_mode = TYPE_MODE (valtype);
7527 mode = type_natural_mode (valtype, NULL);
7528 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7529 }
7530
7531 /* Pointer function arguments and return values are promoted to
7532 word_mode. */
7533
7534 static enum machine_mode
7535 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7536 int *punsignedp, const_tree fntype,
7537 int for_return)
7538 {
7539 if (type != NULL_TREE && POINTER_TYPE_P (type))
7540 {
7541 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7542 return word_mode;
7543 }
7544 return default_promote_function_mode (type, mode, punsignedp, fntype,
7545 for_return);
7546 }
7547
7548 /* Return true if a structure, union or array with MODE containing FIELD
7549 should be accessed using BLKmode. */
7550
7551 static bool
7552 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7553 {
7554 /* Union with XFmode must be in BLKmode. */
7555 return (mode == XFmode
7556 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7557 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7558 }
7559
7560 rtx
7561 ix86_libcall_value (enum machine_mode mode)
7562 {
7563 return ix86_function_value_1 (NULL, NULL, mode, mode);
7564 }
7565
7566 /* Return true iff type is returned in memory. */
7567
7568 static bool ATTRIBUTE_UNUSED
7569 return_in_memory_32 (const_tree type, enum machine_mode mode)
7570 {
7571 HOST_WIDE_INT size;
7572
7573 if (mode == BLKmode)
7574 return true;
7575
7576 size = int_size_in_bytes (type);
7577
7578 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7579 return false;
7580
7581 if (VECTOR_MODE_P (mode) || mode == TImode)
7582 {
7583 /* User-created vectors small enough to fit in EAX. */
7584 if (size < 8)
7585 return false;
7586
7587 /* MMX/3dNow values are returned in MM0,
7588 except when it doesn't exist or the ABI prescribes otherwise. */
7589 if (size == 8)
7590 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7591
7592 /* SSE values are returned in XMM0, except when it doesn't exist. */
7593 if (size == 16)
7594 return !TARGET_SSE;
7595
7596 /* AVX values are returned in YMM0, except when it doesn't exist. */
7597 if (size == 32)
7598 return !TARGET_AVX;
7599 }
7600
7601 if (mode == XFmode)
7602 return false;
7603
7604 if (size > 12)
7605 return true;
7606
7607 /* OImode shouldn't be used directly. */
7608 gcc_assert (mode != OImode);
7609
7610 return false;
7611 }
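
/* To illustrate the 32-bit rules above (an informal sketch, assuming
   default options): an aggregate of at most 8 bytes may come back in
   registers when MS_AGGREGATE_RETURN holds; an 8-byte vector comes back in
   %mm0 only if MMX is enabled and TARGET_VECT8_RETURNS is not; a 16-byte
   vector needs SSE to come back in %xmm0; XFmode long double is returned in
   %st(0) rather than memory; and anything else larger than 12 bytes goes to
   memory.  */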
7612
7613 static bool ATTRIBUTE_UNUSED
7614 return_in_memory_64 (const_tree type, enum machine_mode mode)
7615 {
7616 int needed_intregs, needed_sseregs;
7617 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7618 }
7619
7620 static bool ATTRIBUTE_UNUSED
7621 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7622 {
7623 HOST_WIDE_INT size = int_size_in_bytes (type);
7624
7625 /* __m128 is returned in xmm0. */
7626 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7627 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7628 return false;
7629
7630 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7631 return size != 1 && size != 2 && size != 4 && size != 8;
7632 }
7633
7634 static bool
7635 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7636 {
7637 #ifdef SUBTARGET_RETURN_IN_MEMORY
7638 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7639 #else
7640 const enum machine_mode mode = type_natural_mode (type, NULL);
7641
7642 if (TARGET_64BIT)
7643 {
7644 if (ix86_function_type_abi (fntype) == MS_ABI)
7645 return return_in_memory_ms_64 (type, mode);
7646 else
7647 return return_in_memory_64 (type, mode);
7648 }
7649 else
7650 return return_in_memory_32 (type, mode);
7651 #endif
7652 }
7653
7654 /* When returning SSE vector types, we have a choice of either
7655 (1) being abi incompatible with a -march switch, or
7656 (2) generating an error.
7657 Given no good solution, I think the safest thing is one warning.
7658 The user won't be able to use -Werror, but....
7659
7660 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7661 called in response to actually generating a caller or callee that
7662 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7663 via aggregate_value_p for general type probing from tree-ssa. */
7664
7665 static rtx
7666 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7667 {
7668 static bool warnedsse, warnedmmx;
7669
7670 if (!TARGET_64BIT && type)
7671 {
7672 /* Look at the return type of the function, not the function type. */
7673 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7674
7675 if (!TARGET_SSE && !warnedsse)
7676 {
7677 if (mode == TImode
7678 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7679 {
7680 warnedsse = true;
7681 warning (0, "SSE vector return without SSE enabled "
7682 "changes the ABI");
7683 }
7684 }
7685
7686 if (!TARGET_MMX && !warnedmmx)
7687 {
7688 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7689 {
7690 warnedmmx = true;
7691 warning (0, "MMX vector return without MMX enabled "
7692 "changes the ABI");
7693 }
7694 }
7695 }
7696
7697 return NULL;
7698 }
7699
7700 \f
7701 /* Create the va_list data type. */
7702
7703 /* Returns the calling-convention-specific va_list data type.
7704 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7705
7706 static tree
7707 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7708 {
7709 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7710
7711 /* For i386 we use plain pointer to argument area. */
7712 if (!TARGET_64BIT || abi == MS_ABI)
7713 return build_pointer_type (char_type_node);
7714
7715 record = lang_hooks.types.make_type (RECORD_TYPE);
7716 type_decl = build_decl (BUILTINS_LOCATION,
7717 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7718
7719 f_gpr = build_decl (BUILTINS_LOCATION,
7720 FIELD_DECL, get_identifier ("gp_offset"),
7721 unsigned_type_node);
7722 f_fpr = build_decl (BUILTINS_LOCATION,
7723 FIELD_DECL, get_identifier ("fp_offset"),
7724 unsigned_type_node);
7725 f_ovf = build_decl (BUILTINS_LOCATION,
7726 FIELD_DECL, get_identifier ("overflow_arg_area"),
7727 ptr_type_node);
7728 f_sav = build_decl (BUILTINS_LOCATION,
7729 FIELD_DECL, get_identifier ("reg_save_area"),
7730 ptr_type_node);
7731
7732 va_list_gpr_counter_field = f_gpr;
7733 va_list_fpr_counter_field = f_fpr;
7734
7735 DECL_FIELD_CONTEXT (f_gpr) = record;
7736 DECL_FIELD_CONTEXT (f_fpr) = record;
7737 DECL_FIELD_CONTEXT (f_ovf) = record;
7738 DECL_FIELD_CONTEXT (f_sav) = record;
7739
7740 TYPE_STUB_DECL (record) = type_decl;
7741 TYPE_NAME (record) = type_decl;
7742 TYPE_FIELDS (record) = f_gpr;
7743 DECL_CHAIN (f_gpr) = f_fpr;
7744 DECL_CHAIN (f_fpr) = f_ovf;
7745 DECL_CHAIN (f_ovf) = f_sav;
7746
7747 layout_type (record);
7748
7749 /* The correct type is an array type of one element. */
7750 return build_array_type (record, build_index_type (size_zero_node));
7751 }
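
/* The record built above corresponds to the familiar SysV x86-64 va_list
   layout; roughly (an illustrative C rendering, not part of this file's
   logic):

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
*/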
7752
7753 /* Set up the builtin va_list data type and, for 64-bit, the additional
7754 calling-convention-specific va_list data types. */
7755
7756 static tree
7757 ix86_build_builtin_va_list (void)
7758 {
7759 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7760
7761 /* Initialize abi specific va_list builtin types. */
7762 if (TARGET_64BIT)
7763 {
7764 tree t;
7765 if (ix86_abi == MS_ABI)
7766 {
7767 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7768 if (TREE_CODE (t) != RECORD_TYPE)
7769 t = build_variant_type_copy (t);
7770 sysv_va_list_type_node = t;
7771 }
7772 else
7773 {
7774 t = ret;
7775 if (TREE_CODE (t) != RECORD_TYPE)
7776 t = build_variant_type_copy (t);
7777 sysv_va_list_type_node = t;
7778 }
7779 if (ix86_abi != MS_ABI)
7780 {
7781 t = ix86_build_builtin_va_list_abi (MS_ABI);
7782 if (TREE_CODE (t) != RECORD_TYPE)
7783 t = build_variant_type_copy (t);
7784 ms_va_list_type_node = t;
7785 }
7786 else
7787 {
7788 t = ret;
7789 if (TREE_CODE (t) != RECORD_TYPE)
7790 t = build_variant_type_copy (t);
7791 ms_va_list_type_node = t;
7792 }
7793 }
7794
7795 return ret;
7796 }
7797
7798 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7799
7800 static void
7801 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7802 {
7803 rtx save_area, mem;
7804 alias_set_type set;
7805 int i, max;
7806
7807 /* GPR size of varargs save area. */
7808 if (cfun->va_list_gpr_size)
7809 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7810 else
7811 ix86_varargs_gpr_size = 0;
7812
7813 /* FPR size of varargs save area. We don't need it if we don't pass
7814 anything in SSE registers. */
7815 if (TARGET_SSE && cfun->va_list_fpr_size)
7816 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7817 else
7818 ix86_varargs_fpr_size = 0;
7819
7820 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7821 return;
7822
7823 save_area = frame_pointer_rtx;
7824 set = get_varargs_alias_set ();
7825
7826 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7827 if (max > X86_64_REGPARM_MAX)
7828 max = X86_64_REGPARM_MAX;
7829
7830 for (i = cum->regno; i < max; i++)
7831 {
7832 mem = gen_rtx_MEM (word_mode,
7833 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7834 MEM_NOTRAP_P (mem) = 1;
7835 set_mem_alias_set (mem, set);
7836 emit_move_insn (mem,
7837 gen_rtx_REG (word_mode,
7838 x86_64_int_parameter_registers[i]));
7839 }
7840
7841 if (ix86_varargs_fpr_size)
7842 {
7843 enum machine_mode smode;
7844 rtx label, test;
7845
7846 /* Now emit code to save SSE registers.  The AX parameter contains the number
7847 of SSE parameter registers used to call this function, though all we
7848 actually check here is the zero/non-zero status. */
7849
7850 label = gen_label_rtx ();
7851 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7852 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7853 label));
7854
7855 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7856 we used movdqa (i.e. TImode) instead? Perhaps even better would
7857 be if we could determine the real mode of the data, via a hook
7858 into pass_stdarg. Ignore all that for now. */
7859 smode = V4SFmode;
7860 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7861 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7862
7863 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7864 if (max > X86_64_SSE_REGPARM_MAX)
7865 max = X86_64_SSE_REGPARM_MAX;
7866
7867 for (i = cum->sse_regno; i < max; ++i)
7868 {
7869 mem = plus_constant (Pmode, save_area,
7870 i * 16 + ix86_varargs_gpr_size);
7871 mem = gen_rtx_MEM (smode, mem);
7872 MEM_NOTRAP_P (mem) = 1;
7873 set_mem_alias_set (mem, set);
7874 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7875
7876 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7877 }
7878
7879 emit_label (label);
7880 }
7881 }
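
/* The register save area laid out above has the usual SysV x86-64 shape
   (illustrative summary): six 8-byte slots for %rdi, %rsi, %rdx, %rcx, %r8
   and %r9 at offsets 0..40, followed by eight 16-byte slots for
   %xmm0..%xmm7 at offsets 48..160, for 176 bytes in total; the SSE part is
   skipped at run time when %al is zero.  */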
7882
7883 static void
7884 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7885 {
7886 alias_set_type set = get_varargs_alias_set ();
7887 int i;
7888
7889 /* Reset to zero, as there might have been a SYSV va_arg used
7890 before.  */
7891 ix86_varargs_gpr_size = 0;
7892 ix86_varargs_fpr_size = 0;
7893
7894 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7895 {
7896 rtx reg, mem;
7897
7898 mem = gen_rtx_MEM (Pmode,
7899 plus_constant (Pmode, virtual_incoming_args_rtx,
7900 i * UNITS_PER_WORD));
7901 MEM_NOTRAP_P (mem) = 1;
7902 set_mem_alias_set (mem, set);
7903
7904 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7905 emit_move_insn (mem, reg);
7906 }
7907 }
7908
7909 static void
7910 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7911 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7912 int no_rtl)
7913 {
7914 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7915 CUMULATIVE_ARGS next_cum;
7916 tree fntype;
7917
7918 /* This argument doesn't appear to be used anymore, which is good,
7919 because the old code here didn't suppress rtl generation. */
7920 gcc_assert (!no_rtl);
7921
7922 if (!TARGET_64BIT)
7923 return;
7924
7925 fntype = TREE_TYPE (current_function_decl);
7926
7927 /* For varargs, we do not want to skip the dummy va_dcl argument.
7928 For stdargs, we do want to skip the last named argument. */
7929 next_cum = *cum;
7930 if (stdarg_p (fntype))
7931 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7932 true);
7933
7934 if (cum->call_abi == MS_ABI)
7935 setup_incoming_varargs_ms_64 (&next_cum);
7936 else
7937 setup_incoming_varargs_64 (&next_cum);
7938 }
7939
7940 /* Check whether TYPE is a va_list that is just a plain char pointer. */
7941
7942 static bool
7943 is_va_list_char_pointer (tree type)
7944 {
7945 tree canonic;
7946
7947 /* For 32-bit it is always true. */
7948 if (!TARGET_64BIT)
7949 return true;
7950 canonic = ix86_canonical_va_list_type (type);
7951 return (canonic == ms_va_list_type_node
7952 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7953 }
7954
7955 /* Implement va_start. */
7956
7957 static void
7958 ix86_va_start (tree valist, rtx nextarg)
7959 {
7960 HOST_WIDE_INT words, n_gpr, n_fpr;
7961 tree f_gpr, f_fpr, f_ovf, f_sav;
7962 tree gpr, fpr, ovf, sav, t;
7963 tree type;
7964 rtx ovf_rtx;
7965
7966 if (flag_split_stack
7967 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7968 {
7969 unsigned int scratch_regno;
7970
7971 /* When we are splitting the stack, we can't refer to the stack
7972 arguments using internal_arg_pointer, because they may be on
7973 the old stack. The split stack prologue will arrange to
7974 leave a pointer to the old stack arguments in a scratch
7975 register, which we here copy to a pseudo-register. The split
7976 stack prologue can't set the pseudo-register directly because
7977 it (the prologue) runs before any registers have been saved. */
7978
7979 scratch_regno = split_stack_prologue_scratch_regno ();
7980 if (scratch_regno != INVALID_REGNUM)
7981 {
7982 rtx reg, seq;
7983
7984 reg = gen_reg_rtx (Pmode);
7985 cfun->machine->split_stack_varargs_pointer = reg;
7986
7987 start_sequence ();
7988 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7989 seq = get_insns ();
7990 end_sequence ();
7991
7992 push_topmost_sequence ();
7993 emit_insn_after (seq, entry_of_function ());
7994 pop_topmost_sequence ();
7995 }
7996 }
7997
7998 /* Only 64bit target needs something special. */
7999 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8000 {
8001 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8002 std_expand_builtin_va_start (valist, nextarg);
8003 else
8004 {
8005 rtx va_r, next;
8006
8007 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8008 next = expand_binop (ptr_mode, add_optab,
8009 cfun->machine->split_stack_varargs_pointer,
8010 crtl->args.arg_offset_rtx,
8011 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8012 convert_move (va_r, next, 0);
8013 }
8014 return;
8015 }
8016
8017 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8018 f_fpr = DECL_CHAIN (f_gpr);
8019 f_ovf = DECL_CHAIN (f_fpr);
8020 f_sav = DECL_CHAIN (f_ovf);
8021
8022 valist = build_simple_mem_ref (valist);
8023 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8024 /* The following should be folded into the MEM_REF offset. */
8025 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8026 f_gpr, NULL_TREE);
8027 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8028 f_fpr, NULL_TREE);
8029 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8030 f_ovf, NULL_TREE);
8031 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8032 f_sav, NULL_TREE);
8033
8034 /* Count number of gp and fp argument registers used. */
8035 words = crtl->args.info.words;
8036 n_gpr = crtl->args.info.regno;
8037 n_fpr = crtl->args.info.sse_regno;
8038
8039 if (cfun->va_list_gpr_size)
8040 {
8041 type = TREE_TYPE (gpr);
8042 t = build2 (MODIFY_EXPR, type,
8043 gpr, build_int_cst (type, n_gpr * 8));
8044 TREE_SIDE_EFFECTS (t) = 1;
8045 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8046 }
8047
8048 if (TARGET_SSE && cfun->va_list_fpr_size)
8049 {
8050 type = TREE_TYPE (fpr);
8051 t = build2 (MODIFY_EXPR, type, fpr,
8052 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8053 TREE_SIDE_EFFECTS (t) = 1;
8054 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8055 }
8056
8057 /* Find the overflow area. */
8058 type = TREE_TYPE (ovf);
8059 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8060 ovf_rtx = crtl->args.internal_arg_pointer;
8061 else
8062 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8063 t = make_tree (type, ovf_rtx);
8064 if (words != 0)
8065 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8066 t = build2 (MODIFY_EXPR, type, ovf, t);
8067 TREE_SIDE_EFFECTS (t) = 1;
8068 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8069
8070 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8071 {
8072 /* Find the register save area.
8073 The function prologue saves it right above the stack frame. */
8074 type = TREE_TYPE (sav);
8075 t = make_tree (type, frame_pointer_rtx);
8076 if (!ix86_varargs_gpr_size)
8077 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8078 t = build2 (MODIFY_EXPR, type, sav, t);
8079 TREE_SIDE_EFFECTS (t) = 1;
8080 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8081 }
8082 }
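
/* An illustrative va_start result (a sketch, assuming the SysV x86-64 ABI):
   for

     int f (int a, double b, ...);

   one GP register and one SSE register are consumed by the named arguments,
   so the code above initializes gp_offset to 8, fp_offset to 48 + 16 = 64,
   overflow_arg_area to the first stack argument and reg_save_area to the
   area saved by the prologue.  */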
8083
8084 /* Implement va_arg. */
8085
8086 static tree
8087 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8088 gimple_seq *post_p)
8089 {
8090 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8091 tree f_gpr, f_fpr, f_ovf, f_sav;
8092 tree gpr, fpr, ovf, sav, t;
8093 int size, rsize;
8094 tree lab_false, lab_over = NULL_TREE;
8095 tree addr, t2;
8096 rtx container;
8097 int indirect_p = 0;
8098 tree ptrtype;
8099 enum machine_mode nat_mode;
8100 unsigned int arg_boundary;
8101
8102 /* Only 64bit target needs something special. */
8103 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8104 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8105
8106 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8107 f_fpr = DECL_CHAIN (f_gpr);
8108 f_ovf = DECL_CHAIN (f_fpr);
8109 f_sav = DECL_CHAIN (f_ovf);
8110
8111 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8112 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8113 valist = build_va_arg_indirect_ref (valist);
8114 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8115 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8116 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8117
8118 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8119 if (indirect_p)
8120 type = build_pointer_type (type);
8121 size = int_size_in_bytes (type);
8122 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8123
8124 nat_mode = type_natural_mode (type, NULL);
8125 switch (nat_mode)
8126 {
8127 case V8SFmode:
8128 case V8SImode:
8129 case V32QImode:
8130 case V16HImode:
8131 case V4DFmode:
8132 case V4DImode:
8133 /* Unnamed 256bit vector mode parameters are passed on the stack. */
8134 if (!TARGET_64BIT_MS_ABI)
8135 {
8136 container = NULL;
8137 break;
8138 }
8139
8140 default:
8141 container = construct_container (nat_mode, TYPE_MODE (type),
8142 type, 0, X86_64_REGPARM_MAX,
8143 X86_64_SSE_REGPARM_MAX, intreg,
8144 0);
8145 break;
8146 }
8147
8148 /* Pull the value out of the saved registers. */
8149
8150 addr = create_tmp_var (ptr_type_node, "addr");
8151
8152 if (container)
8153 {
8154 int needed_intregs, needed_sseregs;
8155 bool need_temp;
8156 tree int_addr, sse_addr;
8157
8158 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8159 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8160
8161 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8162
8163 need_temp = (!REG_P (container)
8164 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8165 || TYPE_ALIGN (type) > 128));
8166
8167 /* If we are passing a structure, verify that it is a consecutive block
8168 in the register save area.  If not, we need to do moves. */
8169 if (!need_temp && !REG_P (container))
8170 {
8171 /* Verify that all registers are strictly consecutive.  */
8172 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8173 {
8174 int i;
8175
8176 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8177 {
8178 rtx slot = XVECEXP (container, 0, i);
8179 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8180 || INTVAL (XEXP (slot, 1)) != i * 16)
8181 need_temp = 1;
8182 }
8183 }
8184 else
8185 {
8186 int i;
8187
8188 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8189 {
8190 rtx slot = XVECEXP (container, 0, i);
8191 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8192 || INTVAL (XEXP (slot, 1)) != i * 8)
8193 need_temp = 1;
8194 }
8195 }
8196 }
8197 if (!need_temp)
8198 {
8199 int_addr = addr;
8200 sse_addr = addr;
8201 }
8202 else
8203 {
8204 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8205 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8206 }
8207
8208 /* First ensure that we fit completely in registers. */
8209 if (needed_intregs)
8210 {
8211 t = build_int_cst (TREE_TYPE (gpr),
8212 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8213 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8214 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8215 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8216 gimplify_and_add (t, pre_p);
8217 }
8218 if (needed_sseregs)
8219 {
8220 t = build_int_cst (TREE_TYPE (fpr),
8221 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8222 + X86_64_REGPARM_MAX * 8);
8223 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8224 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8225 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8226 gimplify_and_add (t, pre_p);
8227 }
8228
8229 /* Compute index to start of area used for integer regs. */
8230 if (needed_intregs)
8231 {
8232 /* int_addr = gpr + sav; */
8233 t = fold_build_pointer_plus (sav, gpr);
8234 gimplify_assign (int_addr, t, pre_p);
8235 }
8236 if (needed_sseregs)
8237 {
8238 /* sse_addr = fpr + sav; */
8239 t = fold_build_pointer_plus (sav, fpr);
8240 gimplify_assign (sse_addr, t, pre_p);
8241 }
8242 if (need_temp)
8243 {
8244 int i, prev_size = 0;
8245 tree temp = create_tmp_var (type, "va_arg_tmp");
8246
8247 /* addr = &temp; */
8248 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8249 gimplify_assign (addr, t, pre_p);
8250
8251 for (i = 0; i < XVECLEN (container, 0); i++)
8252 {
8253 rtx slot = XVECEXP (container, 0, i);
8254 rtx reg = XEXP (slot, 0);
8255 enum machine_mode mode = GET_MODE (reg);
8256 tree piece_type;
8257 tree addr_type;
8258 tree daddr_type;
8259 tree src_addr, src;
8260 int src_offset;
8261 tree dest_addr, dest;
8262 int cur_size = GET_MODE_SIZE (mode);
8263
8264 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8265 prev_size = INTVAL (XEXP (slot, 1));
8266 if (prev_size + cur_size > size)
8267 {
8268 cur_size = size - prev_size;
8269 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8270 if (mode == BLKmode)
8271 mode = QImode;
8272 }
8273 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8274 if (mode == GET_MODE (reg))
8275 addr_type = build_pointer_type (piece_type);
8276 else
8277 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8278 true);
8279 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8280 true);
8281
8282 if (SSE_REGNO_P (REGNO (reg)))
8283 {
8284 src_addr = sse_addr;
8285 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8286 }
8287 else
8288 {
8289 src_addr = int_addr;
8290 src_offset = REGNO (reg) * 8;
8291 }
8292 src_addr = fold_convert (addr_type, src_addr);
8293 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8294
8295 dest_addr = fold_convert (daddr_type, addr);
8296 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8297 if (cur_size == GET_MODE_SIZE (mode))
8298 {
8299 src = build_va_arg_indirect_ref (src_addr);
8300 dest = build_va_arg_indirect_ref (dest_addr);
8301
8302 gimplify_assign (dest, src, pre_p);
8303 }
8304 else
8305 {
8306 tree copy
8307 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8308 3, dest_addr, src_addr,
8309 size_int (cur_size));
8310 gimplify_and_add (copy, pre_p);
8311 }
8312 prev_size += cur_size;
8313 }
8314 }
8315
8316 if (needed_intregs)
8317 {
8318 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8319 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8320 gimplify_assign (gpr, t, pre_p);
8321 }
8322
8323 if (needed_sseregs)
8324 {
8325 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8326 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8327 gimplify_assign (fpr, t, pre_p);
8328 }
8329
8330 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8331
8332 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8333 }
8334
8335 /* ... otherwise out of the overflow area. */
8336
8337 /* When we align a parameter on the stack for the caller, if its
8338 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8339 aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee
8340 here with the caller. */
8341 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8342 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8343 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8344
8345 /* Care for on-stack alignment if needed. */
8346 if (arg_boundary <= 64 || size == 0)
8347 t = ovf;
8348 else
8349 {
8350 HOST_WIDE_INT align = arg_boundary / 8;
8351 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8352 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8353 build_int_cst (TREE_TYPE (t), -align));
8354 }
8355
8356 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8357 gimplify_assign (addr, t, pre_p);
8358
8359 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8360 gimplify_assign (unshare_expr (ovf), t, pre_p);
8361
8362 if (container)
8363 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8364
8365 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8366 addr = fold_convert (ptrtype, addr);
8367
8368 if (indirect_p)
8369 addr = build_va_arg_indirect_ref (addr);
8370 return build_va_arg_indirect_ref (addr);
8371 }
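
/* Roughly, for va_arg (ap, int) the GIMPLE emitted above amounts to the
   following C sketch (illustrative only, alignment handling omitted):

     if (ap->gp_offset < 48)                 // 6 GP regs, 8 bytes each
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;

   Aggregates that straddle register classes additionally get copied into a
   temporary, as handled by the need_temp path above.  */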
8372 \f
8373 /* Return true if OPNUM's MEM should be matched
8374 in movabs* patterns. */
8375
8376 bool
8377 ix86_check_movabs (rtx insn, int opnum)
8378 {
8379 rtx set, mem;
8380
8381 set = PATTERN (insn);
8382 if (GET_CODE (set) == PARALLEL)
8383 set = XVECEXP (set, 0, 0);
8384 gcc_assert (GET_CODE (set) == SET);
8385 mem = XEXP (set, opnum);
8386 while (GET_CODE (mem) == SUBREG)
8387 mem = SUBREG_REG (mem);
8388 gcc_assert (MEM_P (mem));
8389 return volatile_ok || !MEM_VOLATILE_P (mem);
8390 }
8391 \f
8392 /* Initialize the table of extra 80387 mathematical constants. */
8393
8394 static void
8395 init_ext_80387_constants (void)
8396 {
8397 static const char * cst[5] =
8398 {
8399 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8400 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8401 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8402 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8403 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8404 };
8405 int i;
8406
8407 for (i = 0; i < 5; i++)
8408 {
8409 real_from_string (&ext_80387_constants_table[i], cst[i]);
8410 /* Ensure each constant is rounded to XFmode precision. */
8411 real_convert (&ext_80387_constants_table[i],
8412 XFmode, &ext_80387_constants_table[i]);
8413 }
8414
8415 ext_80387_constants_init = 1;
8416 }
8417
8418 /* Return non-zero if the constant is something that
8419 can be loaded with a special instruction. */
8420
8421 int
8422 standard_80387_constant_p (rtx x)
8423 {
8424 enum machine_mode mode = GET_MODE (x);
8425
8426 REAL_VALUE_TYPE r;
8427
8428 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8429 return -1;
8430
8431 if (x == CONST0_RTX (mode))
8432 return 1;
8433 if (x == CONST1_RTX (mode))
8434 return 2;
8435
8436 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8437
8438 /* For XFmode constants, try to find a special 80387 instruction when
8439 optimizing for size or on those CPUs that benefit from them. */
8440 if (mode == XFmode
8441 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8442 {
8443 int i;
8444
8445 if (! ext_80387_constants_init)
8446 init_ext_80387_constants ();
8447
8448 for (i = 0; i < 5; i++)
8449 if (real_identical (&r, &ext_80387_constants_table[i]))
8450 return i + 3;
8451 }
8452
8453 /* A load of the constant -0.0 or -1.0 will be split into an
8454 fldz;fchs or fld1;fchs sequence. */
8455 if (real_isnegzero (&r))
8456 return 8;
8457 if (real_identical (&r, &dconstm1))
8458 return 9;
8459
8460 return 0;
8461 }
8462
8463 /* Return the opcode of the special instruction to be used to load
8464 the constant X. */
8465
8466 const char *
8467 standard_80387_constant_opcode (rtx x)
8468 {
8469 switch (standard_80387_constant_p (x))
8470 {
8471 case 1:
8472 return "fldz";
8473 case 2:
8474 return "fld1";
8475 case 3:
8476 return "fldlg2";
8477 case 4:
8478 return "fldln2";
8479 case 5:
8480 return "fldl2e";
8481 case 6:
8482 return "fldl2t";
8483 case 7:
8484 return "fldpi";
8485 case 8:
8486 case 9:
8487 return "#";
8488 default:
8489 gcc_unreachable ();
8490 }
8491 }
8492
8493 /* Return the CONST_DOUBLE representing the 80387 constant that is
8494 loaded by the specified special instruction. The argument IDX
8495 matches the return value from standard_80387_constant_p. */
8496
8497 rtx
8498 standard_80387_constant_rtx (int idx)
8499 {
8500 int i;
8501
8502 if (! ext_80387_constants_init)
8503 init_ext_80387_constants ();
8504
8505 switch (idx)
8506 {
8507 case 3:
8508 case 4:
8509 case 5:
8510 case 6:
8511 case 7:
8512 i = idx - 3;
8513 break;
8514
8515 default:
8516 gcc_unreachable ();
8517 }
8518
8519 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8520 XFmode);
8521 }
8522
8523 /* Return 1 if X is all 0s and 2 if X is all 1s
8524 in a supported SSE/AVX vector mode. */
8525
8526 int
8527 standard_sse_constant_p (rtx x)
8528 {
8529 enum machine_mode mode = GET_MODE (x);
8530
8531 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8532 return 1;
8533 if (vector_all_ones_operand (x, mode))
8534 switch (mode)
8535 {
8536 case V16QImode:
8537 case V8HImode:
8538 case V4SImode:
8539 case V2DImode:
8540 if (TARGET_SSE2)
8541 return 2;
8542 case V32QImode:
8543 case V16HImode:
8544 case V8SImode:
8545 case V4DImode:
8546 if (TARGET_AVX2)
8547 return 2;
8548 default:
8549 break;
8550 }
8551
8552 return 0;
8553 }
8554
8555 /* Return the opcode of the special instruction to be used to load
8556 the constant X. */
8557
8558 const char *
8559 standard_sse_constant_opcode (rtx insn, rtx x)
8560 {
8561 switch (standard_sse_constant_p (x))
8562 {
8563 case 1:
8564 switch (get_attr_mode (insn))
8565 {
8566 case MODE_TI:
8567 return "%vpxor\t%0, %d0";
8568 case MODE_V2DF:
8569 return "%vxorpd\t%0, %d0";
8570 case MODE_V4SF:
8571 return "%vxorps\t%0, %d0";
8572
8573 case MODE_OI:
8574 return "vpxor\t%x0, %x0, %x0";
8575 case MODE_V4DF:
8576 return "vxorpd\t%x0, %x0, %x0";
8577 case MODE_V8SF:
8578 return "vxorps\t%x0, %x0, %x0";
8579
8580 default:
8581 break;
8582 }
8583
8584 case 2:
8585 if (TARGET_AVX)
8586 return "vpcmpeqd\t%0, %0, %0";
8587 else
8588 return "pcmpeqd\t%0, %0";
8589
8590 default:
8591 break;
8592 }
8593 gcc_unreachable ();
8594 }
8595
8596 /* Returns true if OP contains a symbol reference.  */
8597
8598 bool
8599 symbolic_reference_mentioned_p (rtx op)
8600 {
8601 const char *fmt;
8602 int i;
8603
8604 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8605 return true;
8606
8607 fmt = GET_RTX_FORMAT (GET_CODE (op));
8608 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8609 {
8610 if (fmt[i] == 'E')
8611 {
8612 int j;
8613
8614 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8615 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8616 return true;
8617 }
8618
8619 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8620 return true;
8621 }
8622
8623 return false;
8624 }
8625
8626 /* Return true if it is appropriate to emit `ret' instructions in the
8627 body of a function. Do this only if the epilogue is simple, needing a
8628 couple of insns. Prior to reloading, we can't tell how many registers
8629 must be saved, so return false then. Return false if there is no frame
8630 marker to de-allocate. */
8631
8632 bool
8633 ix86_can_use_return_insn_p (void)
8634 {
8635 struct ix86_frame frame;
8636
8637 if (! reload_completed || frame_pointer_needed)
8638 return false;
8639
8640 /* Don't allow more than 32k pop, since that's all we can do
8641 with one instruction. */
8642 if (crtl->args.pops_args && crtl->args.size >= 32768)
8643 return false;
8644
8645 ix86_compute_frame_layout (&frame);
8646 return (frame.stack_pointer_offset == UNITS_PER_WORD
8647 && (frame.nregs + frame.nsseregs) == 0);
8648 }
8649 \f
8650 /* Value should be nonzero if functions must have frame pointers.
8651 Zero means the frame pointer need not be set up (and parms may
8652 be accessed via the stack pointer) in functions that seem suitable. */
8653
8654 static bool
8655 ix86_frame_pointer_required (void)
8656 {
8657 /* If we accessed previous frames, then the generated code expects
8658 to be able to access the saved ebp value in our frame. */
8659 if (cfun->machine->accesses_prev_frame)
8660 return true;
8661
8662 /* Several x86 OSes need a frame pointer for other reasons,
8663 usually pertaining to setjmp. */
8664 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8665 return true;
8666
8667 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8668 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8669 return true;
8670
8671 /* With Win64 SEH, very large frames need a frame pointer, as the
8672 maximum stack allocation is 4GB. */
8673 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8674 return true;
8675
8676 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8677 turns off the frame pointer by default. Turn it back on now if
8678 we've not got a leaf function. */
8679 if (TARGET_OMIT_LEAF_FRAME_POINTER
8680 && (!crtl->is_leaf
8681 || ix86_current_function_calls_tls_descriptor))
8682 return true;
8683
8684 if (crtl->profile && !flag_fentry)
8685 return true;
8686
8687 return false;
8688 }
8689
8690 /* Record that the current function accesses previous call frames. */
8691
8692 void
8693 ix86_setup_frame_addresses (void)
8694 {
8695 cfun->machine->accesses_prev_frame = 1;
8696 }
8697 \f
8698 #ifndef USE_HIDDEN_LINKONCE
8699 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8700 # define USE_HIDDEN_LINKONCE 1
8701 # else
8702 # define USE_HIDDEN_LINKONCE 0
8703 # endif
8704 #endif
8705
8706 static int pic_labels_used;
8707
8708 /* Fills in the label name that should be used for a pc thunk for
8709 the given register. */
8710
8711 static void
8712 get_pc_thunk_name (char name[32], unsigned int regno)
8713 {
8714 gcc_assert (!TARGET_64BIT);
8715
8716 if (USE_HIDDEN_LINKONCE)
8717 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8718 else
8719 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8720 }
8721
8722
8723 /* This function generates the pc thunks used for -fpic: each thunk loads
8724 its register with the return address of the caller and then returns. */
8725
8726 static void
8727 ix86_code_end (void)
8728 {
8729 rtx xops[2];
8730 int regno;
8731
8732 for (regno = AX_REG; regno <= SP_REG; regno++)
8733 {
8734 char name[32];
8735 tree decl;
8736
8737 if (!(pic_labels_used & (1 << regno)))
8738 continue;
8739
8740 get_pc_thunk_name (name, regno);
8741
8742 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8743 get_identifier (name),
8744 build_function_type_list (void_type_node, NULL_TREE));
8745 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8746 NULL_TREE, void_type_node);
8747 TREE_PUBLIC (decl) = 1;
8748 TREE_STATIC (decl) = 1;
8749 DECL_IGNORED_P (decl) = 1;
8750
8751 #if TARGET_MACHO
8752 if (TARGET_MACHO)
8753 {
8754 switch_to_section (darwin_sections[text_coal_section]);
8755 fputs ("\t.weak_definition\t", asm_out_file);
8756 assemble_name (asm_out_file, name);
8757 fputs ("\n\t.private_extern\t", asm_out_file);
8758 assemble_name (asm_out_file, name);
8759 putc ('\n', asm_out_file);
8760 ASM_OUTPUT_LABEL (asm_out_file, name);
8761 DECL_WEAK (decl) = 1;
8762 }
8763 else
8764 #endif
8765 if (USE_HIDDEN_LINKONCE)
8766 {
8767 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8768
8769 targetm.asm_out.unique_section (decl, 0);
8770 switch_to_section (get_named_section (decl, NULL, 0));
8771
8772 targetm.asm_out.globalize_label (asm_out_file, name);
8773 fputs ("\t.hidden\t", asm_out_file);
8774 assemble_name (asm_out_file, name);
8775 putc ('\n', asm_out_file);
8776 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8777 }
8778 else
8779 {
8780 switch_to_section (text_section);
8781 ASM_OUTPUT_LABEL (asm_out_file, name);
8782 }
8783
8784 DECL_INITIAL (decl) = make_node (BLOCK);
8785 current_function_decl = decl;
8786 init_function_start (decl);
8787 first_function_block_is_cold = false;
8788 /* Make sure unwind info is emitted for the thunk if needed. */
8789 final_start_function (emit_barrier (), asm_out_file, 1);
8790
8791 /* Pad the thunk's stack-to-IP move with 4 instructions' worth of NOPs
8792 (two NOPs count as one instruction, hence the 8 NOPs below). */
8793 if (TARGET_PAD_SHORT_FUNCTION)
8794 {
8795 int i = 8;
8796
8797 while (i--)
8798 fputs ("\tnop\n", asm_out_file);
8799 }
8800
8801 xops[0] = gen_rtx_REG (Pmode, regno);
8802 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8803 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8804 fputs ("\tret\n", asm_out_file);
8805 final_end_function ();
8806 init_insn_lengths ();
8807 free_after_compilation (cfun);
8808 set_cfun (NULL);
8809 current_function_decl = NULL;
8810 }
8811
8812 if (flag_split_stack)
8813 file_end_indicate_split_stack ();
8814 }
8815
8816 /* Emit code for the SET_GOT patterns. */
8817
8818 const char *
8819 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8820 {
8821 rtx xops[3];
8822
8823 xops[0] = dest;
8824
8825 if (TARGET_VXWORKS_RTP && flag_pic)
8826 {
8827 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8828 xops[2] = gen_rtx_MEM (Pmode,
8829 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8830 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8831
8832 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8833 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8834 an unadorned address. */
8835 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8836 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8837 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8838 return "";
8839 }
8840
8841 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8842
8843 if (!flag_pic)
8844 {
8845 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8846
8847 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8848
8849 #if TARGET_MACHO
8850 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8851 is what will be referenced by the Mach-O PIC subsystem. */
8852 if (!label)
8853 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8854 #endif
8855
8856 targetm.asm_out.internal_label (asm_out_file, "L",
8857 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8858 }
8859 else
8860 {
8861 char name[32];
8862 get_pc_thunk_name (name, REGNO (dest));
8863 pic_labels_used |= 1 << REGNO (dest);
8864
8865 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8866 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8867 output_asm_insn ("call\t%X2", xops);
8868 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8869 is what will be referenced by the Mach-O PIC subsystem. */
8870 #if TARGET_MACHO
8871 if (!label)
8872 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8873 else
8874 targetm.asm_out.internal_label (asm_out_file, "L",
8875 CODE_LABEL_NUMBER (label));
8876 #endif
8877 }
8878
8879 if (!TARGET_MACHO)
8880 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8881
8882 return "";
8883 }
8884
8885 /* Generate a "push" pattern for input ARG. */
8886
8887 static rtx
8888 gen_push (rtx arg)
8889 {
8890 struct machine_function *m = cfun->machine;
8891
8892 if (m->fs.cfa_reg == stack_pointer_rtx)
8893 m->fs.cfa_offset += UNITS_PER_WORD;
8894 m->fs.sp_offset += UNITS_PER_WORD;
8895
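    /* Pushes are always done in word_mode; promote a narrower hard register
       to its word_mode counterpart so the PRE_DEC push pattern below matches.  */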
8896 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8897 arg = gen_rtx_REG (word_mode, REGNO (arg));
8898
8899 return gen_rtx_SET (VOIDmode,
8900 gen_rtx_MEM (word_mode,
8901 gen_rtx_PRE_DEC (Pmode,
8902 stack_pointer_rtx)),
8903 arg);
8904 }
8905
8906 /* Generate a "pop" pattern for input ARG. */
8907
8908 static rtx
8909 gen_pop (rtx arg)
8910 {
8911 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8912 arg = gen_rtx_REG (word_mode, REGNO (arg));
8913
8914 return gen_rtx_SET (VOIDmode,
8915 arg,
8916 gen_rtx_MEM (word_mode,
8917 gen_rtx_POST_INC (Pmode,
8918 stack_pointer_rtx)));
8919 }
8920
8921 /* Return the number of an unused call-clobbered register that is available
8922 for the entire function, or INVALID_REGNUM if there is none. */
8923
8924 static unsigned int
8925 ix86_select_alt_pic_regnum (void)
8926 {
8927 if (crtl->is_leaf
8928 && !crtl->profile
8929 && !ix86_current_function_calls_tls_descriptor)
8930 {
8931 int i, drap;
8932 /* Can't use the same register for both PIC and DRAP. */
8933 if (crtl->drap_reg)
8934 drap = REGNO (crtl->drap_reg);
8935 else
8936 drap = -1;
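      /* Try %ecx, %edx and %eax in turn (hard regs 2, 1 and 0); all three are
         call-clobbered, so a leaf function that never touches them can use one
         as the PIC register without saving it.  */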
8937 for (i = 2; i >= 0; --i)
8938 if (i != drap && !df_regs_ever_live_p (i))
8939 return i;
8940 }
8941
8942 return INVALID_REGNUM;
8943 }
8944
8945 /* Return TRUE if we need to save REGNO. */
8946
8947 static bool
8948 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8949 {
8950 if (pic_offset_table_rtx
8951 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8952 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8953 || crtl->profile
8954 || crtl->calls_eh_return
8955 || crtl->uses_const_pool))
8956 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8957
8958 if (crtl->calls_eh_return && maybe_eh_return)
8959 {
8960 unsigned i;
8961 for (i = 0; ; i++)
8962 {
8963 unsigned test = EH_RETURN_DATA_REGNO (i);
8964 if (test == INVALID_REGNUM)
8965 break;
8966 if (test == regno)
8967 return true;
8968 }
8969 }
8970
8971 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8972 return true;
8973
8974 return (df_regs_ever_live_p (regno)
8975 && !call_used_regs[regno]
8976 && !fixed_regs[regno]
8977 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8978 }
8979
8980 /* Return the number of saved general purpose registers. */
8981
8982 static int
8983 ix86_nsaved_regs (void)
8984 {
8985 int nregs = 0;
8986 int regno;
8987
8988 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8989 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8990 nregs ++;
8991 return nregs;
8992 }
8993
8994 /* Return the number of saved SSE registers. */
8995
8996 static int
8997 ix86_nsaved_sseregs (void)
8998 {
8999 int nregs = 0;
9000 int regno;
9001
9002 if (!TARGET_64BIT_MS_ABI)
9003 return 0;
9004 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9005 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9006 nregs ++;
9007 return nregs;
9008 }
9009
9010 /* Given FROM and TO register numbers, say whether this elimination is
9011 allowed. If stack alignment is needed, we can only replace argument
9012 pointer with hard frame pointer, or replace frame pointer with stack
9013 pointer. Otherwise, frame pointer elimination is automatically
9014 handled and all other eliminations are valid. */
9015
9016 static bool
9017 ix86_can_eliminate (const int from, const int to)
9018 {
9019 if (stack_realign_fp)
9020 return ((from == ARG_POINTER_REGNUM
9021 && to == HARD_FRAME_POINTER_REGNUM)
9022 || (from == FRAME_POINTER_REGNUM
9023 && to == STACK_POINTER_REGNUM));
9024 else
9025 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9026 }
9027
9028 /* Return the offset between two registers, one to be eliminated, and the other
9029 its replacement, at the start of a routine. */
9030
9031 HOST_WIDE_INT
9032 ix86_initial_elimination_offset (int from, int to)
9033 {
9034 struct ix86_frame frame;
9035 ix86_compute_frame_layout (&frame);
9036
9037 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9038 return frame.hard_frame_pointer_offset;
9039 else if (from == FRAME_POINTER_REGNUM
9040 && to == HARD_FRAME_POINTER_REGNUM)
9041 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9042 else
9043 {
9044 gcc_assert (to == STACK_POINTER_REGNUM);
9045
9046 if (from == ARG_POINTER_REGNUM)
9047 return frame.stack_pointer_offset;
9048
9049 gcc_assert (from == FRAME_POINTER_REGNUM);
9050 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9051 }
9052 }
9053
9054 /* In a dynamically-aligned function, we can't know the offset from
9055 stack pointer to frame pointer, so we must ensure that setjmp
9056 eliminates fp against the hard fp (%ebp) rather than trying to
9057 index from %esp up to the top of the frame across a gap that is
9058 of unknown (at compile-time) size. */
9059 static rtx
9060 ix86_builtin_setjmp_frame_value (void)
9061 {
9062 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9063 }
9064
9065 /* When using -fsplit-stack, the allocation routines set a field in
9066 the TCB to the bottom of the stack plus this much space, measured
9067 in bytes. */
9068
9069 #define SPLIT_STACK_AVAILABLE 256
9070
9071 /* Fill in the ix86_frame structure describing the frame of the function being compiled. */
9072
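/* The offsets below are accumulated from the CFA downwards, roughly in this
   order: return address, optional pushed static chain, saved frame pointer,
   the GPR save area, the 16-byte aligned SSE save area (64-bit MS ABI only),
   the va_arg register save area, local variables, and the outgoing argument
   block.  */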
9073 static void
9074 ix86_compute_frame_layout (struct ix86_frame *frame)
9075 {
9076 unsigned HOST_WIDE_INT stack_alignment_needed;
9077 HOST_WIDE_INT offset;
9078 unsigned HOST_WIDE_INT preferred_alignment;
9079 HOST_WIDE_INT size = get_frame_size ();
9080 HOST_WIDE_INT to_allocate;
9081
9082 frame->nregs = ix86_nsaved_regs ();
9083 frame->nsseregs = ix86_nsaved_sseregs ();
9084
9085 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9086 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9087
9088 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9089 except for function prologues and leaf functions. */
9090 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9091 && (!crtl->is_leaf || cfun->calls_alloca != 0
9092 || ix86_current_function_calls_tls_descriptor))
9093 {
9094 preferred_alignment = 16;
9095 stack_alignment_needed = 16;
9096 crtl->preferred_stack_boundary = 128;
9097 crtl->stack_alignment_needed = 128;
9098 }
9099
9100 gcc_assert (!size || stack_alignment_needed);
9101 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9102 gcc_assert (preferred_alignment <= stack_alignment_needed);
9103
9104 /* For SEH we have to limit the amount of code movement into the prologue.
9105 At present we do this via a BLOCKAGE, at which point there's very little
9106 scheduling that can be done, which means that there's very little point
9107 in doing anything except PUSHs. */
9108 if (TARGET_SEH)
9109 cfun->machine->use_fast_prologue_epilogue = false;
9110
9111 /* During the reload iteration the number of registers saved can change.
9112 Recompute the value as needed. Do not recompute when the number of registers
9113 didn't change, as reload calls this function multiple times and does not
9114 expect the decision to change within a single iteration. */
9115 else if (!optimize_function_for_size_p (cfun)
9116 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9117 {
9118 int count = frame->nregs;
9119 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9120
9121 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9122
9123 /* The fast prologue uses move instead of push to save registers. This
9124 is significantly longer, but also executes faster as modern hardware
9125 can execute the moves in parallel, but can't do that for push/pop.
9126
9127 Be careful about choosing which prologue to emit: when the function takes
9128 many instructions to execute, we may use the slow version, as well as when
9129 the function is known to be outside a hot spot (this is known only with
9130 profile feedback). Weight the size of the function by the number of registers
9131 to save, as it is cheap to use one or two push instructions but very
9132 slow to use many of them. */
9133 if (count)
9134 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9135 if (node->frequency < NODE_FREQUENCY_NORMAL
9136 || (flag_branch_probabilities
9137 && node->frequency < NODE_FREQUENCY_HOT))
9138 cfun->machine->use_fast_prologue_epilogue = false;
9139 else
9140 cfun->machine->use_fast_prologue_epilogue
9141 = !expensive_function_p (count);
9142 }
9143
9144 frame->save_regs_using_mov
9145 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9146 /* If static stack checking is enabled and done with probes,
9147 the registers need to be saved before allocating the frame. */
9148 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9149
9150 /* Skip return address. */
9151 offset = UNITS_PER_WORD;
9152
9153 /* Skip pushed static chain. */
9154 if (ix86_static_chain_on_stack)
9155 offset += UNITS_PER_WORD;
9156
9157 /* Skip saved base pointer. */
9158 if (frame_pointer_needed)
9159 offset += UNITS_PER_WORD;
9160 frame->hfp_save_offset = offset;
9161
9162 /* The traditional frame pointer location is at the top of the frame. */
9163 frame->hard_frame_pointer_offset = offset;
9164
9165 /* Register save area */
9166 offset += frame->nregs * UNITS_PER_WORD;
9167 frame->reg_save_offset = offset;
9168
9169 /* On SEH target, registers are pushed just before the frame pointer
9170 location. */
9171 if (TARGET_SEH)
9172 frame->hard_frame_pointer_offset = offset;
9173
9174 /* Align and set SSE register save area. */
9175 if (frame->nsseregs)
9176 {
9177 /* The only ABI that has saved SSE registers (Win64) also has a
9178 16-byte aligned default stack, and thus we don't need to be
9179 within the re-aligned local stack frame to save them. */
9180 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9181 offset = (offset + 16 - 1) & -16;
9182 offset += frame->nsseregs * 16;
9183 }
9184 frame->sse_reg_save_offset = offset;
9185
9186 /* The re-aligned stack starts here. Values before this point are not
9187 directly comparable with values below this point. In order to make
9188 sure that no value happens to be the same before and after, force
9189 the alignment computation below to add a non-zero value. */
9190 if (stack_realign_fp)
9191 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9192
9193 /* Va-arg area */
9194 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9195 offset += frame->va_arg_size;
9196
9197 /* Align start of frame for local function. */
9198 if (stack_realign_fp
9199 || offset != frame->sse_reg_save_offset
9200 || size != 0
9201 || !crtl->is_leaf
9202 || cfun->calls_alloca
9203 || ix86_current_function_calls_tls_descriptor)
9204 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9205
9206 /* Frame pointer points here. */
9207 frame->frame_pointer_offset = offset;
9208
9209 offset += size;
9210
9211 /* Add the outgoing arguments area. It can be skipped if we eliminated
9212 all the function calls as dead code.
9213 Skipping is however impossible when the function calls alloca, as the
9214 alloca expander assumes that the last crtl->outgoing_args_size bytes
9215 of the stack frame are unused. */
9216 if (ACCUMULATE_OUTGOING_ARGS
9217 && (!crtl->is_leaf || cfun->calls_alloca
9218 || ix86_current_function_calls_tls_descriptor))
9219 {
9220 offset += crtl->outgoing_args_size;
9221 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9222 }
9223 else
9224 frame->outgoing_arguments_size = 0;
9225
9226 /* Align stack boundary. Only needed if we're calling another function
9227 or using alloca. */
9228 if (!crtl->is_leaf || cfun->calls_alloca
9229 || ix86_current_function_calls_tls_descriptor)
9230 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9231
9232 /* We've reached end of stack frame. */
9233 frame->stack_pointer_offset = offset;
9234
9235 /* Size prologue needs to allocate. */
9236 to_allocate = offset - frame->sse_reg_save_offset;
9237
9238 if ((!to_allocate && frame->nregs <= 1)
9239 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9240 frame->save_regs_using_mov = false;
9241
9242 if (ix86_using_red_zone ()
9243 && crtl->sp_is_unchanging
9244 && crtl->is_leaf
9245 && !ix86_current_function_calls_tls_descriptor)
9246 {
9247 frame->red_zone_size = to_allocate;
9248 if (frame->save_regs_using_mov)
9249 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9250 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9251 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9252 }
9253 else
9254 frame->red_zone_size = 0;
9255 frame->stack_pointer_offset -= frame->red_zone_size;
9256
9257 /* The SEH frame pointer location is near the bottom of the frame.
9258 This is enforced by the fact that the difference between the
9259 stack pointer and the frame pointer is limited to 240 bytes in
9260 the unwind data structure. */
9261 if (TARGET_SEH)
9262 {
9263 HOST_WIDE_INT diff;
9264
9265 /* If we can leave the frame pointer where it is, do so; this also returns
9266 the establisher frame for __builtin_frame_address (0). */
9267 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9268 if (diff <= SEH_MAX_FRAME_SIZE
9269 && (diff > 240 || (diff & 15) != 0)
9270 && !crtl->accesses_prior_frames)
9271 {
9272 /* Ideally we'd determine what portion of the local stack frame
9273 (within the constraint of the lowest 240) is most heavily used.
9274 But without that complication, simply bias the frame pointer
9275 by 128 bytes so as to maximize the amount of the local stack
9276 frame that is addressable with 8-bit offsets. */
9277 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9278 }
9279 }
9280 }
9281
9282 /* This is semi-inlined memory_address_length, but simplified
9283 since we know that we're always dealing with reg+offset, and
9284 to avoid having to create and discard all that rtl. */
9285
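/* For example, 8(%ebx) costs one displacement byte, 8(%esp) additionally needs
   a SIB byte (length 2), (%ebx) needs no extra bytes at all, and (%ebp) still
   requires a zero displacement byte.  */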
9286 static inline int
9287 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9288 {
9289 int len = 4;
9290
9291 if (offset == 0)
9292 {
9293 /* EBP and R13 cannot be encoded without an offset. */
9294 len = (regno == BP_REG || regno == R13_REG);
9295 }
9296 else if (IN_RANGE (offset, -128, 127))
9297 len = 1;
9298
9299 /* ESP and R12 must be encoded with a SIB byte. */
9300 if (regno == SP_REG || regno == R12_REG)
9301 len++;
9302
9303 return len;
9304 }
9305
9306 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9307 The valid base registers are taken from CFUN->MACHINE->FS. */
9308
9309 static rtx
9310 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9311 {
9312 const struct machine_function *m = cfun->machine;
9313 rtx base_reg = NULL;
9314 HOST_WIDE_INT base_offset = 0;
9315
9316 if (m->use_fast_prologue_epilogue)
9317 {
9318 /* Choose the base register most likely to allow the most scheduling
9319 opportunities. Generally FP is valid throughout the function,
9320 while DRAP must be reloaded within the epilogue. But prefer either
9321 over the SP, whose addressing requires a larger encoding (a SIB byte). */
9322
9323 if (m->fs.fp_valid)
9324 {
9325 base_reg = hard_frame_pointer_rtx;
9326 base_offset = m->fs.fp_offset - cfa_offset;
9327 }
9328 else if (m->fs.drap_valid)
9329 {
9330 base_reg = crtl->drap_reg;
9331 base_offset = 0 - cfa_offset;
9332 }
9333 else if (m->fs.sp_valid)
9334 {
9335 base_reg = stack_pointer_rtx;
9336 base_offset = m->fs.sp_offset - cfa_offset;
9337 }
9338 }
9339 else
9340 {
9341 HOST_WIDE_INT toffset;
9342 int len = 16, tlen;
9343
9344 /* Choose the base register with the smallest address encoding.
9345 With a tie, choose FP > DRAP > SP. */
9346 if (m->fs.sp_valid)
9347 {
9348 base_reg = stack_pointer_rtx;
9349 base_offset = m->fs.sp_offset - cfa_offset;
9350 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9351 }
9352 if (m->fs.drap_valid)
9353 {
9354 toffset = 0 - cfa_offset;
9355 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9356 if (tlen <= len)
9357 {
9358 base_reg = crtl->drap_reg;
9359 base_offset = toffset;
9360 len = tlen;
9361 }
9362 }
9363 if (m->fs.fp_valid)
9364 {
9365 toffset = m->fs.fp_offset - cfa_offset;
9366 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9367 if (tlen <= len)
9368 {
9369 base_reg = hard_frame_pointer_rtx;
9370 base_offset = toffset;
9371 len = tlen;
9372 }
9373 }
9374 }
9375 gcc_assert (base_reg != NULL);
9376
9377 return plus_constant (Pmode, base_reg, base_offset);
9378 }
9379
9380 /* Emit code to save registers in the prologue. */
9381
9382 static void
9383 ix86_emit_save_regs (void)
9384 {
9385 unsigned int regno;
9386 rtx insn;
9387
9388 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9389 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9390 {
9391 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9392 RTX_FRAME_RELATED_P (insn) = 1;
9393 }
9394 }
9395
9396 /* Emit a single register save at CFA - CFA_OFFSET. */
9397
9398 static void
9399 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9400 HOST_WIDE_INT cfa_offset)
9401 {
9402 struct machine_function *m = cfun->machine;
9403 rtx reg = gen_rtx_REG (mode, regno);
9404 rtx mem, addr, base, insn;
9405
9406 addr = choose_baseaddr (cfa_offset);
9407 mem = gen_frame_mem (mode, addr);
9408
9409 /* For SSE saves, we need to indicate the 128-bit alignment. */
9410 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9411
9412 insn = emit_move_insn (mem, reg);
9413 RTX_FRAME_RELATED_P (insn) = 1;
9414
9415 base = addr;
9416 if (GET_CODE (base) == PLUS)
9417 base = XEXP (base, 0);
9418 gcc_checking_assert (REG_P (base));
9419
9420 /* When saving registers into a re-aligned local stack frame, avoid
9421 any tricky guessing by dwarf2out. */
9422 if (m->fs.realigned)
9423 {
9424 gcc_checking_assert (stack_realign_drap);
9425
9426 if (regno == REGNO (crtl->drap_reg))
9427 {
9428 /* A bit of a hack. We force the DRAP register to be saved in
9429 the re-aligned stack frame, which provides us with a copy
9430 of the CFA that will last past the prologue. Install it. */
9431 gcc_checking_assert (cfun->machine->fs.fp_valid);
9432 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9433 cfun->machine->fs.fp_offset - cfa_offset);
9434 mem = gen_rtx_MEM (mode, addr);
9435 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9436 }
9437 else
9438 {
9439 /* The frame pointer is a stable reference within the
9440 aligned frame. Use it. */
9441 gcc_checking_assert (cfun->machine->fs.fp_valid);
9442 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9443 cfun->machine->fs.fp_offset - cfa_offset);
9444 mem = gen_rtx_MEM (mode, addr);
9445 add_reg_note (insn, REG_CFA_EXPRESSION,
9446 gen_rtx_SET (VOIDmode, mem, reg));
9447 }
9448 }
9449
9450 /* The memory may not be relative to the current CFA register,
9451 which means that we may need to generate a new pattern for
9452 use by the unwind info. */
9453 else if (base != m->fs.cfa_reg)
9454 {
9455 addr = plus_constant (Pmode, m->fs.cfa_reg,
9456 m->fs.cfa_offset - cfa_offset);
9457 mem = gen_rtx_MEM (mode, addr);
9458 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9459 }
9460 }
9461
9462 /* Emit code to save registers using MOV insns.
9463 First register is stored at CFA - CFA_OFFSET. */
9464 static void
9465 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9466 {
9467 unsigned int regno;
9468
9469 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9470 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9471 {
9472 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9473 cfa_offset -= UNITS_PER_WORD;
9474 }
9475 }
9476
9477 /* Emit code to save SSE registers using MOV insns.
9478 First register is stored at CFA - CFA_OFFSET. */
9479 static void
9480 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9481 {
9482 unsigned int regno;
9483
9484 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9485 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9486 {
9487 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9488 cfa_offset -= 16;
9489 }
9490 }
9491
9492 static GTY(()) rtx queued_cfa_restores;
9493
9494 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9495 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9496 Don't add the note if the previously saved value will be left untouched
9497 within the stack red-zone until return, as unwinders can find the same value
9498 in the register and on the stack. */
9499
9500 static void
9501 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9502 {
9503 if (!crtl->shrink_wrapped
9504 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9505 return;
9506
9507 if (insn)
9508 {
9509 add_reg_note (insn, REG_CFA_RESTORE, reg);
9510 RTX_FRAME_RELATED_P (insn) = 1;
9511 }
9512 else
9513 queued_cfa_restores
9514 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9515 }
9516
9517 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9518
9519 static void
9520 ix86_add_queued_cfa_restore_notes (rtx insn)
9521 {
9522 rtx last;
9523 if (!queued_cfa_restores)
9524 return;
9525 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9526 ;
9527 XEXP (last, 1) = REG_NOTES (insn);
9528 REG_NOTES (insn) = queued_cfa_restores;
9529 queued_cfa_restores = NULL_RTX;
9530 RTX_FRAME_RELATED_P (insn) = 1;
9531 }
9532
9533 /* Expand prologue or epilogue stack adjustment.
9534 The pattern exists to put a dependency on all ebp-based memory accesses.
9535 STYLE should be negative if instructions should be marked as frame related,
9536 zero if the %r11 register is live and cannot be freely used, and positive
9537 otherwise. */
9538
9539 static void
9540 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9541 int style, bool set_cfa)
9542 {
9543 struct machine_function *m = cfun->machine;
9544 rtx insn;
9545 bool add_frame_related_expr = false;
9546
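  /* Adjust with a single add when the offset is a valid immediate for the add;
     otherwise materialize the offset in a scratch register first (%r11, or the
     frame pointer when %r11 is live).  */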
9547 if (Pmode == SImode)
9548 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9549 else if (x86_64_immediate_operand (offset, DImode))
9550 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9551 else
9552 {
9553 rtx tmp;
9554 /* r11 is used by indirect sibcall returns as well: it is set before the
9555 epilogue and used after the epilogue. */
9556 if (style)
9557 tmp = gen_rtx_REG (DImode, R11_REG);
9558 else
9559 {
9560 gcc_assert (src != hard_frame_pointer_rtx
9561 && dest != hard_frame_pointer_rtx);
9562 tmp = hard_frame_pointer_rtx;
9563 }
9564 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9565 if (style < 0)
9566 add_frame_related_expr = true;
9567
9568 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9569 }
9570
9571 insn = emit_insn (insn);
9572 if (style >= 0)
9573 ix86_add_queued_cfa_restore_notes (insn);
9574
9575 if (set_cfa)
9576 {
9577 rtx r;
9578
9579 gcc_assert (m->fs.cfa_reg == src);
9580 m->fs.cfa_offset += INTVAL (offset);
9581 m->fs.cfa_reg = dest;
9582
9583 r = gen_rtx_PLUS (Pmode, src, offset);
9584 r = gen_rtx_SET (VOIDmode, dest, r);
9585 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9586 RTX_FRAME_RELATED_P (insn) = 1;
9587 }
9588 else if (style < 0)
9589 {
9590 RTX_FRAME_RELATED_P (insn) = 1;
9591 if (add_frame_related_expr)
9592 {
9593 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9594 r = gen_rtx_SET (VOIDmode, dest, r);
9595 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9596 }
9597 }
9598
9599 if (dest == stack_pointer_rtx)
9600 {
9601 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9602 bool valid = m->fs.sp_valid;
9603
9604 if (src == hard_frame_pointer_rtx)
9605 {
9606 valid = m->fs.fp_valid;
9607 ooffset = m->fs.fp_offset;
9608 }
9609 else if (src == crtl->drap_reg)
9610 {
9611 valid = m->fs.drap_valid;
9612 ooffset = 0;
9613 }
9614 else
9615 {
9616 /* Else there are two possibilities: SP itself, which we set
9617 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9618 taken care of by hand along the eh_return path. */
9619 gcc_checking_assert (src == stack_pointer_rtx
9620 || offset == const0_rtx);
9621 }
9622
9623 m->fs.sp_offset = ooffset - INTVAL (offset);
9624 m->fs.sp_valid = valid;
9625 }
9626 }
9627
9628 /* Find an available register to be used as the dynamic realign argument
9629 pointer register. Such a register will be written in the prologue and
9630 used at the beginning of the body, so it must not be
9631 1. a parameter passing register, or
9632 2. the GOT pointer.
9633 We reuse the static-chain register if it is available. Otherwise, we
9634 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9635 shorter encoding.
9636
9637 Return: the regno of chosen register. */
9638
9639 static unsigned int
9640 find_drap_reg (void)
9641 {
9642 tree decl = cfun->decl;
9643
9644 if (TARGET_64BIT)
9645 {
9646 /* Use R13 for a nested function or a function that needs a static chain.
9647 Since a function with a tail call may use any caller-saved
9648 register in the epilogue, DRAP must not use a caller-saved
9649 register in such a case. */
9650 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9651 return R13_REG;
9652
9653 return R10_REG;
9654 }
9655 else
9656 {
9657 /* Use DI for a nested function or a function that needs a static chain.
9658 Since a function with a tail call may use any caller-saved
9659 register in the epilogue, DRAP must not use a caller-saved
9660 register in such a case. */
9661 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9662 return DI_REG;
9663
9664 /* Reuse static chain register if it isn't used for parameter
9665 passing. */
9666 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9667 {
9668 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9669 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9670 return CX_REG;
9671 }
9672 return DI_REG;
9673 }
9674 }
9675
9676 /* Return minimum incoming stack alignment. */
9677
9678 static unsigned int
9679 ix86_minimum_incoming_stack_boundary (bool sibcall)
9680 {
9681 unsigned int incoming_stack_boundary;
9682
9683 /* Prefer the one specified at command line. */
9684 if (ix86_user_incoming_stack_boundary)
9685 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9686 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9687 if -mstackrealign is used, this isn't a sibcall check, and the
9688 estimated stack alignment is 128 bits. */
9689 else if (!sibcall
9690 && !TARGET_64BIT
9691 && ix86_force_align_arg_pointer
9692 && crtl->stack_alignment_estimated == 128)
9693 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9694 else
9695 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9696
9697 /* Incoming stack alignment can be changed on individual functions
9698 via force_align_arg_pointer attribute. We use the smallest
9699 incoming stack boundary. */
9700 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9701 && lookup_attribute (ix86_force_align_arg_pointer_string,
9702 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9703 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9704
9705 /* The incoming stack frame has to be aligned at least at
9706 parm_stack_boundary. */
9707 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9708 incoming_stack_boundary = crtl->parm_stack_boundary;
9709
9710 /* The stack at the entrance of main is aligned by the runtime. We use the
9711 smallest incoming stack boundary. */
9712 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9713 && DECL_NAME (current_function_decl)
9714 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9715 && DECL_FILE_SCOPE_P (current_function_decl))
9716 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9717
9718 return incoming_stack_boundary;
9719 }
9720
9721 /* Update incoming stack boundary and estimated stack alignment. */
9722
9723 static void
9724 ix86_update_stack_boundary (void)
9725 {
9726 ix86_incoming_stack_boundary
9727 = ix86_minimum_incoming_stack_boundary (false);
9728
9729 /* The x86_64 varargs register save area needs 16-byte stack
9730 alignment. */
9731 if (TARGET_64BIT
9732 && cfun->stdarg
9733 && crtl->stack_alignment_estimated < 128)
9734 crtl->stack_alignment_estimated = 128;
9735 }
9736
9737 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9738 needed or an rtx for DRAP otherwise. */
9739
9740 static rtx
9741 ix86_get_drap_rtx (void)
9742 {
9743 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9744 crtl->need_drap = true;
9745
9746 if (stack_realign_drap)
9747 {
9748 /* Assign DRAP to vDRAP and return vDRAP. */
9749 unsigned int regno = find_drap_reg ();
9750 rtx drap_vreg;
9751 rtx arg_ptr;
9752 rtx seq, insn;
9753
9754 arg_ptr = gen_rtx_REG (Pmode, regno);
9755 crtl->drap_reg = arg_ptr;
9756
9757 start_sequence ();
9758 drap_vreg = copy_to_reg (arg_ptr);
9759 seq = get_insns ();
9760 end_sequence ();
9761
9762 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9763 if (!optimize)
9764 {
9765 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9766 RTX_FRAME_RELATED_P (insn) = 1;
9767 }
9768 return drap_vreg;
9769 }
9770 else
9771 return NULL;
9772 }
9773
9774 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9775
9776 static rtx
9777 ix86_internal_arg_pointer (void)
9778 {
9779 return virtual_incoming_args_rtx;
9780 }
9781
9782 struct scratch_reg {
9783 rtx reg;
9784 bool saved;
9785 };
9786
9787 /* Return a short-lived scratch register for use on function entry.
9788 In 32-bit mode, it is valid only after the registers are saved
9789 in the prologue. This register must be released by means of
9790 release_scratch_register_on_entry once it is dead. */
9791
9792 static void
9793 get_scratch_register_on_entry (struct scratch_reg *sr)
9794 {
9795 int regno;
9796
9797 sr->saved = false;
9798
9799 if (TARGET_64BIT)
9800 {
9801 /* We always use R11 in 64-bit mode. */
9802 regno = R11_REG;
9803 }
9804 else
9805 {
9806 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9807 bool fastcall_p
9808 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9809 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9810 int regparm = ix86_function_regparm (fntype, decl);
9811 int drap_regno
9812 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9813
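      /* Prefer a call-clobbered register that is not needed for argument
         passing (eax, then edx, then ecx); failing that, fall back to a
         callee-saved register that is saved anyway (ebx, esi, edi); as a
         last resort pick eax or edx and push/pop it around the use.  */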
9814 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9815 for the static chain register. */
9816 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9817 && drap_regno != AX_REG)
9818 regno = AX_REG;
9819 else if (regparm < 2 && drap_regno != DX_REG)
9820 regno = DX_REG;
9821 /* ecx is the static chain register. */
9822 else if (regparm < 3 && !fastcall_p && !static_chain_p
9823 && drap_regno != CX_REG)
9824 regno = CX_REG;
9825 else if (ix86_save_reg (BX_REG, true))
9826 regno = BX_REG;
9827 /* esi is the static chain register. */
9828 else if (!(regparm == 3 && static_chain_p)
9829 && ix86_save_reg (SI_REG, true))
9830 regno = SI_REG;
9831 else if (ix86_save_reg (DI_REG, true))
9832 regno = DI_REG;
9833 else
9834 {
9835 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9836 sr->saved = true;
9837 }
9838 }
9839
9840 sr->reg = gen_rtx_REG (Pmode, regno);
9841 if (sr->saved)
9842 {
9843 rtx insn = emit_insn (gen_push (sr->reg));
9844 RTX_FRAME_RELATED_P (insn) = 1;
9845 }
9846 }
9847
9848 /* Release a scratch register obtained from the preceding function. */
9849
9850 static void
9851 release_scratch_register_on_entry (struct scratch_reg *sr)
9852 {
9853 if (sr->saved)
9854 {
9855 rtx x, insn = emit_insn (gen_pop (sr->reg));
9856
9857 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9858 RTX_FRAME_RELATED_P (insn) = 1;
9859 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9860 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9861 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9862 }
9863 }
9864
9865 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
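/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   i.e. one probe per 4K page; the exponent is a target-configurable
   parameter, so that is only the typical value.  */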
9866
9867 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9868
9869 static void
9870 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9871 {
9872 /* We skip the probe for the first interval + a small dope of 4 words and
9873 probe that many bytes past the specified size to maintain a protection
9874 area at the bottom of the stack. */
9875 const int dope = 4 * UNITS_PER_WORD;
9876 rtx size_rtx = GEN_INT (size), last;
9877
9878 /* See if we have a constant small number of probes to generate. If so,
9879 that's the easy case. The run-time loop is made up of 11 insns in the
9880 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9881 for n # of intervals. */
9882 if (size <= 5 * PROBE_INTERVAL)
9883 {
9884 HOST_WIDE_INT i, adjust;
9885 bool first_probe = true;
9886
9887 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9888 values of N from 1 until it exceeds SIZE. If only one probe is
9889 needed, this will not generate any code. Then adjust and probe
9890 to PROBE_INTERVAL + SIZE. */
9891 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9892 {
9893 if (first_probe)
9894 {
9895 adjust = 2 * PROBE_INTERVAL + dope;
9896 first_probe = false;
9897 }
9898 else
9899 adjust = PROBE_INTERVAL;
9900
9901 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9902 plus_constant (Pmode, stack_pointer_rtx,
9903 -adjust)));
9904 emit_stack_probe (stack_pointer_rtx);
9905 }
9906
9907 if (first_probe)
9908 adjust = size + PROBE_INTERVAL + dope;
9909 else
9910 adjust = size + PROBE_INTERVAL - i;
9911
9912 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9913 plus_constant (Pmode, stack_pointer_rtx,
9914 -adjust)));
9915 emit_stack_probe (stack_pointer_rtx);
9916
9917 /* Adjust back to account for the additional first interval. */
9918 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9919 plus_constant (Pmode, stack_pointer_rtx,
9920 PROBE_INTERVAL + dope)));
9921 }
9922
9923 /* Otherwise, do the same as above, but in a loop. Note that we must be
9924 extra careful with variables wrapping around because we might be at
9925 the very top (or the very bottom) of the address space and we have
9926 to be able to handle this case properly; in particular, we use an
9927 equality test for the loop condition. */
9928 else
9929 {
9930 HOST_WIDE_INT rounded_size;
9931 struct scratch_reg sr;
9932
9933 get_scratch_register_on_entry (&sr);
9934
9935
9936 /* Step 1: round SIZE to the previous multiple of the interval. */
9937
9938 rounded_size = size & -PROBE_INTERVAL;
9939
9940
9941 /* Step 2: compute initial and final value of the loop counter. */
9942
9943 /* SP = SP_0 + PROBE_INTERVAL. */
9944 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9945 plus_constant (Pmode, stack_pointer_rtx,
9946 - (PROBE_INTERVAL + dope))));
9947
9948 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9949 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9950 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9951 gen_rtx_PLUS (Pmode, sr.reg,
9952 stack_pointer_rtx)));
9953
9954
9955 /* Step 3: the loop
9956
9957 while (SP != LAST_ADDR)
9958 {
9959 SP = SP + PROBE_INTERVAL
9960 probe at SP
9961 }
9962
9963 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9964 values of N from 1 until it is equal to ROUNDED_SIZE. */
9965
9966 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9967
9968
9969 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9970 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9971
9972 if (size != rounded_size)
9973 {
9974 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9975 plus_constant (Pmode, stack_pointer_rtx,
9976 rounded_size - size)));
9977 emit_stack_probe (stack_pointer_rtx);
9978 }
9979
9980 /* Adjust back to account for the additional first interval. */
9981 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9982 plus_constant (Pmode, stack_pointer_rtx,
9983 PROBE_INTERVAL + dope)));
9984
9985 release_scratch_register_on_entry (&sr);
9986 }
9987
9988 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9989
9990 /* Even if the stack pointer isn't the CFA register, we need to correctly
9991 describe the adjustments made to it, in particular differentiate the
9992 frame-related ones from the frame-unrelated ones. */
9993 if (size > 0)
9994 {
9995 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9996 XVECEXP (expr, 0, 0)
9997 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9998 plus_constant (Pmode, stack_pointer_rtx, -size));
9999 XVECEXP (expr, 0, 1)
10000 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10001 plus_constant (Pmode, stack_pointer_rtx,
10002 PROBE_INTERVAL + dope + size));
10003 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10004 RTX_FRAME_RELATED_P (last) = 1;
10005
10006 cfun->machine->fs.sp_offset += size;
10007 }
10008
10009 /* Make sure nothing is scheduled before we are done. */
10010 emit_insn (gen_blockage ());
10011 }
10012
10013 /* Adjust the stack pointer up to REG while probing it. */
10014
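/* The emitted loop has roughly this shape (AT&T syntax, 32-bit case):

     .LPSRLn:   cmpl    %reg, %esp
                je      .LPSREn
                subl    $PROBE_INTERVAL, %esp
                orl     $0, (%esp)
                jmp     .LPSRLn
     .LPSREn:                                                          */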
10015 const char *
10016 output_adjust_stack_and_probe (rtx reg)
10017 {
10018 static int labelno = 0;
10019 char loop_lab[32], end_lab[32];
10020 rtx xops[2];
10021
10022 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10023 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10024
10025 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10026
10027 /* Jump to END_LAB if SP == LAST_ADDR. */
10028 xops[0] = stack_pointer_rtx;
10029 xops[1] = reg;
10030 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10031 fputs ("\tje\t", asm_out_file);
10032 assemble_name_raw (asm_out_file, end_lab);
10033 fputc ('\n', asm_out_file);
10034
10035 /* SP = SP + PROBE_INTERVAL. */
10036 xops[1] = GEN_INT (PROBE_INTERVAL);
10037 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10038
10039 /* Probe at SP. */
10040 xops[1] = const0_rtx;
10041 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10042
10043 fprintf (asm_out_file, "\tjmp\t");
10044 assemble_name_raw (asm_out_file, loop_lab);
10045 fputc ('\n', asm_out_file);
10046
10047 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10048
10049 return "";
10050 }
10051
10052 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10053 inclusive. These are offsets from the current stack pointer. */
10054
10055 static void
10056 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10057 {
10058 /* See if we have a constant small number of probes to generate. If so,
10059 that's the easy case. The run-time loop is made up of 7 insns in the
10060 generic case while the compile-time loop is made up of n insns for n #
10061 of intervals. */
10062 if (size <= 7 * PROBE_INTERVAL)
10063 {
10064 HOST_WIDE_INT i;
10065
10066 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10067 it exceeds SIZE. If only one probe is needed, this will not
10068 generate any code. Then probe at FIRST + SIZE. */
10069 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10070 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10071 -(first + i)));
10072
10073 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10074 -(first + size)));
10075 }
10076
10077 /* Otherwise, do the same as above, but in a loop. Note that we must be
10078 extra careful with variables wrapping around because we might be at
10079 the very top (or the very bottom) of the address space and we have
10080 to be able to handle this case properly; in particular, we use an
10081 equality test for the loop condition. */
10082 else
10083 {
10084 HOST_WIDE_INT rounded_size, last;
10085 struct scratch_reg sr;
10086
10087 get_scratch_register_on_entry (&sr);
10088
10089
10090 /* Step 1: round SIZE to the previous multiple of the interval. */
10091
10092 rounded_size = size & -PROBE_INTERVAL;
10093
10094
10095 /* Step 2: compute initial and final value of the loop counter. */
10096
10097 /* TEST_OFFSET = FIRST. */
10098 emit_move_insn (sr.reg, GEN_INT (-first));
10099
10100 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10101 last = first + rounded_size;
10102
10103
10104 /* Step 3: the loop
10105
10106 while (TEST_ADDR != LAST_ADDR)
10107 {
10108 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10109 probe at TEST_ADDR
10110 }
10111
10112 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10113 until it is equal to ROUNDED_SIZE. */
10114
10115 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10116
10117
10118 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10119 that SIZE is equal to ROUNDED_SIZE. */
10120
10121 if (size != rounded_size)
10122 emit_stack_probe (plus_constant (Pmode,
10123 gen_rtx_PLUS (Pmode,
10124 stack_pointer_rtx,
10125 sr.reg),
10126 rounded_size - size));
10127
10128 release_scratch_register_on_entry (&sr);
10129 }
10130
10131 /* Make sure nothing is scheduled before we are done. */
10132 emit_insn (gen_blockage ());
10133 }
10134
10135 /* Probe a range of stack addresses from REG to END, inclusive. These are
10136 offsets from the current stack pointer. */
10137
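/* As in output_adjust_stack_and_probe, but the loop advances the (negative)
   offset register REG toward END and probes at (%esp,%reg), roughly:

     .LPSRLn:   cmpl    END, %reg
                je      .LPSREn
                subl    $PROBE_INTERVAL, %reg
                orl     $0, (%esp,%reg)
                jmp     .LPSRLn
     .LPSREn:                                                          */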
10138 const char *
10139 output_probe_stack_range (rtx reg, rtx end)
10140 {
10141 static int labelno = 0;
10142 char loop_lab[32], end_lab[32];
10143 rtx xops[3];
10144
10145 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10146 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10147
10148 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10149
10150 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10151 xops[0] = reg;
10152 xops[1] = end;
10153 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10154 fputs ("\tje\t", asm_out_file);
10155 assemble_name_raw (asm_out_file, end_lab);
10156 fputc ('\n', asm_out_file);
10157
10158 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10159 xops[1] = GEN_INT (PROBE_INTERVAL);
10160 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10161
10162 /* Probe at TEST_ADDR. */
10163 xops[0] = stack_pointer_rtx;
10164 xops[1] = reg;
10165 xops[2] = const0_rtx;
10166 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10167
10168 fprintf (asm_out_file, "\tjmp\t");
10169 assemble_name_raw (asm_out_file, loop_lab);
10170 fputc ('\n', asm_out_file);
10171
10172 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10173
10174 return "";
10175 }
10176
10177 /* Finalize the stack_realign_needed flag, which will guide the prologue and
10178 epilogue to be generated in the correct form. */
10179 static void
10180 ix86_finalize_stack_realign_flags (void)
10181 {
10182 /* Check if stack realignment is really needed after reload, and
10183 store the result in cfun. */
10184 unsigned int incoming_stack_boundary
10185 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10186 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10187 unsigned int stack_realign = (incoming_stack_boundary
10188 < (crtl->is_leaf
10189 ? crtl->max_used_stack_slot_alignment
10190 : crtl->stack_alignment_needed));
10191
10192 if (crtl->stack_realign_finalized)
10193 {
10194 /* After stack_realign_needed is finalized, we can no longer
10195 change it. */
10196 gcc_assert (crtl->stack_realign_needed == stack_realign);
10197 return;
10198 }
10199
10200 /* If the only reason for frame_pointer_needed is that we conservatively
10201 assumed stack realignment might be needed, but in the end nothing that
10202 needed the stack alignment had been spilled, clear frame_pointer_needed
10203 and say we don't need stack realignment. */
10204 if (stack_realign
10205 && !crtl->need_drap
10206 && frame_pointer_needed
10207 && crtl->is_leaf
10208 && flag_omit_frame_pointer
10209 && crtl->sp_is_unchanging
10210 && !ix86_current_function_calls_tls_descriptor
10211 && !crtl->accesses_prior_frames
10212 && !cfun->calls_alloca
10213 && !crtl->calls_eh_return
10214 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10215 && !ix86_frame_pointer_required ()
10216 && get_frame_size () == 0
10217 && ix86_nsaved_sseregs () == 0
10218 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10219 {
10220 HARD_REG_SET set_up_by_prologue, prologue_used;
10221 basic_block bb;
10222
10223 CLEAR_HARD_REG_SET (prologue_used);
10224 CLEAR_HARD_REG_SET (set_up_by_prologue);
10225 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10226 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10227 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10228 HARD_FRAME_POINTER_REGNUM);
10229 FOR_EACH_BB (bb)
10230 {
10231 rtx insn;
10232 FOR_BB_INSNS (bb, insn)
10233 if (NONDEBUG_INSN_P (insn)
10234 && requires_stack_frame_p (insn, prologue_used,
10235 set_up_by_prologue))
10236 {
10237 crtl->stack_realign_needed = stack_realign;
10238 crtl->stack_realign_finalized = true;
10239 return;
10240 }
10241 }
10242
10243 frame_pointer_needed = false;
10244 stack_realign = false;
10245 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10246 crtl->stack_alignment_needed = incoming_stack_boundary;
10247 crtl->stack_alignment_estimated = incoming_stack_boundary;
10248 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10249 crtl->preferred_stack_boundary = incoming_stack_boundary;
10250 df_finish_pass (true);
10251 df_scan_alloc (NULL);
10252 df_scan_blocks ();
10253 df_compute_regs_ever_live (true);
10254 df_analyze ();
10255 }
10256
10257 crtl->stack_realign_needed = stack_realign;
10258 crtl->stack_realign_finalized = true;
10259 }
10260
10261 /* Expand the prologue into a bunch of separate insns. */
10262
10263 void
10264 ix86_expand_prologue (void)
10265 {
10266 struct machine_function *m = cfun->machine;
10267 rtx insn, t;
10268 bool pic_reg_used;
10269 struct ix86_frame frame;
10270 HOST_WIDE_INT allocate;
10271 bool int_registers_saved;
10272 bool sse_registers_saved;
10273
10274 ix86_finalize_stack_realign_flags ();
10275
10276 /* DRAP should not coexist with stack_realign_fp */
10277 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10278
10279 memset (&m->fs, 0, sizeof (m->fs));
10280
10281 /* Initialize CFA state for before the prologue. */
10282 m->fs.cfa_reg = stack_pointer_rtx;
10283 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10284
10285 /* Track SP offset to the CFA. We continue tracking this after we've
10286 swapped the CFA register away from SP. In the case of re-alignment
10287 this is fudged; we're interested in offsets within the local frame. */
10288 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10289 m->fs.sp_valid = true;
10290
10291 ix86_compute_frame_layout (&frame);
10292
10293 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10294 {
10295 /* We should have already generated an error for any use of
10296 ms_hook on a nested function. */
10297 gcc_checking_assert (!ix86_static_chain_on_stack);
10298
10299 /* Check if profiling is active and whether we shall use the
10300 profiling-before-prologue variant. If so, sorry. */
10301 if (crtl->profile && flag_fentry != 0)
10302 sorry ("ms_hook_prologue attribute isn%'t compatible "
10303 "with -mfentry for 32-bit");
10304
10305 /* In ix86_asm_output_function_label we emitted:
10306 8b ff movl.s %edi,%edi
10307 55 push %ebp
10308 8b ec movl.s %esp,%ebp
10309
10310 This matches the hookable function prologue in Win32 API
10311 functions in Microsoft Windows XP Service Pack 2 and newer.
10312 Wine uses this to enable Windows apps to hook the Win32 API
10313 functions provided by Wine.
10314
10315 What that means is that we've already set up the frame pointer. */
10316
10317 if (frame_pointer_needed
10318 && !(crtl->drap_reg && crtl->stack_realign_needed))
10319 {
10320 rtx push, mov;
10321
10322 /* We've decided to use the frame pointer already set up.
10323 Describe this to the unwinder by pretending that both
10324 push and mov insns happen right here.
10325
10326 Putting the unwind info here at the end of the ms_hook
10327 is done so that we can make absolutely certain we get
10328 the required byte sequence at the start of the function,
10329 rather than relying on an assembler that can produce
10330 the exact encoding required.
10331
10332 However it does mean (in the unpatched case) that we have
10333 a 1 insn window where the asynchronous unwind info is
10334 incorrect. However, if we placed the unwind info at
10335 its correct location we would have incorrect unwind info
10336 in the patched case. Which is probably all moot since
10337 I don't expect Wine generates dwarf2 unwind info for the
10338 system libraries that use this feature. */
10339
10340 insn = emit_insn (gen_blockage ());
10341
10342 push = gen_push (hard_frame_pointer_rtx);
10343 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10344 stack_pointer_rtx);
10345 RTX_FRAME_RELATED_P (push) = 1;
10346 RTX_FRAME_RELATED_P (mov) = 1;
10347
10348 RTX_FRAME_RELATED_P (insn) = 1;
10349 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10350 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10351
10352 /* Note that gen_push incremented m->fs.cfa_offset, even
10353 though we didn't emit the push insn here. */
10354 m->fs.cfa_reg = hard_frame_pointer_rtx;
10355 m->fs.fp_offset = m->fs.cfa_offset;
10356 m->fs.fp_valid = true;
10357 }
10358 else
10359 {
10360 /* The frame pointer is not needed so pop %ebp again.
10361 This leaves us with a pristine state. */
10362 emit_insn (gen_pop (hard_frame_pointer_rtx));
10363 }
10364 }
10365
10366 /* The first insn of a function that accepts its static chain on the
10367 stack is to push the register that would be filled in by a direct
10368 call. This insn will be skipped by the trampoline. */
10369 else if (ix86_static_chain_on_stack)
10370 {
10371 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10372 emit_insn (gen_blockage ());
10373
10374 /* We don't want to interpret this push insn as a register save,
10375 only as a stack adjustment. The real copy of the register as
10376 a save will be done later, if needed. */
10377 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10378 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10379 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10380 RTX_FRAME_RELATED_P (insn) = 1;
10381 }
10382
10383 /* Emit prologue code to adjust the stack alignment and set up the DRAP,
10384 in case DRAP is needed and stack realignment is really needed after reload. */
10385 if (stack_realign_drap)
10386 {
10387 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10388
10389 /* Only need to push the parameter pointer reg if the callee must preserve it. */
10390 if (!call_used_regs[REGNO (crtl->drap_reg)])
10391 {
10392 /* Push arg pointer reg */
10393 insn = emit_insn (gen_push (crtl->drap_reg));
10394 RTX_FRAME_RELATED_P (insn) = 1;
10395 }
10396
10397 /* Grab the argument pointer. */
10398 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10399 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10400 RTX_FRAME_RELATED_P (insn) = 1;
10401 m->fs.cfa_reg = crtl->drap_reg;
10402 m->fs.cfa_offset = 0;
10403
10404 /* Align the stack. */
10405 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10406 stack_pointer_rtx,
10407 GEN_INT (-align_bytes)));
10408 RTX_FRAME_RELATED_P (insn) = 1;
10409
10410 /* Replicate the return address on the stack so that the return
10411 address can be reached via the (argp - 1) slot. This is needed
10412 to implement the RETURN_ADDR_RTX macro and the intrinsic function
10413 expand_builtin_return_addr, etc. */
10414 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10415 t = gen_frame_mem (word_mode, t);
10416 insn = emit_insn (gen_push (t));
10417 RTX_FRAME_RELATED_P (insn) = 1;
10418
10419 /* For the purposes of frame and register save area addressing,
10420 we've started over with a new frame. */
10421 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10422 m->fs.realigned = true;
10423 }
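/* As an illustrative sketch (assuming 64-bit, DRAP in %r10, a 32-byte
   alignment requirement, no static chain slot and a call-used DRAP
   register, so no extra pushes), the sequence emitted above is roughly:

       leaq  8(%rsp), %r10        # grab the argument pointer (the CFA)
       andq  $-32, %rsp           # align the stack
       pushq -8(%r10)             # replicate the return address  */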
10424
10425 int_registers_saved = (frame.nregs == 0);
10426 sse_registers_saved = (frame.nsseregs == 0);
10427
10428 if (frame_pointer_needed && !m->fs.fp_valid)
10429 {
10430 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10431 slower on all targets. Also sdb doesn't like it. */
10432 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10433 RTX_FRAME_RELATED_P (insn) = 1;
10434
10435 /* Push registers now, before setting the frame pointer
10436 on SEH target. */
10437 if (!int_registers_saved
10438 && TARGET_SEH
10439 && !frame.save_regs_using_mov)
10440 {
10441 ix86_emit_save_regs ();
10442 int_registers_saved = true;
10443 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10444 }
10445
10446 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10447 {
10448 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10449 RTX_FRAME_RELATED_P (insn) = 1;
10450
10451 if (m->fs.cfa_reg == stack_pointer_rtx)
10452 m->fs.cfa_reg = hard_frame_pointer_rtx;
10453 m->fs.fp_offset = m->fs.sp_offset;
10454 m->fs.fp_valid = true;
10455 }
10456 }
10457
10458 if (!int_registers_saved)
10459 {
10460 /* If saving registers via PUSH, do so now. */
10461 if (!frame.save_regs_using_mov)
10462 {
10463 ix86_emit_save_regs ();
10464 int_registers_saved = true;
10465 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10466 }
10467
10468 /* When using the red zone we may start register saving before allocating
10469 the stack frame, saving one cycle of the prologue. However, avoid
10470 doing this if we have to probe the stack; at least on x86_64 the
10471 stack probe can turn into a call that clobbers a red zone location. */
10472 else if (ix86_using_red_zone ()
10473 && (! TARGET_STACK_PROBE
10474 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10475 {
10476 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10477 int_registers_saved = true;
10478 }
10479 }
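/* To illustrate the red-zone case handled just above: on x86_64 the 128
   bytes below %rsp may be written freely, so (as a sketch, assuming %rbx
   and %r12 need saving) the prologue can emit

       movq  %rbx, -8(%rsp)
       movq  %r12, -16(%rsp)
       subq  $FRAME_SIZE, %rsp

   i.e. the register saves no longer have to wait for the stack
   adjustment.  The exact offsets depend on the frame layout.  */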
10480
10481 if (stack_realign_fp)
10482 {
10483 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10484 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10485
10486 /* The computation of the size of the re-aligned stack frame means
10487 that we must allocate the size of the register save area before
10488 performing the actual alignment. Otherwise we cannot guarantee
10489 that there's enough storage above the realignment point. */
10490 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10491 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10492 GEN_INT (m->fs.sp_offset
10493 - frame.sse_reg_save_offset),
10494 -1, false);
10495
10496 /* Align the stack. */
10497 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10498 stack_pointer_rtx,
10499 GEN_INT (-align_bytes)));
10500
10501 /* For the purposes of register save area addressing, the stack
10502 pointer is no longer valid. As for the value of sp_offset,
10503 see ix86_compute_frame_layout, which we need to match in order
10504 to pass verification of stack_pointer_offset at the end. */
10505 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10506 m->fs.sp_valid = false;
10507 }
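/* A rough sketch of what the realignment above produces (assuming a
   32-byte alignment requirement and an SSE save area below the GPR
   saves):

       subq  $<sse-save-area>, %rsp   # reserve the register save area first
       andq  $-32, %rsp               # then realign

   After this %rsp can no longer be used to address the frame; the frame
   pointer (via choose_baseaddr) takes over.  */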
10508
10509 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10510
10511 if (flag_stack_usage_info)
10512 {
10513 /* We start to count from ARG_POINTER. */
10514 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10515
10516 /* If it was realigned, take into account the fake frame. */
10517 if (stack_realign_drap)
10518 {
10519 if (ix86_static_chain_on_stack)
10520 stack_size += UNITS_PER_WORD;
10521
10522 if (!call_used_regs[REGNO (crtl->drap_reg)])
10523 stack_size += UNITS_PER_WORD;
10524
10525 /* This over-estimates by 1 minimal-stack-alignment-unit but
10526 mitigates that by counting in the new return address slot. */
10527 current_function_dynamic_stack_size
10528 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10529 }
10530
10531 current_function_static_stack_size = stack_size;
10532 }
10533
10534 /* On SEH target with very large frame size, allocate an area to save
10535 SSE registers (as the very large allocation won't be described). */
10536 if (TARGET_SEH
10537 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10538 && !sse_registers_saved)
10539 {
10540 HOST_WIDE_INT sse_size =
10541 frame.sse_reg_save_offset - frame.reg_save_offset;
10542
10543 gcc_assert (int_registers_saved);
10544
10545 /* No need to do stack checking as the area will be immediately
10546 written. */
10547 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10548 GEN_INT (-sse_size), -1,
10549 m->fs.cfa_reg == stack_pointer_rtx);
10550 allocate -= sse_size;
10551 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10552 sse_registers_saved = true;
10553 }
10554
10555 /* The stack has already been decremented by the instruction calling us,
10556 so probe if the size is non-negative to preserve the protection area. */
10557 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10558 {
10559 /* We expect the registers to be saved when probes are used. */
10560 gcc_assert (int_registers_saved);
10561
10562 if (STACK_CHECK_MOVING_SP)
10563 {
10564 ix86_adjust_stack_and_probe (allocate);
10565 allocate = 0;
10566 }
10567 else
10568 {
10569 HOST_WIDE_INT size = allocate;
10570
10571 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10572 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10573
10574 if (TARGET_STACK_PROBE)
10575 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10576 else
10577 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10578 }
10579 }
10580
10581 if (allocate == 0)
10582 ;
10583 else if (!ix86_target_stack_probe ()
10584 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10585 {
10586 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10587 GEN_INT (-allocate), -1,
10588 m->fs.cfa_reg == stack_pointer_rtx);
10589 }
10590 else
10591 {
10592 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10593 rtx r10 = NULL;
10594 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10595
10596 bool eax_live = false;
10597 bool r10_live = false;
10598
10599 if (TARGET_64BIT)
10600 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10601 if (!TARGET_64BIT_MS_ABI)
10602 eax_live = ix86_eax_live_at_start_p ();
10603
10604 if (eax_live)
10605 {
10606 emit_insn (gen_push (eax));
10607 allocate -= UNITS_PER_WORD;
10608 }
10609 if (r10_live)
10610 {
10611 r10 = gen_rtx_REG (Pmode, R10_REG);
10612 emit_insn (gen_push (r10));
10613 allocate -= UNITS_PER_WORD;
10614 }
10615
10616 emit_move_insn (eax, GEN_INT (allocate));
10617 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
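/* Illustrative shape of this path on a 64-bit Windows/MinGW target (an
   assumption; the exact probe worker varies by target and ABI):

       movl  $ALLOCATE, %eax
       call  ___chkstk_ms         # probe each page, %rsp left unchanged
       subq  %rax, %rsp           # the *_sub adjust-stack insn emitted below  */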
10618
10619 /* Use the fact that AX still contains ALLOCATE. */
10620 adjust_stack_insn = (Pmode == DImode
10621 ? gen_pro_epilogue_adjust_stack_di_sub
10622 : gen_pro_epilogue_adjust_stack_si_sub);
10623
10624 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10625 stack_pointer_rtx, eax));
10626
10627 /* Note that SEH directives need to continue tracking the stack
10628 pointer even after the frame pointer has been set up. */
10629 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10630 {
10631 if (m->fs.cfa_reg == stack_pointer_rtx)
10632 m->fs.cfa_offset += allocate;
10633
10634 RTX_FRAME_RELATED_P (insn) = 1;
10635 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10636 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10637 plus_constant (Pmode, stack_pointer_rtx,
10638 -allocate)));
10639 }
10640 m->fs.sp_offset += allocate;
10641
10642 if (r10_live && eax_live)
10643 {
10644 t = choose_baseaddr (m->fs.sp_offset - allocate);
10645 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10646 gen_frame_mem (word_mode, t));
10647 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10648 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10649 gen_frame_mem (word_mode, t));
10650 }
10651 else if (eax_live || r10_live)
10652 {
10653 t = choose_baseaddr (m->fs.sp_offset - allocate);
10654 emit_move_insn (gen_rtx_REG (word_mode,
10655 (eax_live ? AX_REG : R10_REG)),
10656 gen_frame_mem (word_mode, t));
10657 }
10658 }
10659 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10660
10661 /* If we haven't already set up the frame pointer, do so now. */
10662 if (frame_pointer_needed && !m->fs.fp_valid)
10663 {
10664 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10665 GEN_INT (frame.stack_pointer_offset
10666 - frame.hard_frame_pointer_offset));
10667 insn = emit_insn (insn);
10668 RTX_FRAME_RELATED_P (insn) = 1;
10669 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10670
10671 if (m->fs.cfa_reg == stack_pointer_rtx)
10672 m->fs.cfa_reg = hard_frame_pointer_rtx;
10673 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10674 m->fs.fp_valid = true;
10675 }
10676
10677 if (!int_registers_saved)
10678 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10679 if (!sse_registers_saved)
10680 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10681
10682 pic_reg_used = false;
10683 if (pic_offset_table_rtx
10684 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10685 || crtl->profile))
10686 {
10687 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10688
10689 if (alt_pic_reg_used != INVALID_REGNUM)
10690 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10691
10692 pic_reg_used = true;
10693 }
10694
10695 if (pic_reg_used)
10696 {
10697 if (TARGET_64BIT)
10698 {
10699 if (ix86_cmodel == CM_LARGE_PIC)
10700 {
10701 rtx label, tmp_reg;
10702
10703 gcc_assert (Pmode == DImode);
10704 label = gen_label_rtx ();
10705 emit_label (label);
10706 LABEL_PRESERVE_P (label) = 1;
10707 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10708 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10709 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10710 label));
10711 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10712 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10713 pic_offset_table_rtx, tmp_reg));
10714 }
10715 else
10716 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10717 }
10718 else
10719 {
10720 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10721 RTX_FRAME_RELATED_P (insn) = 1;
10722 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10723 }
10724 }
10725
10726 /* In the pic_reg_used case, make sure that the got load isn't deleted
10727 when mcount needs it. Blockage to avoid call movement across mcount
10728 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10729 note. */
10730 if (crtl->profile && !flag_fentry && pic_reg_used)
10731 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10732
10733 if (crtl->drap_reg && !crtl->stack_realign_needed)
10734 {
10735 /* vDRAP is set up, but after reload it turns out stack realignment
10736 isn't necessary; here we emit the prologue to set up DRAP
10737 without the stack realignment adjustment. */
10738 t = choose_baseaddr (0);
10739 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10740 }
10741
10742 /* Prevent instructions from being scheduled into register save push
10743 sequence when access to the redzone area is done through frame pointer.
10744 The offset between the frame pointer and the stack pointer is calculated
10745 relative to the value of the stack pointer at the end of the function
10746 prologue, and moving instructions that access redzone area via frame
10747 pointer inside push sequence violates this assumption. */
10748 if (frame_pointer_needed && frame.red_zone_size)
10749 emit_insn (gen_memory_blockage ());
10750
10751 /* Emit cld instruction if stringops are used in the function. */
10752 if (TARGET_CLD && ix86_current_function_needs_cld)
10753 emit_insn (gen_cld ());
10754
10755 /* SEH requires that the prologue end within 256 bytes of the start of
10756 the function. Prevent instruction schedules that would extend that.
10757 Further, prevent alloca modifications to the stack pointer from being
10758 combined with prologue modifications. */
10759 if (TARGET_SEH)
10760 emit_insn (gen_prologue_use (stack_pointer_rtx));
10761 }
10762
10763 /* Emit code to restore REG using a POP insn. */
10764
10765 static void
10766 ix86_emit_restore_reg_using_pop (rtx reg)
10767 {
10768 struct machine_function *m = cfun->machine;
10769 rtx insn = emit_insn (gen_pop (reg));
10770
10771 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10772 m->fs.sp_offset -= UNITS_PER_WORD;
10773
10774 if (m->fs.cfa_reg == crtl->drap_reg
10775 && REGNO (reg) == REGNO (crtl->drap_reg))
10776 {
10777 /* Previously we'd represented the CFA as an expression
10778 like *(%ebp - 8). We've just popped that value from
10779 the stack, which means we need to reset the CFA to
10780 the drap register. This will remain until we restore
10781 the stack pointer. */
10782 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10783 RTX_FRAME_RELATED_P (insn) = 1;
10784
10785 /* This means that the DRAP register is valid for addressing too. */
10786 m->fs.drap_valid = true;
10787 return;
10788 }
10789
10790 if (m->fs.cfa_reg == stack_pointer_rtx)
10791 {
10792 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10793 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10794 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10795 RTX_FRAME_RELATED_P (insn) = 1;
10796
10797 m->fs.cfa_offset -= UNITS_PER_WORD;
10798 }
10799
10800 /* When the frame pointer is the CFA, and we pop it, we are
10801 swapping back to the stack pointer as the CFA. This happens
10802 for stack frames that don't allocate other data, so we assume
10803 the stack pointer is now pointing at the return address, i.e.
10804 the function entry state, which makes the offset be 1 word. */
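/* For example, on a 32-bit frame that only saved %ebp, the CFA was
   %ebp + 8 before this pop; afterwards %esp points at the return
   address, so the CFA becomes %esp + 4 (one word).  */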
10805 if (reg == hard_frame_pointer_rtx)
10806 {
10807 m->fs.fp_valid = false;
10808 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10809 {
10810 m->fs.cfa_reg = stack_pointer_rtx;
10811 m->fs.cfa_offset -= UNITS_PER_WORD;
10812
10813 add_reg_note (insn, REG_CFA_DEF_CFA,
10814 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10815 GEN_INT (m->fs.cfa_offset)));
10816 RTX_FRAME_RELATED_P (insn) = 1;
10817 }
10818 }
10819 }
10820
10821 /* Emit code to restore saved registers using POP insns. */
10822
10823 static void
10824 ix86_emit_restore_regs_using_pop (void)
10825 {
10826 unsigned int regno;
10827
10828 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10829 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10830 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10831 }
10832
10833 /* Emit code and notes for the LEAVE instruction. */
10834
10835 static void
10836 ix86_emit_leave (void)
10837 {
10838 struct machine_function *m = cfun->machine;
10839 rtx insn = emit_insn (ix86_gen_leave ());
10840
10841 ix86_add_queued_cfa_restore_notes (insn);
10842
10843 gcc_assert (m->fs.fp_valid);
10844 m->fs.sp_valid = true;
10845 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10846 m->fs.fp_valid = false;
10847
10848 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10849 {
10850 m->fs.cfa_reg = stack_pointer_rtx;
10851 m->fs.cfa_offset = m->fs.sp_offset;
10852
10853 add_reg_note (insn, REG_CFA_DEF_CFA,
10854 plus_constant (Pmode, stack_pointer_rtx,
10855 m->fs.sp_offset));
10856 RTX_FRAME_RELATED_P (insn) = 1;
10857 }
10858 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10859 m->fs.fp_offset);
10860 }
10861
10862 /* Emit code to restore saved registers using MOV insns.
10863 First register is restored from CFA - CFA_OFFSET. */
10864 static void
10865 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10866 bool maybe_eh_return)
10867 {
10868 struct machine_function *m = cfun->machine;
10869 unsigned int regno;
10870
10871 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10872 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10873 {
10874 rtx reg = gen_rtx_REG (word_mode, regno);
10875 rtx insn, mem;
10876
10877 mem = choose_baseaddr (cfa_offset);
10878 mem = gen_frame_mem (word_mode, mem);
10879 insn = emit_move_insn (reg, mem);
10880
10881 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10882 {
10883 /* Previously we'd represented the CFA as an expression
10884 like *(%ebp - 8). We've just popped that value from
10885 the stack, which means we need to reset the CFA to
10886 the drap register. This will remain until we restore
10887 the stack pointer. */
10888 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10889 RTX_FRAME_RELATED_P (insn) = 1;
10890
10891 /* This means that the DRAP register is valid for addressing. */
10892 m->fs.drap_valid = true;
10893 }
10894 else
10895 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10896
10897 cfa_offset -= UNITS_PER_WORD;
10898 }
10899 }
10900
10901 /* Emit code to restore saved registers using MOV insns.
10902 First register is restored from CFA - CFA_OFFSET. */
10903 static void
10904 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10905 bool maybe_eh_return)
10906 {
10907 unsigned int regno;
10908
10909 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10910 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10911 {
10912 rtx reg = gen_rtx_REG (V4SFmode, regno);
10913 rtx mem;
10914
10915 mem = choose_baseaddr (cfa_offset);
10916 mem = gen_rtx_MEM (V4SFmode, mem);
10917 set_mem_align (mem, 128);
10918 emit_move_insn (reg, mem);
10919
10920 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10921
10922 cfa_offset -= 16;
10923 }
10924 }
10925
10926 /* Emit vzeroupper if needed. */
10927
10928 void
10929 ix86_maybe_emit_epilogue_vzeroupper (void)
10930 {
10931 if (TARGET_VZEROUPPER
10932 && !TREE_THIS_VOLATILE (cfun->decl)
10933 && !cfun->machine->caller_return_avx256_p)
10934 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10935 }
10936
10937 /* Restore function stack, frame, and registers. */
10938
10939 void
10940 ix86_expand_epilogue (int style)
10941 {
10942 struct machine_function *m = cfun->machine;
10943 struct machine_frame_state frame_state_save = m->fs;
10944 struct ix86_frame frame;
10945 bool restore_regs_via_mov;
10946 bool using_drap;
10947
10948 ix86_finalize_stack_realign_flags ();
10949 ix86_compute_frame_layout (&frame);
10950
10951 m->fs.sp_valid = (!frame_pointer_needed
10952 || (crtl->sp_is_unchanging
10953 && !stack_realign_fp));
10954 gcc_assert (!m->fs.sp_valid
10955 || m->fs.sp_offset == frame.stack_pointer_offset);
10956
10957 /* The FP must be valid if the frame pointer is present. */
10958 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10959 gcc_assert (!m->fs.fp_valid
10960 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10961
10962 /* We must have *some* valid pointer to the stack frame. */
10963 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10964
10965 /* The DRAP is never valid at this point. */
10966 gcc_assert (!m->fs.drap_valid);
10967
10968 /* See the comment about red zone and frame
10969 pointer usage in ix86_expand_prologue. */
10970 if (frame_pointer_needed && frame.red_zone_size)
10971 emit_insn (gen_memory_blockage ());
10972
10973 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10974 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10975
10976 /* Determine the CFA offset of the end of the red-zone. */
10977 m->fs.red_zone_offset = 0;
10978 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10979 {
10980 /* The red-zone begins below the return address. */
10981 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10982
10983 /* When the register save area is in the aligned portion of
10984 the stack, determine the maximum runtime displacement that
10985 matches up with the aligned frame. */
10986 if (stack_realign_drap)
10987 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10988 + UNITS_PER_WORD);
10989 }
10990
10991 /* Special care must be taken for the normal return case of a function
10992 using eh_return: the eax and edx registers are marked as saved, but
10993 not restored along this path. Adjust the save location to match. */
10994 if (crtl->calls_eh_return && style != 2)
10995 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10996
10997 /* EH_RETURN requires the use of moves to function properly. */
10998 if (crtl->calls_eh_return)
10999 restore_regs_via_mov = true;
11000 /* SEH requires the use of pops to identify the epilogue. */
11001 else if (TARGET_SEH)
11002 restore_regs_via_mov = false;
11003 /* If we're only restoring one register and sp is not valid then
11004 use a move instruction to restore the register, since it's
11005 less work than reloading sp and popping the register. */
11006 else if (!m->fs.sp_valid && frame.nregs <= 1)
11007 restore_regs_via_mov = true;
11008 else if (TARGET_EPILOGUE_USING_MOVE
11009 && cfun->machine->use_fast_prologue_epilogue
11010 && (frame.nregs > 1
11011 || m->fs.sp_offset != frame.reg_save_offset))
11012 restore_regs_via_mov = true;
11013 else if (frame_pointer_needed
11014 && !frame.nregs
11015 && m->fs.sp_offset != frame.reg_save_offset)
11016 restore_regs_via_mov = true;
11017 else if (frame_pointer_needed
11018 && TARGET_USE_LEAVE
11019 && cfun->machine->use_fast_prologue_epilogue
11020 && frame.nregs == 1)
11021 restore_regs_via_mov = true;
11022 else
11023 restore_regs_via_mov = false;
11024
11025 if (restore_regs_via_mov || frame.nsseregs)
11026 {
11027 /* Ensure that the entire register save area is addressable via
11028 the stack pointer, if we will restore via sp. */
11029 if (TARGET_64BIT
11030 && m->fs.sp_offset > 0x7fffffff
11031 && !(m->fs.fp_valid || m->fs.drap_valid)
11032 && (frame.nsseregs + frame.nregs) != 0)
11033 {
11034 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11035 GEN_INT (m->fs.sp_offset
11036 - frame.sse_reg_save_offset),
11037 style,
11038 m->fs.cfa_reg == stack_pointer_rtx);
11039 }
11040 }
11041
11042 /* If there are any SSE registers to restore, then we have to do it
11043 via moves, since there's obviously no pop for SSE regs. */
11044 if (frame.nsseregs)
11045 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11046 style == 2);
11047
11048 if (restore_regs_via_mov)
11049 {
11050 rtx t;
11051
11052 if (frame.nregs)
11053 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11054
11055 /* eh_return epilogues need %ecx added to the stack pointer. */
11056 if (style == 2)
11057 {
11058 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11059
11060 /* Stack align doesn't work with eh_return. */
11061 gcc_assert (!stack_realign_drap);
11062 /* Neither do regparm nested functions. */
11063 gcc_assert (!ix86_static_chain_on_stack);
11064
11065 if (frame_pointer_needed)
11066 {
11067 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11068 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11069 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11070
11071 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11072 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11073
11074 /* Note that we use SA as a temporary CFA, as the return
11075 address is at the proper place relative to it. We
11076 pretend this happens at the FP restore insn because
11077 prior to this insn the FP would be stored at the wrong
11078 offset relative to SA, and after this insn we have no
11079 other reasonable register to use for the CFA. We don't
11080 bother resetting the CFA to the SP for the duration of
11081 the return insn. */
11082 add_reg_note (insn, REG_CFA_DEF_CFA,
11083 plus_constant (Pmode, sa, UNITS_PER_WORD));
11084 ix86_add_queued_cfa_restore_notes (insn);
11085 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11086 RTX_FRAME_RELATED_P (insn) = 1;
11087
11088 m->fs.cfa_reg = sa;
11089 m->fs.cfa_offset = UNITS_PER_WORD;
11090 m->fs.fp_valid = false;
11091
11092 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11093 const0_rtx, style, false);
11094 }
11095 else
11096 {
11097 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11098 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11099 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11100 ix86_add_queued_cfa_restore_notes (insn);
11101
11102 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11103 if (m->fs.cfa_offset != UNITS_PER_WORD)
11104 {
11105 m->fs.cfa_offset = UNITS_PER_WORD;
11106 add_reg_note (insn, REG_CFA_DEF_CFA,
11107 plus_constant (Pmode, stack_pointer_rtx,
11108 UNITS_PER_WORD));
11109 RTX_FRAME_RELATED_P (insn) = 1;
11110 }
11111 }
11112 m->fs.sp_offset = UNITS_PER_WORD;
11113 m->fs.sp_valid = true;
11114 }
11115 }
11116 else
11117 {
11118 /* SEH requires that the function end with (1) a stack adjustment
11119 if necessary, (2) a sequence of pops, and (3) a return or
11120 jump instruction. Prevent insns from the function body from
11121 being scheduled into this sequence. */
11122 if (TARGET_SEH)
11123 {
11124 /* Prevent a catch region from being adjacent to the standard
11125 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11126 several other flags that would be interesting to test are
11127 set up yet. */
11128 if (flag_non_call_exceptions)
11129 emit_insn (gen_nops (const1_rtx));
11130 else
11131 emit_insn (gen_blockage ());
11132 }
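/* A sketch of the epilogue shape SEH expects (assuming two call-saved
   registers and no frame-pointer restore in the way):

       addq  $FRAME_SIZE, %rsp
       popq  %rsi
       popq  %rdi
       ret  */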
11133
11134 /* First step is to deallocate the stack frame so that we can
11135 pop the registers. Also do it on SEH target for very large
11136 frame as the emitted instructions aren't allowed by the ABI in
11137 epilogues. */
11138 if (!m->fs.sp_valid
11139 || (TARGET_SEH
11140 && (m->fs.sp_offset - frame.reg_save_offset
11141 >= SEH_MAX_FRAME_SIZE)))
11142 {
11143 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11144 GEN_INT (m->fs.fp_offset
11145 - frame.reg_save_offset),
11146 style, false);
11147 }
11148 else if (m->fs.sp_offset != frame.reg_save_offset)
11149 {
11150 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11151 GEN_INT (m->fs.sp_offset
11152 - frame.reg_save_offset),
11153 style,
11154 m->fs.cfa_reg == stack_pointer_rtx);
11155 }
11156
11157 ix86_emit_restore_regs_using_pop ();
11158 }
11159
11160 /* If we used a frame pointer and haven't already got rid of it,
11161 then do so now. */
11162 if (m->fs.fp_valid)
11163 {
11164 /* If the stack pointer is valid and pointing at the frame
11165 pointer store address, then we only need a pop. */
11166 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11167 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11168 /* Leave results in shorter dependency chains on CPUs that are
11169 able to grok it fast. */
11170 else if (TARGET_USE_LEAVE
11171 || optimize_function_for_size_p (cfun)
11172 || !cfun->machine->use_fast_prologue_epilogue)
11173 ix86_emit_leave ();
11174 else
11175 {
11176 pro_epilogue_adjust_stack (stack_pointer_rtx,
11177 hard_frame_pointer_rtx,
11178 const0_rtx, style, !using_drap);
11179 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11180 }
11181 }
11182
11183 if (using_drap)
11184 {
11185 int param_ptr_offset = UNITS_PER_WORD;
11186 rtx insn;
11187
11188 gcc_assert (stack_realign_drap);
11189
11190 if (ix86_static_chain_on_stack)
11191 param_ptr_offset += UNITS_PER_WORD;
11192 if (!call_used_regs[REGNO (crtl->drap_reg)])
11193 param_ptr_offset += UNITS_PER_WORD;
11194
11195 insn = emit_insn (gen_rtx_SET
11196 (VOIDmode, stack_pointer_rtx,
11197 gen_rtx_PLUS (Pmode,
11198 crtl->drap_reg,
11199 GEN_INT (-param_ptr_offset))));
11200 m->fs.cfa_reg = stack_pointer_rtx;
11201 m->fs.cfa_offset = param_ptr_offset;
11202 m->fs.sp_offset = param_ptr_offset;
11203 m->fs.realigned = false;
11204
11205 add_reg_note (insn, REG_CFA_DEF_CFA,
11206 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11207 GEN_INT (param_ptr_offset)));
11208 RTX_FRAME_RELATED_P (insn) = 1;
11209
11210 if (!call_used_regs[REGNO (crtl->drap_reg)])
11211 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11212 }
11213
11214 /* At this point the stack pointer must be valid, and we must have
11215 restored all of the registers. We may not have deallocated the
11216 entire stack frame. We've delayed this until now because it may
11217 be possible to merge the local stack deallocation with the
11218 deallocation forced by ix86_static_chain_on_stack. */
11219 gcc_assert (m->fs.sp_valid);
11220 gcc_assert (!m->fs.fp_valid);
11221 gcc_assert (!m->fs.realigned);
11222 if (m->fs.sp_offset != UNITS_PER_WORD)
11223 {
11224 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11225 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11226 style, true);
11227 }
11228 else
11229 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11230
11231 /* Sibcall epilogues don't want a return instruction. */
11232 if (style == 0)
11233 {
11234 m->fs = frame_state_save;
11235 return;
11236 }
11237
11238 /* Emit vzeroupper if needed. */
11239 ix86_maybe_emit_epilogue_vzeroupper ();
11240
11241 if (crtl->args.pops_args && crtl->args.size)
11242 {
11243 rtx popc = GEN_INT (crtl->args.pops_args);
11244
11245 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11246 address, do explicit add, and jump indirectly to the caller. */
11247
11248 if (crtl->args.pops_args >= 65536)
11249 {
11250 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11251 rtx insn;
11252
11253 /* There is no "pascal" calling convention in any 64bit ABI. */
11254 gcc_assert (!TARGET_64BIT);
11255
11256 insn = emit_insn (gen_pop (ecx));
11257 m->fs.cfa_offset -= UNITS_PER_WORD;
11258 m->fs.sp_offset -= UNITS_PER_WORD;
11259
11260 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11261 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11262 add_reg_note (insn, REG_CFA_REGISTER,
11263 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11264 RTX_FRAME_RELATED_P (insn) = 1;
11265
11266 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11267 popc, -1, true);
11268 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11269 }
11270 else
11271 emit_jump_insn (gen_simple_return_pop_internal (popc));
11272 }
11273 else
11274 emit_jump_insn (gen_simple_return_internal ());
11275
11276 /* Restore the state back to the state from the prologue,
11277 so that it's correct for the next epilogue. */
11278 m->fs = frame_state_save;
11279 }
11280
11281 /* Reset from the function's potential modifications. */
11282
11283 static void
11284 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11285 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11286 {
11287 if (pic_offset_table_rtx)
11288 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11289 #if TARGET_MACHO
11290 /* Mach-O doesn't support labels at the end of objects, so if
11291 it looks like we might want one, insert a NOP. */
11292 {
11293 rtx insn = get_last_insn ();
11294 rtx deleted_debug_label = NULL_RTX;
11295 while (insn
11296 && NOTE_P (insn)
11297 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11298 {
11299 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11300 notes only, instead set their CODE_LABEL_NUMBER to -1,
11301 otherwise there would be code generation differences
11302 in between -g and -g0. */
11303 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11304 deleted_debug_label = insn;
11305 insn = PREV_INSN (insn);
11306 }
11307 if (insn
11308 && (LABEL_P (insn)
11309 || (NOTE_P (insn)
11310 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11311 fputs ("\tnop\n", file);
11312 else if (deleted_debug_label)
11313 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11314 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11315 CODE_LABEL_NUMBER (insn) = -1;
11316 }
11317 #endif
11318
11319 }
11320
11321 /* Return a scratch register to use in the split stack prologue. The
11322 split stack prologue is used for -fsplit-stack. It consists of the first
11323 instructions in the function, even before the regular prologue.
11324 The scratch register can be any caller-saved register which is not
11325 used for parameters or for the static chain. */
11326
11327 static unsigned int
11328 split_stack_prologue_scratch_regno (void)
11329 {
11330 if (TARGET_64BIT)
11331 return R11_REG;
11332 else
11333 {
11334 bool is_fastcall;
11335 int regparm;
11336
11337 is_fastcall = (lookup_attribute ("fastcall",
11338 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11339 != NULL);
11340 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11341
11342 if (is_fastcall)
11343 {
11344 if (DECL_STATIC_CHAIN (cfun->decl))
11345 {
11346 sorry ("-fsplit-stack does not support fastcall with "
11347 "nested function");
11348 return INVALID_REGNUM;
11349 }
11350 return AX_REG;
11351 }
11352 else if (regparm < 3)
11353 {
11354 if (!DECL_STATIC_CHAIN (cfun->decl))
11355 return CX_REG;
11356 else
11357 {
11358 if (regparm >= 2)
11359 {
11360 sorry ("-fsplit-stack does not support 2 register "
11361 "parameters for a nested function");
11362 return INVALID_REGNUM;
11363 }
11364 return DX_REG;
11365 }
11366 }
11367 else
11368 {
11369 /* FIXME: We could make this work by pushing a register
11370 around the addition and comparison. */
11371 sorry ("-fsplit-stack does not support 3 register parameters");
11372 return INVALID_REGNUM;
11373 }
11374 }
11375 }
11376
11377 /* A SYMBOL_REF for the function which allocates new stack space for
11378 -fsplit-stack. */
11379
11380 static GTY(()) rtx split_stack_fn;
11381
11382 /* A SYMBOL_REF for the more-stack function to call when using the
11383 large model. */
11384
11385 static GTY(()) rtx split_stack_fn_large;
11386
11387 /* Handle -fsplit-stack. These are the first instructions in the
11388 function, even before the regular prologue. */
11389
11390 void
11391 ix86_expand_split_stack_prologue (void)
11392 {
11393 struct ix86_frame frame;
11394 HOST_WIDE_INT allocate;
11395 unsigned HOST_WIDE_INT args_size;
11396 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11397 rtx scratch_reg = NULL_RTX;
11398 rtx varargs_label = NULL_RTX;
11399 rtx fn;
11400
11401 gcc_assert (flag_split_stack && reload_completed);
11402
11403 ix86_finalize_stack_realign_flags ();
11404 ix86_compute_frame_layout (&frame);
11405 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11406
11407 /* This is the label we will branch to if we have enough stack
11408 space. We expect the basic block reordering pass to reverse this
11409 branch if optimizing, so that we branch in the unlikely case. */
11410 label = gen_label_rtx ();
11411
11412 /* We need to compare the stack pointer minus the frame size with
11413 the stack boundary in the TCB. The stack boundary always gives
11414 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11415 can compare directly. Otherwise we need to do an addition. */
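/* As an illustration, on x86_64 GNU/Linux (where the split-stack
   boundary is assumed to live at %fs:0x70 in the TCB) the check emitted
   below is roughly:

       leaq  -FRAME_SIZE(%rsp), %r11
       cmpq  %fs:0x70, %r11
       jae   .Lhave_enough_stack      # the "label" branch below
       ...push args / call __morestack...  */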
11416
11417 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11418 UNSPEC_STACK_CHECK);
11419 limit = gen_rtx_CONST (Pmode, limit);
11420 limit = gen_rtx_MEM (Pmode, limit);
11421 if (allocate < SPLIT_STACK_AVAILABLE)
11422 current = stack_pointer_rtx;
11423 else
11424 {
11425 unsigned int scratch_regno;
11426 rtx offset;
11427
11428 /* We need a scratch register to hold the stack pointer minus
11429 the required frame size. Since this is the very start of the
11430 function, the scratch register can be any caller-saved
11431 register which is not used for parameters. */
11432 offset = GEN_INT (- allocate);
11433 scratch_regno = split_stack_prologue_scratch_regno ();
11434 if (scratch_regno == INVALID_REGNUM)
11435 return;
11436 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11437 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11438 {
11439 /* We don't use ix86_gen_add3 in this case because it will
11440 want to split to lea, but when not optimizing the insn
11441 will not be split after this point. */
11442 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11443 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11444 offset)));
11445 }
11446 else
11447 {
11448 emit_move_insn (scratch_reg, offset);
11449 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11450 stack_pointer_rtx));
11451 }
11452 current = scratch_reg;
11453 }
11454
11455 ix86_expand_branch (GEU, current, limit, label);
11456 jump_insn = get_last_insn ();
11457 JUMP_LABEL (jump_insn) = label;
11458
11459 /* Mark the jump as very likely to be taken. */
11460 add_reg_note (jump_insn, REG_BR_PROB,
11461 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
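/* With REG_BR_PROB_BASE being 10000, this records a probability of
   9900, i.e. the branch over the __morestack call is predicted taken
   99% of the time.  */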
11462
11463 if (split_stack_fn == NULL_RTX)
11464 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11465 fn = split_stack_fn;
11466
11467 /* Get more stack space. We pass in the desired stack space and the
11468 size of the arguments to copy to the new stack. In 32-bit mode
11469 we push the parameters; __morestack will return on a new stack
11470 anyhow. In 64-bit mode we pass the parameters in r10 and
11471 r11. */
11472 allocate_rtx = GEN_INT (allocate);
11473 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11474 call_fusage = NULL_RTX;
11475 if (TARGET_64BIT)
11476 {
11477 rtx reg10, reg11;
11478
11479 reg10 = gen_rtx_REG (Pmode, R10_REG);
11480 reg11 = gen_rtx_REG (Pmode, R11_REG);
11481
11482 /* If this function uses a static chain, it will be in %r10.
11483 Preserve it across the call to __morestack. */
11484 if (DECL_STATIC_CHAIN (cfun->decl))
11485 {
11486 rtx rax;
11487
11488 rax = gen_rtx_REG (word_mode, AX_REG);
11489 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11490 use_reg (&call_fusage, rax);
11491 }
11492
11493 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11494 {
11495 HOST_WIDE_INT argval;
11496
11497 gcc_assert (Pmode == DImode);
11498 /* When using the large model we need to load the address
11499 into a register, and we've run out of registers. So we
11500 switch to a different calling convention, and we call a
11501 different function: __morestack_large. We pass the
11502 argument size in the upper 32 bits of r10 and pass the
11503 frame size in the lower 32 bits. */
11504 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11505 gcc_assert ((args_size & 0xffffffff) == args_size);
11506
11507 if (split_stack_fn_large == NULL_RTX)
11508 split_stack_fn_large =
11509 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11510
11511 if (ix86_cmodel == CM_LARGE_PIC)
11512 {
11513 rtx label, x;
11514
11515 label = gen_label_rtx ();
11516 emit_label (label);
11517 LABEL_PRESERVE_P (label) = 1;
11518 emit_insn (gen_set_rip_rex64 (reg10, label));
11519 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11520 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11521 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11522 UNSPEC_GOT);
11523 x = gen_rtx_CONST (Pmode, x);
11524 emit_move_insn (reg11, x);
11525 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11526 x = gen_const_mem (Pmode, x);
11527 emit_move_insn (reg11, x);
11528 }
11529 else
11530 emit_move_insn (reg11, split_stack_fn_large);
11531
11532 fn = reg11;
11533
11534 argval = ((args_size << 16) << 16) + allocate;
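/* For example, args_size == 0x20 and allocate == 0x1000 gives
   argval == 0x0000002000001000: the frame size in the low 32 bits and
   the argument size in the high 32 bits.  */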
11535 emit_move_insn (reg10, GEN_INT (argval));
11536 }
11537 else
11538 {
11539 emit_move_insn (reg10, allocate_rtx);
11540 emit_move_insn (reg11, GEN_INT (args_size));
11541 use_reg (&call_fusage, reg11);
11542 }
11543
11544 use_reg (&call_fusage, reg10);
11545 }
11546 else
11547 {
11548 emit_insn (gen_push (GEN_INT (args_size)));
11549 emit_insn (gen_push (allocate_rtx));
11550 }
11551 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11552 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11553 NULL_RTX, false);
11554 add_function_usage_to (call_insn, call_fusage);
11555
11556 /* In order to make call/return prediction work right, we now need
11557 to execute a return instruction. See
11558 libgcc/config/i386/morestack.S for the details on how this works.
11559
11560 For flow purposes gcc must not see this as a return
11561 instruction--we need control flow to continue at the subsequent
11562 label. Therefore, we use an unspec. */
11563 gcc_assert (crtl->args.pops_args < 65536);
11564 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11565
11566 /* If we are in 64-bit mode and this function uses a static chain,
11567 we saved %r10 in %rax before calling __morestack. */
11568 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11569 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11570 gen_rtx_REG (word_mode, AX_REG));
11571
11572 /* If this function calls va_start, we need to store a pointer to
11573 the arguments on the old stack, because they may not have been
11574 all copied to the new stack. At this point the old stack can be
11575 found at the frame pointer value used by __morestack, because
11576 __morestack has set that up before calling back to us. Here we
11577 store that pointer in a scratch register, and in
11578 ix86_expand_prologue we store the scratch register in a stack
11579 slot. */
11580 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11581 {
11582 unsigned int scratch_regno;
11583 rtx frame_reg;
11584 int words;
11585
11586 scratch_regno = split_stack_prologue_scratch_regno ();
11587 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11588 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11589
11590 /* 64-bit:
11591 fp -> old fp value
11592 return address within this function
11593 return address of caller of this function
11594 stack arguments
11595 So we add three words to get to the stack arguments.
11596
11597 32-bit:
11598 fp -> old fp value
11599 return address within this function
11600 first argument to __morestack
11601 second argument to __morestack
11602 return address of caller of this function
11603 stack arguments
11604 So we add five words to get to the stack arguments.
11605 */
11606 words = TARGET_64BIT ? 3 : 5;
11607 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11608 gen_rtx_PLUS (Pmode, frame_reg,
11609 GEN_INT (words * UNITS_PER_WORD))));
11610
11611 varargs_label = gen_label_rtx ();
11612 emit_jump_insn (gen_jump (varargs_label));
11613 JUMP_LABEL (get_last_insn ()) = varargs_label;
11614
11615 emit_barrier ();
11616 }
11617
11618 emit_label (label);
11619 LABEL_NUSES (label) = 1;
11620
11621 /* If this function calls va_start, we now have to set the scratch
11622 register for the case where we do not call __morestack. In this
11623 case we need to set it based on the stack pointer. */
11624 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11625 {
11626 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11627 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11628 GEN_INT (UNITS_PER_WORD))));
11629
11630 emit_label (varargs_label);
11631 LABEL_NUSES (varargs_label) = 1;
11632 }
11633 }
11634
11635 /* We may have to tell the dataflow pass that the split stack prologue
11636 is initializing a scratch register. */
11637
11638 static void
11639 ix86_live_on_entry (bitmap regs)
11640 {
11641 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11642 {
11643 gcc_assert (flag_split_stack);
11644 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11645 }
11646 }
11647 \f
11648 /* Determine if OP is a suitable SUBREG RTX for an address. */
11649
11650 static bool
11651 ix86_address_subreg_operand (rtx op)
11652 {
11653 enum machine_mode mode;
11654
11655 if (!REG_P (op))
11656 return false;
11657
11658 mode = GET_MODE (op);
11659
11660 if (GET_MODE_CLASS (mode) != MODE_INT)
11661 return false;
11662
11663 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11664 failures when the register is one word out of a two word structure. */
11665 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11666 return false;
11667
11668 /* simplify_subreg does not handle stack pointer. */
11669 if (REGNO (op) == STACK_POINTER_REGNUM)
11670 return false;
11671
11672 /* Allow only SUBREGs of non-eliminable hard registers. */
11673 return register_no_elim_operand (op, mode);
11674 }
11675
11676 /* Extract the parts of an RTL expression that is a valid memory address
11677 for an instruction. Return 0 if the structure of the address is
11678 grossly off. Return -1 if the address contains ASHIFT, so it is not
11679 strictly valid, but still used for computing the length of the lea instruction. */
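/* Some illustrative decompositions (results shown in AT&T syntax):

       (plus:SI (reg:SI bx) (const_int 12))
         -> base = %ebx, index = NULL, scale = 1, disp = 12   i.e. 12(%ebx)

       (plus:SI (plus:SI (mult:SI (reg:SI cx) (const_int 4))
                         (reg:SI bx))
                (const_int 8))
         -> base = %ebx, index = %ecx, scale = 4, disp = 8    i.e. 8(%ebx,%ecx,4)  */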
11680
11681 int
11682 ix86_decompose_address (rtx addr, struct ix86_address *out)
11683 {
11684 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11685 rtx base_reg, index_reg;
11686 HOST_WIDE_INT scale = 1;
11687 rtx scale_rtx = NULL_RTX;
11688 rtx tmp;
11689 int retval = 1;
11690 enum ix86_address_seg seg = SEG_DEFAULT;
11691
11692 /* Allow zero-extended SImode addresses,
11693 they will be emitted with addr32 prefix. */
11694 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11695 {
11696 if (GET_CODE (addr) == ZERO_EXTEND
11697 && GET_MODE (XEXP (addr, 0)) == SImode)
11698 {
11699 addr = XEXP (addr, 0);
11700 if (CONST_INT_P (addr))
11701 return 0;
11702 }
11703 else if (GET_CODE (addr) == AND
11704 && const_32bit_mask (XEXP (addr, 1), DImode))
11705 {
11706 addr = XEXP (addr, 0);
11707
11708 /* Adjust SUBREGs. */
11709 if (GET_CODE (addr) == SUBREG
11710 && GET_MODE (SUBREG_REG (addr)) == SImode)
11711 {
11712 addr = SUBREG_REG (addr);
11713 if (CONST_INT_P (addr))
11714 return 0;
11715 }
11716 else if (GET_MODE (addr) == DImode)
11717 addr = gen_rtx_SUBREG (SImode, addr, 0);
11718 else if (GET_MODE (addr) != VOIDmode)
11719 return 0;
11720 }
11721 }
11722
11723 /* Allow SImode subregs of DImode addresses,
11724 they will be emitted with addr32 prefix. */
11725 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11726 {
11727 if (GET_CODE (addr) == SUBREG
11728 && GET_MODE (SUBREG_REG (addr)) == DImode)
11729 {
11730 addr = SUBREG_REG (addr);
11731 if (CONST_INT_P (addr))
11732 return 0;
11733 }
11734 }
11735
11736 if (REG_P (addr))
11737 base = addr;
11738 else if (GET_CODE (addr) == SUBREG)
11739 {
11740 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11741 base = addr;
11742 else
11743 return 0;
11744 }
11745 else if (GET_CODE (addr) == PLUS)
11746 {
11747 rtx addends[4], op;
11748 int n = 0, i;
11749
11750 op = addr;
11751 do
11752 {
11753 if (n >= 4)
11754 return 0;
11755 addends[n++] = XEXP (op, 1);
11756 op = XEXP (op, 0);
11757 }
11758 while (GET_CODE (op) == PLUS);
11759 if (n >= 4)
11760 return 0;
11761 addends[n] = op;
11762
11763 for (i = n; i >= 0; --i)
11764 {
11765 op = addends[i];
11766 switch (GET_CODE (op))
11767 {
11768 case MULT:
11769 if (index)
11770 return 0;
11771 index = XEXP (op, 0);
11772 scale_rtx = XEXP (op, 1);
11773 break;
11774
11775 case ASHIFT:
11776 if (index)
11777 return 0;
11778 index = XEXP (op, 0);
11779 tmp = XEXP (op, 1);
11780 if (!CONST_INT_P (tmp))
11781 return 0;
11782 scale = INTVAL (tmp);
11783 if ((unsigned HOST_WIDE_INT) scale > 3)
11784 return 0;
11785 scale = 1 << scale;
11786 break;
11787
11788 case ZERO_EXTEND:
11789 op = XEXP (op, 0);
11790 if (GET_CODE (op) != UNSPEC)
11791 return 0;
11792 /* FALLTHRU */
11793
11794 case UNSPEC:
11795 if (XINT (op, 1) == UNSPEC_TP
11796 && TARGET_TLS_DIRECT_SEG_REFS
11797 && seg == SEG_DEFAULT)
11798 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11799 else
11800 return 0;
11801 break;
11802
11803 case SUBREG:
11804 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11805 return 0;
11806 /* FALLTHRU */
11807
11808 case REG:
11809 if (!base)
11810 base = op;
11811 else if (!index)
11812 index = op;
11813 else
11814 return 0;
11815 break;
11816
11817 case CONST:
11818 case CONST_INT:
11819 case SYMBOL_REF:
11820 case LABEL_REF:
11821 if (disp)
11822 return 0;
11823 disp = op;
11824 break;
11825
11826 default:
11827 return 0;
11828 }
11829 }
11830 }
11831 else if (GET_CODE (addr) == MULT)
11832 {
11833 index = XEXP (addr, 0); /* index*scale */
11834 scale_rtx = XEXP (addr, 1);
11835 }
11836 else if (GET_CODE (addr) == ASHIFT)
11837 {
11838 /* We're called for lea too, which implements ashift on occasion. */
11839 index = XEXP (addr, 0);
11840 tmp = XEXP (addr, 1);
11841 if (!CONST_INT_P (tmp))
11842 return 0;
11843 scale = INTVAL (tmp);
11844 if ((unsigned HOST_WIDE_INT) scale > 3)
11845 return 0;
11846 scale = 1 << scale;
11847 retval = -1;
11848 }
11849 else if (CONST_INT_P (addr))
11850 {
11851 if (!x86_64_immediate_operand (addr, VOIDmode))
11852 return 0;
11853
11854 /* Constant addresses are sign extended to 64 bits; we have to
11855 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
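/* For example, the constant address 0x80000000 would be emitted as a
   32-bit displacement and sign-extended by the hardware to
   0xffffffff80000000, which is outside the 32-bit x32 address space.  */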
11856 if (TARGET_X32
11857 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11858 return 0;
11859
11860 disp = addr;
11861 }
11862 else
11863 disp = addr; /* displacement */
11864
11865 if (index)
11866 {
11867 if (REG_P (index))
11868 ;
11869 else if (GET_CODE (index) == SUBREG
11870 && ix86_address_subreg_operand (SUBREG_REG (index)))
11871 ;
11872 else
11873 return 0;
11874 }
11875
11876 /* Address override works only on the (%reg) part of %fs:(%reg). */
11877 if (seg != SEG_DEFAULT
11878 && ((base && GET_MODE (base) != word_mode)
11879 || (index && GET_MODE (index) != word_mode)))
11880 return 0;
11881
11882 /* Extract the integral value of scale. */
11883 if (scale_rtx)
11884 {
11885 if (!CONST_INT_P (scale_rtx))
11886 return 0;
11887 scale = INTVAL (scale_rtx);
11888 }
11889
11890 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11891 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11892
11893 /* Avoid useless 0 displacement. */
11894 if (disp == const0_rtx && (base || index))
11895 disp = NULL_RTX;
11896
11897 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11898 if (base_reg && index_reg && scale == 1
11899 && (index_reg == arg_pointer_rtx
11900 || index_reg == frame_pointer_rtx
11901 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11902 {
11903 rtx tmp;
11904 tmp = base, base = index, index = tmp;
11905 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11906 }
11907
11908 /* Special case: %ebp cannot be encoded as a base without a displacement.
11909 Similarly %r13. */
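/* In the ModRM/SIB encoding, mod 00 with base register 101 (%ebp, and
   %r13 with REX.B) means "disp32" (or RIP-relative) rather than "[base]",
   so these registers can only be used as a base together with an
   explicit displacement, hence the zero displacement added below.  */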
11910 if (!disp
11911 && base_reg
11912 && (base_reg == hard_frame_pointer_rtx
11913 || base_reg == frame_pointer_rtx
11914 || base_reg == arg_pointer_rtx
11915 || (REG_P (base_reg)
11916 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11917 || REGNO (base_reg) == R13_REG))))
11918 disp = const0_rtx;
11919
11920 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11921 Avoid this by transforming to [%esi+0].
11922 Reload calls address legitimization without cfun defined, so we need
11923 to test cfun for being non-NULL. */
11924 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11925 && base_reg && !index_reg && !disp
11926 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11927 disp = const0_rtx;
11928
11929 /* Special case: encode reg+reg instead of reg*2. */
11930 if (!base && index && scale == 2)
11931 base = index, base_reg = index_reg, scale = 1;
11932
11933 /* Special case: scaling cannot be encoded without base or displacement. */
11934 if (!base && !disp && index && scale != 1)
11935 disp = const0_rtx;
11936
11937 out->base = base;
11938 out->index = index;
11939 out->disp = disp;
11940 out->scale = scale;
11941 out->seg = seg;
11942
11943 return retval;
11944 }
11945 \f
11946 /* Return cost of the memory address x.
11947 For i386, it is better to use a complex address than let gcc copy
11948 the address into a reg and make a new pseudo. But not if the address
11949 requires two regs - that would mean more pseudos with longer
11950 lifetimes. */
11951 static int
11952 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11953 {
11954 struct ix86_address parts;
11955 int cost = 1;
11956 int ok = ix86_decompose_address (x, &parts);
11957
11958 gcc_assert (ok);
11959
11960 if (parts.base && GET_CODE (parts.base) == SUBREG)
11961 parts.base = SUBREG_REG (parts.base);
11962 if (parts.index && GET_CODE (parts.index) == SUBREG)
11963 parts.index = SUBREG_REG (parts.index);
11964
11965 /* Attempt to minimize number of registers in the address. */
11966 if ((parts.base
11967 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11968 || (parts.index
11969 && (!REG_P (parts.index)
11970 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11971 cost++;
11972
11973 if (parts.base
11974 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11975 && parts.index
11976 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11977 && parts.base != parts.index)
11978 cost++;
11979
11980 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11981 since its predecode logic can't detect the length of instructions
11982 and it degenerates to vector decoding. Increase the cost of such
11983 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11984 to split such addresses or even refuse such addresses at all.
11985
11986 The following addressing modes are affected:
11987 [base+scale*index]
11988 [scale*index+disp]
11989 [base+index]
11990
11991 The first and last case may be avoidable by explicitly coding the zero in
11992 the memory address, but I don't have an AMD-K6 machine handy to check this
11993 theory. */
11994
11995 if (TARGET_K6
11996 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11997 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11998 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11999 cost += 10;
12000
12001 return cost;
12002 }
12003 \f
12004 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12005 this is used to form addresses to local data when -fPIC is in
12006 use. */
12007
12008 static bool
12009 darwin_local_data_pic (rtx disp)
12010 {
12011 return (GET_CODE (disp) == UNSPEC
12012 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12013 }
12014
12015 /* Determine if a given RTX is a valid constant. We already know this
12016 satisfies CONSTANT_P. */
12017
12018 static bool
12019 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12020 {
12021 switch (GET_CODE (x))
12022 {
12023 case CONST:
12024 x = XEXP (x, 0);
12025
12026 if (GET_CODE (x) == PLUS)
12027 {
12028 if (!CONST_INT_P (XEXP (x, 1)))
12029 return false;
12030 x = XEXP (x, 0);
12031 }
12032
12033 if (TARGET_MACHO && darwin_local_data_pic (x))
12034 return true;
12035
12036 /* Only some unspecs are valid as "constants". */
12037 if (GET_CODE (x) == UNSPEC)
12038 switch (XINT (x, 1))
12039 {
12040 case UNSPEC_GOT:
12041 case UNSPEC_GOTOFF:
12042 case UNSPEC_PLTOFF:
12043 return TARGET_64BIT;
12044 case UNSPEC_TPOFF:
12045 case UNSPEC_NTPOFF:
12046 x = XVECEXP (x, 0, 0);
12047 return (GET_CODE (x) == SYMBOL_REF
12048 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12049 case UNSPEC_DTPOFF:
12050 x = XVECEXP (x, 0, 0);
12051 return (GET_CODE (x) == SYMBOL_REF
12052 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12053 default:
12054 return false;
12055 }
12056
12057 /* We must have drilled down to a symbol. */
12058 if (GET_CODE (x) == LABEL_REF)
12059 return true;
12060 if (GET_CODE (x) != SYMBOL_REF)
12061 return false;
12062 /* FALLTHRU */
12063
12064 case SYMBOL_REF:
12065 /* TLS symbols are never valid. */
12066 if (SYMBOL_REF_TLS_MODEL (x))
12067 return false;
12068
12069 /* DLLIMPORT symbols are never valid. */
12070 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12071 && SYMBOL_REF_DLLIMPORT_P (x))
12072 return false;
12073
12074 #if TARGET_MACHO
12075 /* mdynamic-no-pic */
12076 if (MACHO_DYNAMIC_NO_PIC_P)
12077 return machopic_symbol_defined_p (x);
12078 #endif
12079 break;
12080
12081 case CONST_DOUBLE:
12082 if (GET_MODE (x) == TImode
12083 && x != CONST0_RTX (TImode)
12084 && !TARGET_64BIT)
12085 return false;
12086 break;
12087
12088 case CONST_VECTOR:
12089 if (!standard_sse_constant_p (x))
12090 return false;
12091
12092 default:
12093 break;
12094 }
12095
12096 /* Otherwise we handle everything else in the move patterns. */
12097 return true;
12098 }
12099
12100 /* Determine if it's legal to put X into the constant pool. This
12101 is not possible for the address of thread-local symbols, which
12102 is checked above. */
12103
12104 static bool
12105 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12106 {
12107 /* We can always put integral constants and vectors in memory. */
12108 switch (GET_CODE (x))
12109 {
12110 case CONST_INT:
12111 case CONST_DOUBLE:
12112 case CONST_VECTOR:
12113 return false;
12114
12115 default:
12116 break;
12117 }
12118 return !ix86_legitimate_constant_p (mode, x);
12119 }
12120
12121
12122 /* Nonzero if the constant value X is a legitimate general operand
12123 when generating PIC code. It is given that flag_pic is on and
12124 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12125
12126 bool
12127 legitimate_pic_operand_p (rtx x)
12128 {
12129 rtx inner;
12130
12131 switch (GET_CODE (x))
12132 {
12133 case CONST:
12134 inner = XEXP (x, 0);
12135 if (GET_CODE (inner) == PLUS
12136 && CONST_INT_P (XEXP (inner, 1)))
12137 inner = XEXP (inner, 0);
12138
12139 /* Only some unspecs are valid as "constants". */
12140 if (GET_CODE (inner) == UNSPEC)
12141 switch (XINT (inner, 1))
12142 {
12143 case UNSPEC_GOT:
12144 case UNSPEC_GOTOFF:
12145 case UNSPEC_PLTOFF:
12146 return TARGET_64BIT;
12147 case UNSPEC_TPOFF:
12148 x = XVECEXP (inner, 0, 0);
12149 return (GET_CODE (x) == SYMBOL_REF
12150 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12151 case UNSPEC_MACHOPIC_OFFSET:
12152 return legitimate_pic_address_disp_p (x);
12153 default:
12154 return false;
12155 }
12156 /* FALLTHRU */
12157
12158 case SYMBOL_REF:
12159 case LABEL_REF:
12160 return legitimate_pic_address_disp_p (x);
12161
12162 default:
12163 return true;
12164 }
12165 }
12166
12167 /* Determine if a given CONST RTX is a valid memory displacement
12168 in PIC mode. */
12169
12170 bool
12171 legitimate_pic_address_disp_p (rtx disp)
12172 {
12173 bool saw_plus;
12174
12175 /* In 64bit mode we can allow direct addresses of symbols and labels
12176 when they are not dynamic symbols. */
12177 if (TARGET_64BIT)
12178 {
12179 rtx op0 = disp, op1;
12180
12181 switch (GET_CODE (disp))
12182 {
12183 case LABEL_REF:
12184 return true;
12185
12186 case CONST:
12187 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12188 break;
12189 op0 = XEXP (XEXP (disp, 0), 0);
12190 op1 = XEXP (XEXP (disp, 0), 1);
12191 if (!CONST_INT_P (op1)
12192 || INTVAL (op1) >= 16*1024*1024
12193 || INTVAL (op1) < -16*1024*1024)
12194 break;
12195 if (GET_CODE (op0) == LABEL_REF)
12196 return true;
12197 if (GET_CODE (op0) == CONST
12198 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12199 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12200 return true;
12201 if (GET_CODE (op0) == UNSPEC
12202 && XINT (op0, 1) == UNSPEC_PCREL)
12203 return true;
12204 if (GET_CODE (op0) != SYMBOL_REF)
12205 break;
12206 /* FALLTHRU */
12207
12208 case SYMBOL_REF:
12209 /* TLS references should always be enclosed in UNSPEC. */
12210 if (SYMBOL_REF_TLS_MODEL (op0))
12211 return false;
12212 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12213 && ix86_cmodel != CM_LARGE_PIC)
12214 return true;
12215 break;
12216
12217 default:
12218 break;
12219 }
12220 }
12221 if (GET_CODE (disp) != CONST)
12222 return false;
12223 disp = XEXP (disp, 0);
12224
12225 if (TARGET_64BIT)
12226 {
 12227 /* It is unsafe to allow PLUS expressions here; refusing them limits the
 12228 allowed distance of GOT references, and we should not need them anyway. */
12229 if (GET_CODE (disp) != UNSPEC
12230 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12231 && XINT (disp, 1) != UNSPEC_GOTOFF
12232 && XINT (disp, 1) != UNSPEC_PCREL
12233 && XINT (disp, 1) != UNSPEC_PLTOFF))
12234 return false;
12235
12236 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12237 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12238 return false;
12239 return true;
12240 }
12241
12242 saw_plus = false;
12243 if (GET_CODE (disp) == PLUS)
12244 {
12245 if (!CONST_INT_P (XEXP (disp, 1)))
12246 return false;
12247 disp = XEXP (disp, 0);
12248 saw_plus = true;
12249 }
12250
12251 if (TARGET_MACHO && darwin_local_data_pic (disp))
12252 return true;
12253
12254 if (GET_CODE (disp) != UNSPEC)
12255 return false;
12256
12257 switch (XINT (disp, 1))
12258 {
12259 case UNSPEC_GOT:
12260 if (saw_plus)
12261 return false;
12262 /* We need to check for both symbols and labels because VxWorks loads
12263 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12264 details. */
12265 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12266 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12267 case UNSPEC_GOTOFF:
 12268 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
 12269 While the ABI also specifies a 32bit relocation, we don't produce it in
 12270 the small PIC model at all. */
12271 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12272 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12273 && !TARGET_64BIT)
12274 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12275 return false;
12276 case UNSPEC_GOTTPOFF:
12277 case UNSPEC_GOTNTPOFF:
12278 case UNSPEC_INDNTPOFF:
12279 if (saw_plus)
12280 return false;
12281 disp = XVECEXP (disp, 0, 0);
12282 return (GET_CODE (disp) == SYMBOL_REF
12283 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12284 case UNSPEC_NTPOFF:
12285 disp = XVECEXP (disp, 0, 0);
12286 return (GET_CODE (disp) == SYMBOL_REF
12287 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12288 case UNSPEC_DTPOFF:
12289 disp = XVECEXP (disp, 0, 0);
12290 return (GET_CODE (disp) == SYMBOL_REF
12291 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12292 }
12293
12294 return false;
12295 }
12296
12297 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12298 replace the input X, or the original X if no replacement is called for.
12299 The output parameter *WIN is 1 if the calling macro should goto WIN,
12300 0 if it should not. */
12301
12302 bool
12303 ix86_legitimize_reload_address (rtx x,
12304 enum machine_mode mode ATTRIBUTE_UNUSED,
12305 int opnum, int type,
12306 int ind_levels ATTRIBUTE_UNUSED)
12307 {
12308 /* Reload can generate:
12309
12310 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12311 (reg:DI 97))
12312 (reg:DI 2 cx))
12313
 12314 This RTX is rejected by ix86_legitimate_address_p due to
 12315 non-strictness of base register 97. Following this rejection,
 12316 reload pushes all three components into separate registers,
 12317 creating an invalid memory address RTX.
 12318
 12319 The following code reloads only the invalid part of the
 12320 memory address RTX. */
12321
12322 if (GET_CODE (x) == PLUS
12323 && REG_P (XEXP (x, 1))
12324 && GET_CODE (XEXP (x, 0)) == PLUS
12325 && REG_P (XEXP (XEXP (x, 0), 1)))
12326 {
12327 rtx base, index;
12328 bool something_reloaded = false;
12329
12330 base = XEXP (XEXP (x, 0), 1);
12331 if (!REG_OK_FOR_BASE_STRICT_P (base))
12332 {
12333 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12334 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12335 opnum, (enum reload_type) type);
12336 something_reloaded = true;
12337 }
12338
12339 index = XEXP (x, 1);
12340 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12341 {
12342 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12343 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12344 opnum, (enum reload_type) type);
12345 something_reloaded = true;
12346 }
12347
12348 gcc_assert (something_reloaded);
12349 return true;
12350 }
12351
12352 return false;
12353 }
12354
12355 /* Recognizes RTL expressions that are valid memory addresses for an
12356 instruction. The MODE argument is the machine mode for the MEM
12357 expression that wants to use this address.
12358
 12359 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12360 convert common non-canonical forms to canonical form so that they will
12361 be recognized. */
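/* As a purely illustrative sketch (operands made up), a canonical address
   decomposes into base + index*scale + disp, e.g. the RTX
   (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8)),
   matching the canonical nesting described in ix86_legitimize_address.  */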
12362
12363 static bool
12364 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12365 rtx addr, bool strict)
12366 {
12367 struct ix86_address parts;
12368 rtx base, index, disp;
12369 HOST_WIDE_INT scale;
12370
12371 if (ix86_decompose_address (addr, &parts) <= 0)
12372 /* Decomposition failed. */
12373 return false;
12374
12375 base = parts.base;
12376 index = parts.index;
12377 disp = parts.disp;
12378 scale = parts.scale;
12379
12380 /* Validate base register. */
12381 if (base)
12382 {
12383 rtx reg;
12384
12385 if (REG_P (base))
12386 reg = base;
12387 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12388 reg = SUBREG_REG (base);
12389 else
12390 /* Base is not a register. */
12391 return false;
12392
12393 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12394 return false;
12395
12396 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12397 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12398 /* Base is not valid. */
12399 return false;
12400 }
12401
12402 /* Validate index register. */
12403 if (index)
12404 {
12405 rtx reg;
12406
12407 if (REG_P (index))
12408 reg = index;
12409 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12410 reg = SUBREG_REG (index);
12411 else
12412 /* Index is not a register. */
12413 return false;
12414
12415 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12416 return false;
12417
12418 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12419 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12420 /* Index is not valid. */
12421 return false;
12422 }
12423
12424 /* Index and base should have the same mode. */
12425 if (base && index
12426 && GET_MODE (base) != GET_MODE (index))
12427 return false;
12428
12429 /* Validate scale factor. */
12430 if (scale != 1)
12431 {
12432 if (!index)
12433 /* Scale without index. */
12434 return false;
12435
12436 if (scale != 2 && scale != 4 && scale != 8)
12437 /* Scale is not a valid multiplier. */
12438 return false;
12439 }
12440
12441 /* Validate displacement. */
12442 if (disp)
12443 {
12444 if (GET_CODE (disp) == CONST
12445 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12446 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12447 switch (XINT (XEXP (disp, 0), 1))
12448 {
 12449 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
 12450 used. While the ABI also specifies 32bit relocations, we don't produce
 12451 them at all and use IP-relative addressing instead. */
12452 case UNSPEC_GOT:
12453 case UNSPEC_GOTOFF:
12454 gcc_assert (flag_pic);
12455 if (!TARGET_64BIT)
12456 goto is_legitimate_pic;
12457
12458 /* 64bit address unspec. */
12459 return false;
12460
12461 case UNSPEC_GOTPCREL:
12462 case UNSPEC_PCREL:
12463 gcc_assert (flag_pic);
12464 goto is_legitimate_pic;
12465
12466 case UNSPEC_GOTTPOFF:
12467 case UNSPEC_GOTNTPOFF:
12468 case UNSPEC_INDNTPOFF:
12469 case UNSPEC_NTPOFF:
12470 case UNSPEC_DTPOFF:
12471 break;
12472
12473 case UNSPEC_STACK_CHECK:
12474 gcc_assert (flag_split_stack);
12475 break;
12476
12477 default:
12478 /* Invalid address unspec. */
12479 return false;
12480 }
12481
12482 else if (SYMBOLIC_CONST (disp)
12483 && (flag_pic
12484 || (TARGET_MACHO
12485 #if TARGET_MACHO
12486 && MACHOPIC_INDIRECT
12487 && !machopic_operand_p (disp)
12488 #endif
12489 )))
12490 {
12491
12492 is_legitimate_pic:
12493 if (TARGET_64BIT && (index || base))
12494 {
12495 /* foo@dtpoff(%rX) is ok. */
12496 if (GET_CODE (disp) != CONST
12497 || GET_CODE (XEXP (disp, 0)) != PLUS
12498 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12499 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12500 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12501 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12502 /* Non-constant pic memory reference. */
12503 return false;
12504 }
12505 else if ((!TARGET_MACHO || flag_pic)
12506 && ! legitimate_pic_address_disp_p (disp))
12507 /* Displacement is an invalid pic construct. */
12508 return false;
12509 #if TARGET_MACHO
12510 else if (MACHO_DYNAMIC_NO_PIC_P
12511 && !ix86_legitimate_constant_p (Pmode, disp))
 12512 /* Displacement must be referenced via non_lazy_pointer. */
12513 return false;
12514 #endif
12515
12516 /* This code used to verify that a symbolic pic displacement
12517 includes the pic_offset_table_rtx register.
12518
 12519 While this is a good idea, unfortunately these constructs may
 12520 be created by the "adds using lea" optimization for incorrect
 12521 code like:
12522
12523 int a;
12524 int foo(int i)
12525 {
12526 return *(&a+i);
12527 }
12528
 12529 This code is nonsensical, but results in addressing the
 12530 GOT table with a pic_offset_table_rtx base. We can't
 12531 just refuse it easily, since it gets matched by the
 12532 "addsi3" pattern, which later gets split to lea when the
 12533 output register differs from the input. While this
 12534 could be handled by a separate addsi pattern for this case
 12535 that never results in lea, disabling this test seems to be
 12536 the easier and correct fix for the crash. */
12537 }
12538 else if (GET_CODE (disp) != LABEL_REF
12539 && !CONST_INT_P (disp)
12540 && (GET_CODE (disp) != CONST
12541 || !ix86_legitimate_constant_p (Pmode, disp))
12542 && (GET_CODE (disp) != SYMBOL_REF
12543 || !ix86_legitimate_constant_p (Pmode, disp)))
12544 /* Displacement is not constant. */
12545 return false;
12546 else if (TARGET_64BIT
12547 && !x86_64_immediate_operand (disp, VOIDmode))
12548 /* Displacement is out of range. */
12549 return false;
12550 }
12551
12552 /* Everything looks valid. */
12553 return true;
12554 }
12555
12556 /* Determine if a given RTX is a valid constant address. */
12557
12558 bool
12559 constant_address_p (rtx x)
12560 {
12561 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12562 }
12563 \f
12564 /* Return a unique alias set for the GOT. */
12565
12566 static alias_set_type
12567 ix86_GOT_alias_set (void)
12568 {
12569 static alias_set_type set = -1;
12570 if (set == -1)
12571 set = new_alias_set ();
12572 return set;
12573 }
12574
12575 /* Return a legitimate reference for ORIG (an address) using the
12576 register REG. If REG is 0, a new pseudo is generated.
12577
12578 There are two types of references that must be handled:
12579
12580 1. Global data references must load the address from the GOT, via
12581 the PIC reg. An insn is emitted to do this load, and the reg is
12582 returned.
12583
12584 2. Static data references, constant pool addresses, and code labels
12585 compute the address as an offset from the GOT, whose base is in
12586 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12587 differentiate them from global data objects. The returned
12588 address is the PIC reg + an unspec constant.
12589
12590 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12591 reg also appears in the address. */
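/* Purely illustrative ia32 sequences for the two cases above, assuming
   %ebx holds the PIC register and AT&T syntax (symbol and register names
   are hypothetical):
     1. global data:  movl  sym@GOT(%ebx), %reg      load address from GOT
     2. local data:   leal  sym@GOTOFF(%ebx), %reg   PIC reg + offset  */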
12592
12593 static rtx
12594 legitimize_pic_address (rtx orig, rtx reg)
12595 {
12596 rtx addr = orig;
12597 rtx new_rtx = orig;
12598 rtx base;
12599
12600 #if TARGET_MACHO
12601 if (TARGET_MACHO && !TARGET_64BIT)
12602 {
12603 if (reg == 0)
12604 reg = gen_reg_rtx (Pmode);
12605 /* Use the generic Mach-O PIC machinery. */
12606 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12607 }
12608 #endif
12609
12610 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12611 new_rtx = addr;
12612 else if (TARGET_64BIT
12613 && ix86_cmodel != CM_SMALL_PIC
12614 && gotoff_operand (addr, Pmode))
12615 {
12616 rtx tmpreg;
12617 /* This symbol may be referenced via a displacement from the PIC
12618 base address (@GOTOFF). */
12619
12620 if (reload_in_progress)
12621 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12622 if (GET_CODE (addr) == CONST)
12623 addr = XEXP (addr, 0);
12624 if (GET_CODE (addr) == PLUS)
12625 {
12626 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12627 UNSPEC_GOTOFF);
12628 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12629 }
12630 else
12631 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12632 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12633 if (!reg)
12634 tmpreg = gen_reg_rtx (Pmode);
12635 else
12636 tmpreg = reg;
12637 emit_move_insn (tmpreg, new_rtx);
12638
12639 if (reg != 0)
12640 {
12641 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12642 tmpreg, 1, OPTAB_DIRECT);
12643 new_rtx = reg;
12644 }
12645 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12646 }
12647 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12648 {
12649 /* This symbol may be referenced via a displacement from the PIC
12650 base address (@GOTOFF). */
12651
12652 if (reload_in_progress)
12653 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12654 if (GET_CODE (addr) == CONST)
12655 addr = XEXP (addr, 0);
12656 if (GET_CODE (addr) == PLUS)
12657 {
12658 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12659 UNSPEC_GOTOFF);
12660 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12661 }
12662 else
12663 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12664 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12665 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12666
12667 if (reg != 0)
12668 {
12669 emit_move_insn (reg, new_rtx);
12670 new_rtx = reg;
12671 }
12672 }
12673 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12674 /* We can't use @GOTOFF for text labels on VxWorks;
12675 see gotoff_operand. */
12676 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12677 {
12678 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12679 {
12680 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12681 return legitimize_dllimport_symbol (addr, true);
12682 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12683 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12684 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12685 {
12686 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12687 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12688 }
12689 }
12690
 12691 /* For x64 PE-COFF there is no GOT table, so we use the address
 12692 directly. */
12693 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12694 {
12695 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12696 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12697
12698 if (reg == 0)
12699 reg = gen_reg_rtx (Pmode);
12700 emit_move_insn (reg, new_rtx);
12701 new_rtx = reg;
12702 }
12703 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12704 {
12705 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12706 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12707 new_rtx = gen_const_mem (Pmode, new_rtx);
12708 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12709
12710 if (reg == 0)
12711 reg = gen_reg_rtx (Pmode);
 12712 /* Use gen_movsi directly, otherwise the address is loaded
 12713 into a register for CSE. We don't want to CSE these addresses;
 12714 instead we CSE addresses from the GOT table, so skip this. */
12715 emit_insn (gen_movsi (reg, new_rtx));
12716 new_rtx = reg;
12717 }
12718 else
12719 {
12720 /* This symbol must be referenced via a load from the
12721 Global Offset Table (@GOT). */
12722
12723 if (reload_in_progress)
12724 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12725 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12726 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12727 if (TARGET_64BIT)
12728 new_rtx = force_reg (Pmode, new_rtx);
12729 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12730 new_rtx = gen_const_mem (Pmode, new_rtx);
12731 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12732
12733 if (reg == 0)
12734 reg = gen_reg_rtx (Pmode);
12735 emit_move_insn (reg, new_rtx);
12736 new_rtx = reg;
12737 }
12738 }
12739 else
12740 {
12741 if (CONST_INT_P (addr)
12742 && !x86_64_immediate_operand (addr, VOIDmode))
12743 {
12744 if (reg)
12745 {
12746 emit_move_insn (reg, addr);
12747 new_rtx = reg;
12748 }
12749 else
12750 new_rtx = force_reg (Pmode, addr);
12751 }
12752 else if (GET_CODE (addr) == CONST)
12753 {
12754 addr = XEXP (addr, 0);
12755
 12756 /* We must match stuff we generated earlier. Assume the only
 12757 unspecs that can get here are ours; not that we could do
 12758 anything with them anyway.... */
12759 if (GET_CODE (addr) == UNSPEC
12760 || (GET_CODE (addr) == PLUS
12761 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12762 return orig;
12763 gcc_assert (GET_CODE (addr) == PLUS);
12764 }
12765 if (GET_CODE (addr) == PLUS)
12766 {
12767 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12768
12769 /* Check first to see if this is a constant offset from a @GOTOFF
12770 symbol reference. */
12771 if (gotoff_operand (op0, Pmode)
12772 && CONST_INT_P (op1))
12773 {
12774 if (!TARGET_64BIT)
12775 {
12776 if (reload_in_progress)
12777 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12778 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12779 UNSPEC_GOTOFF);
12780 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12781 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12782 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12783
12784 if (reg != 0)
12785 {
12786 emit_move_insn (reg, new_rtx);
12787 new_rtx = reg;
12788 }
12789 }
12790 else
12791 {
12792 if (INTVAL (op1) < -16*1024*1024
12793 || INTVAL (op1) >= 16*1024*1024)
12794 {
12795 if (!x86_64_immediate_operand (op1, Pmode))
12796 op1 = force_reg (Pmode, op1);
12797 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12798 }
12799 }
12800 }
12801 else
12802 {
12803 base = legitimize_pic_address (XEXP (addr, 0), reg);
12804 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12805 base == reg ? NULL_RTX : reg);
12806
12807 if (CONST_INT_P (new_rtx))
12808 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12809 else
12810 {
12811 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12812 {
12813 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12814 new_rtx = XEXP (new_rtx, 1);
12815 }
12816 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12817 }
12818 }
12819 }
12820 }
12821 return new_rtx;
12822 }
12823 \f
12824 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12825
12826 static rtx
12827 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12828 {
12829 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12830
12831 if (GET_MODE (tp) != tp_mode)
12832 {
12833 gcc_assert (GET_MODE (tp) == SImode);
12834 gcc_assert (tp_mode == DImode);
12835
12836 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12837 }
12838
12839 if (to_reg)
12840 tp = copy_to_mode_reg (tp_mode, tp);
12841
12842 return tp;
12843 }
12844
12845 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12846
12847 static GTY(()) rtx ix86_tls_symbol;
12848
12849 static rtx
12850 ix86_tls_get_addr (void)
12851 {
12852 if (!ix86_tls_symbol)
12853 {
12854 const char *sym
12855 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12856 ? "___tls_get_addr" : "__tls_get_addr");
12857
12858 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12859 }
12860
12861 return ix86_tls_symbol;
12862 }
12863
12864 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12865
12866 static GTY(()) rtx ix86_tls_module_base_symbol;
12867
12868 rtx
12869 ix86_tls_module_base (void)
12870 {
12871 if (!ix86_tls_module_base_symbol)
12872 {
12873 ix86_tls_module_base_symbol
12874 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12875
12876 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12877 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12878 }
12879
12880 return ix86_tls_module_base_symbol;
12881 }
12882
12883 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12884 false if we expect this to be used for a memory address and true if
12885 we expect to load the address into a register. */
12886
12887 static rtx
12888 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12889 {
12890 rtx dest, base, off;
12891 rtx pic = NULL_RTX, tp = NULL_RTX;
12892 enum machine_mode tp_mode = Pmode;
12893 int type;
12894
12895 switch (model)
12896 {
12897 case TLS_MODEL_GLOBAL_DYNAMIC:
12898 dest = gen_reg_rtx (Pmode);
12899
12900 if (!TARGET_64BIT)
12901 {
12902 if (flag_pic)
12903 pic = pic_offset_table_rtx;
12904 else
12905 {
12906 pic = gen_reg_rtx (Pmode);
12907 emit_insn (gen_set_got (pic));
12908 }
12909 }
12910
12911 if (TARGET_GNU2_TLS)
12912 {
12913 if (TARGET_64BIT)
12914 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12915 else
12916 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12917
12918 tp = get_thread_pointer (Pmode, true);
12919 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12920
12921 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12922 }
12923 else
12924 {
12925 rtx caddr = ix86_tls_get_addr ();
12926
12927 if (TARGET_64BIT)
12928 {
12929 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12930
12931 start_sequence ();
12932 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12933 caddr));
12934 insns = get_insns ();
12935 end_sequence ();
12936
12937 RTL_CONST_CALL_P (insns) = 1;
12938 emit_libcall_block (insns, dest, rax, x);
12939 }
12940 else
12941 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12942 }
12943 break;
12944
12945 case TLS_MODEL_LOCAL_DYNAMIC:
12946 base = gen_reg_rtx (Pmode);
12947
12948 if (!TARGET_64BIT)
12949 {
12950 if (flag_pic)
12951 pic = pic_offset_table_rtx;
12952 else
12953 {
12954 pic = gen_reg_rtx (Pmode);
12955 emit_insn (gen_set_got (pic));
12956 }
12957 }
12958
12959 if (TARGET_GNU2_TLS)
12960 {
12961 rtx tmp = ix86_tls_module_base ();
12962
12963 if (TARGET_64BIT)
12964 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12965 else
12966 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12967
12968 tp = get_thread_pointer (Pmode, true);
12969 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12970 gen_rtx_MINUS (Pmode, tmp, tp));
12971 }
12972 else
12973 {
12974 rtx caddr = ix86_tls_get_addr ();
12975
12976 if (TARGET_64BIT)
12977 {
12978 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12979
12980 start_sequence ();
12981 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12982 caddr));
12983 insns = get_insns ();
12984 end_sequence ();
12985
12986 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12987 share the LD_BASE result with other LD model accesses. */
12988 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12989 UNSPEC_TLS_LD_BASE);
12990
12991 RTL_CONST_CALL_P (insns) = 1;
12992 emit_libcall_block (insns, base, rax, eqv);
12993 }
12994 else
12995 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12996 }
12997
12998 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12999 off = gen_rtx_CONST (Pmode, off);
13000
13001 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13002
13003 if (TARGET_GNU2_TLS)
13004 {
13005 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13006
13007 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13008 }
13009 break;
13010
13011 case TLS_MODEL_INITIAL_EXEC:
13012 if (TARGET_64BIT)
13013 {
13014 if (TARGET_SUN_TLS && !TARGET_X32)
13015 {
13016 /* The Sun linker took the AMD64 TLS spec literally
 13017 and can only handle %rax as the destination of the
 13018 initial-exec code sequence. */
13019
13020 dest = gen_reg_rtx (DImode);
13021 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13022 return dest;
13023 }
13024
13025 /* Generate DImode references to avoid %fs:(%reg32)
 13026 problems and the linker IE->LE relaxation bug. */
13027 tp_mode = DImode;
13028 pic = NULL;
13029 type = UNSPEC_GOTNTPOFF;
13030 }
13031 else if (flag_pic)
13032 {
13033 if (reload_in_progress)
13034 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13035 pic = pic_offset_table_rtx;
13036 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13037 }
13038 else if (!TARGET_ANY_GNU_TLS)
13039 {
13040 pic = gen_reg_rtx (Pmode);
13041 emit_insn (gen_set_got (pic));
13042 type = UNSPEC_GOTTPOFF;
13043 }
13044 else
13045 {
13046 pic = NULL;
13047 type = UNSPEC_INDNTPOFF;
13048 }
13049
13050 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13051 off = gen_rtx_CONST (tp_mode, off);
13052 if (pic)
13053 off = gen_rtx_PLUS (tp_mode, pic, off);
13054 off = gen_const_mem (tp_mode, off);
13055 set_mem_alias_set (off, ix86_GOT_alias_set ());
13056
13057 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13058 {
13059 base = get_thread_pointer (tp_mode,
13060 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13061 off = force_reg (tp_mode, off);
13062 return gen_rtx_PLUS (tp_mode, base, off);
13063 }
13064 else
13065 {
13066 base = get_thread_pointer (Pmode, true);
13067 dest = gen_reg_rtx (Pmode);
13068 emit_insn (ix86_gen_sub3 (dest, base, off));
13069 }
13070 break;
13071
13072 case TLS_MODEL_LOCAL_EXEC:
13073 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13074 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13075 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13076 off = gen_rtx_CONST (Pmode, off);
13077
13078 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13079 {
13080 base = get_thread_pointer (Pmode,
13081 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13082 return gen_rtx_PLUS (Pmode, base, off);
13083 }
13084 else
13085 {
13086 base = get_thread_pointer (Pmode, true);
13087 dest = gen_reg_rtx (Pmode);
13088 emit_insn (ix86_gen_sub3 (dest, base, off));
13089 }
13090 break;
13091
13092 default:
13093 gcc_unreachable ();
13094 }
13095
13096 return dest;
13097 }
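/* For orientation only: with TARGET_TLS_DIRECT_SEG_REFS on GNU/Linux, the
   local-exec model built above typically ends up as something like
   movl %gs:sym@ntpoff, %eax (ia32) or movq %fs:sym@tpoff, %rax (x86-64);
   the operands are hypothetical and the exact form depends on the flags
   handled above.  */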
13098
13099 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13100 to symbol DECL. */
13101
13102 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13103 htab_t dllimport_map;
13104
13105 static tree
13106 get_dllimport_decl (tree decl)
13107 {
13108 struct tree_map *h, in;
13109 void **loc;
13110 const char *name;
13111 const char *prefix;
13112 size_t namelen, prefixlen;
13113 char *imp_name;
13114 tree to;
13115 rtx rtl;
13116
13117 if (!dllimport_map)
13118 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13119
13120 in.hash = htab_hash_pointer (decl);
13121 in.base.from = decl;
13122 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13123 h = (struct tree_map *) *loc;
13124 if (h)
13125 return h->to;
13126
13127 *loc = h = ggc_alloc_tree_map ();
13128 h->hash = in.hash;
13129 h->base.from = decl;
13130 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13131 VAR_DECL, NULL, ptr_type_node);
13132 DECL_ARTIFICIAL (to) = 1;
13133 DECL_IGNORED_P (to) = 1;
13134 DECL_EXTERNAL (to) = 1;
13135 TREE_READONLY (to) = 1;
13136
13137 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13138 name = targetm.strip_name_encoding (name);
13139 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13140 ? "*__imp_" : "*__imp__";
13141 namelen = strlen (name);
13142 prefixlen = strlen (prefix);
13143 imp_name = (char *) alloca (namelen + prefixlen + 1);
13144 memcpy (imp_name, prefix, prefixlen);
13145 memcpy (imp_name + prefixlen, name, namelen + 1);
13146
13147 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13148 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13149 SET_SYMBOL_REF_DECL (rtl, to);
13150 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13151
13152 rtl = gen_const_mem (Pmode, rtl);
13153 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13154
13155 SET_DECL_RTL (to, rtl);
13156 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13157
13158 return to;
13159 }
13160
13161 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13162 true if we require the result be a register. */
13163
13164 static rtx
13165 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13166 {
13167 tree imp_decl;
13168 rtx x;
13169
13170 gcc_assert (SYMBOL_REF_DECL (symbol));
13171 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13172
13173 x = DECL_RTL (imp_decl);
13174 if (want_reg)
13175 x = force_reg (Pmode, x);
13176 return x;
13177 }
13178
13179 /* Try machine-dependent ways of modifying an illegitimate address
13180 to be legitimate. If we find one, return the new, valid address.
13181 This macro is used in only one place: `memory_address' in explow.c.
13182
13183 OLDX is the address as it was before break_out_memory_refs was called.
13184 In some cases it is useful to look at this to decide what needs to be done.
13185
13186 It is always safe for this macro to do nothing. It exists to recognize
13187 opportunities to optimize the output.
13188
13189 For the 80386, we handle X+REG by loading X into a register R and
13190 using R+REG. R will go in a general reg and indexing will be used.
13191 However, if REG is a broken-out memory address or multiplication,
13192 nothing needs to be done because REG can certainly go in a general reg.
13193
13194 When -fpic is used, special handling is needed for symbolic references.
13195 See comments by legitimize_pic_address in i386.c for details. */
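/* A small, hypothetical example of the X+REG case mentioned above: an
   address such as (plus (symbol_ref "sym") (reg)) can be made legitimate
   by loading the symbol into a scratch register R and using (plus R (reg)),
   i.e. a plain base+index form.  */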
13196
13197 static rtx
13198 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13199 enum machine_mode mode)
13200 {
13201 int changed = 0;
13202 unsigned log;
13203
13204 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13205 if (log)
13206 return legitimize_tls_address (x, (enum tls_model) log, false);
13207 if (GET_CODE (x) == CONST
13208 && GET_CODE (XEXP (x, 0)) == PLUS
13209 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13210 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13211 {
13212 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13213 (enum tls_model) log, false);
13214 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13215 }
13216
13217 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13218 {
13219 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13220 return legitimize_dllimport_symbol (x, true);
13221 if (GET_CODE (x) == CONST
13222 && GET_CODE (XEXP (x, 0)) == PLUS
13223 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13224 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13225 {
13226 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13227 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13228 }
13229 }
13230
13231 if (flag_pic && SYMBOLIC_CONST (x))
13232 return legitimize_pic_address (x, 0);
13233
13234 #if TARGET_MACHO
13235 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13236 return machopic_indirect_data_reference (x, 0);
13237 #endif
13238
 13239 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13240 if (GET_CODE (x) == ASHIFT
13241 && CONST_INT_P (XEXP (x, 1))
13242 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13243 {
13244 changed = 1;
13245 log = INTVAL (XEXP (x, 1));
13246 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13247 GEN_INT (1 << log));
13248 }
13249
13250 if (GET_CODE (x) == PLUS)
13251 {
13252 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13253
13254 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13255 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13256 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13257 {
13258 changed = 1;
13259 log = INTVAL (XEXP (XEXP (x, 0), 1));
13260 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13261 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13262 GEN_INT (1 << log));
13263 }
13264
13265 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13266 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13267 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13268 {
13269 changed = 1;
13270 log = INTVAL (XEXP (XEXP (x, 1), 1));
13271 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13272 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13273 GEN_INT (1 << log));
13274 }
13275
13276 /* Put multiply first if it isn't already. */
13277 if (GET_CODE (XEXP (x, 1)) == MULT)
13278 {
13279 rtx tmp = XEXP (x, 0);
13280 XEXP (x, 0) = XEXP (x, 1);
13281 XEXP (x, 1) = tmp;
13282 changed = 1;
13283 }
13284
13285 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13286 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13287 created by virtual register instantiation, register elimination, and
13288 similar optimizations. */
13289 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13290 {
13291 changed = 1;
13292 x = gen_rtx_PLUS (Pmode,
13293 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13294 XEXP (XEXP (x, 1), 0)),
13295 XEXP (XEXP (x, 1), 1));
13296 }
13297
13298 /* Canonicalize
13299 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13300 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13301 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13302 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13303 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13304 && CONSTANT_P (XEXP (x, 1)))
13305 {
13306 rtx constant;
13307 rtx other = NULL_RTX;
13308
13309 if (CONST_INT_P (XEXP (x, 1)))
13310 {
13311 constant = XEXP (x, 1);
13312 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13313 }
13314 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13315 {
13316 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13317 other = XEXP (x, 1);
13318 }
13319 else
13320 constant = 0;
13321
13322 if (constant)
13323 {
13324 changed = 1;
13325 x = gen_rtx_PLUS (Pmode,
13326 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13327 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13328 plus_constant (Pmode, other,
13329 INTVAL (constant)));
13330 }
13331 }
13332
13333 if (changed && ix86_legitimate_address_p (mode, x, false))
13334 return x;
13335
13336 if (GET_CODE (XEXP (x, 0)) == MULT)
13337 {
13338 changed = 1;
13339 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13340 }
13341
13342 if (GET_CODE (XEXP (x, 1)) == MULT)
13343 {
13344 changed = 1;
13345 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13346 }
13347
13348 if (changed
13349 && REG_P (XEXP (x, 1))
13350 && REG_P (XEXP (x, 0)))
13351 return x;
13352
13353 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13354 {
13355 changed = 1;
13356 x = legitimize_pic_address (x, 0);
13357 }
13358
13359 if (changed && ix86_legitimate_address_p (mode, x, false))
13360 return x;
13361
13362 if (REG_P (XEXP (x, 0)))
13363 {
13364 rtx temp = gen_reg_rtx (Pmode);
13365 rtx val = force_operand (XEXP (x, 1), temp);
13366 if (val != temp)
13367 {
13368 if (GET_MODE (val) != Pmode)
13369 val = convert_to_mode (Pmode, val, 1);
13370 emit_move_insn (temp, val);
13371 }
13372
13373 XEXP (x, 1) = temp;
13374 return x;
13375 }
13376
13377 else if (REG_P (XEXP (x, 1)))
13378 {
13379 rtx temp = gen_reg_rtx (Pmode);
13380 rtx val = force_operand (XEXP (x, 0), temp);
13381 if (val != temp)
13382 {
13383 if (GET_MODE (val) != Pmode)
13384 val = convert_to_mode (Pmode, val, 1);
13385 emit_move_insn (temp, val);
13386 }
13387
13388 XEXP (x, 0) = temp;
13389 return x;
13390 }
13391 }
13392
13393 return x;
13394 }
13395 \f
13396 /* Print an integer constant expression in assembler syntax. Addition
13397 and subtraction are the only arithmetic that may appear in these
13398 expressions. FILE is the stdio stream to write to, X is the rtx, and
13399 CODE is the operand print code from the output string. */
13400
13401 static void
13402 output_pic_addr_const (FILE *file, rtx x, int code)
13403 {
13404 char buf[256];
13405
13406 switch (GET_CODE (x))
13407 {
13408 case PC:
13409 gcc_assert (flag_pic);
13410 putc ('.', file);
13411 break;
13412
13413 case SYMBOL_REF:
13414 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13415 output_addr_const (file, x);
13416 else
13417 {
13418 const char *name = XSTR (x, 0);
13419
13420 /* Mark the decl as referenced so that cgraph will
13421 output the function. */
13422 if (SYMBOL_REF_DECL (x))
13423 mark_decl_referenced (SYMBOL_REF_DECL (x));
13424
13425 #if TARGET_MACHO
13426 if (MACHOPIC_INDIRECT
13427 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13428 name = machopic_indirection_name (x, /*stub_p=*/true);
13429 #endif
13430 assemble_name (file, name);
13431 }
13432 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13433 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13434 fputs ("@PLT", file);
13435 break;
13436
13437 case LABEL_REF:
13438 x = XEXP (x, 0);
13439 /* FALLTHRU */
13440 case CODE_LABEL:
13441 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13442 assemble_name (asm_out_file, buf);
13443 break;
13444
13445 case CONST_INT:
13446 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13447 break;
13448
13449 case CONST:
13450 /* This used to output parentheses around the expression,
13451 but that does not work on the 386 (either ATT or BSD assembler). */
13452 output_pic_addr_const (file, XEXP (x, 0), code);
13453 break;
13454
13455 case CONST_DOUBLE:
13456 if (GET_MODE (x) == VOIDmode)
13457 {
13458 /* We can use %d if the number is <32 bits and positive. */
13459 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13460 fprintf (file, "0x%lx%08lx",
13461 (unsigned long) CONST_DOUBLE_HIGH (x),
13462 (unsigned long) CONST_DOUBLE_LOW (x));
13463 else
13464 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13465 }
13466 else
13467 /* We can't handle floating point constants;
13468 TARGET_PRINT_OPERAND must handle them. */
13469 output_operand_lossage ("floating constant misused");
13470 break;
13471
13472 case PLUS:
13473 /* Some assemblers need integer constants to appear first. */
13474 if (CONST_INT_P (XEXP (x, 0)))
13475 {
13476 output_pic_addr_const (file, XEXP (x, 0), code);
13477 putc ('+', file);
13478 output_pic_addr_const (file, XEXP (x, 1), code);
13479 }
13480 else
13481 {
13482 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13483 output_pic_addr_const (file, XEXP (x, 1), code);
13484 putc ('+', file);
13485 output_pic_addr_const (file, XEXP (x, 0), code);
13486 }
13487 break;
13488
13489 case MINUS:
13490 if (!TARGET_MACHO)
13491 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13492 output_pic_addr_const (file, XEXP (x, 0), code);
13493 putc ('-', file);
13494 output_pic_addr_const (file, XEXP (x, 1), code);
13495 if (!TARGET_MACHO)
13496 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13497 break;
13498
13499 case UNSPEC:
13500 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13501 {
13502 bool f = i386_asm_output_addr_const_extra (file, x);
13503 gcc_assert (f);
13504 break;
13505 }
13506
13507 gcc_assert (XVECLEN (x, 0) == 1);
13508 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13509 switch (XINT (x, 1))
13510 {
13511 case UNSPEC_GOT:
13512 fputs ("@GOT", file);
13513 break;
13514 case UNSPEC_GOTOFF:
13515 fputs ("@GOTOFF", file);
13516 break;
13517 case UNSPEC_PLTOFF:
13518 fputs ("@PLTOFF", file);
13519 break;
13520 case UNSPEC_PCREL:
13521 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13522 "(%rip)" : "[rip]", file);
13523 break;
13524 case UNSPEC_GOTPCREL:
13525 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13526 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13527 break;
13528 case UNSPEC_GOTTPOFF:
13529 /* FIXME: This might be @TPOFF in Sun ld too. */
13530 fputs ("@gottpoff", file);
13531 break;
13532 case UNSPEC_TPOFF:
13533 fputs ("@tpoff", file);
13534 break;
13535 case UNSPEC_NTPOFF:
13536 if (TARGET_64BIT)
13537 fputs ("@tpoff", file);
13538 else
13539 fputs ("@ntpoff", file);
13540 break;
13541 case UNSPEC_DTPOFF:
13542 fputs ("@dtpoff", file);
13543 break;
13544 case UNSPEC_GOTNTPOFF:
13545 if (TARGET_64BIT)
13546 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13547 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13548 else
13549 fputs ("@gotntpoff", file);
13550 break;
13551 case UNSPEC_INDNTPOFF:
13552 fputs ("@indntpoff", file);
13553 break;
13554 #if TARGET_MACHO
13555 case UNSPEC_MACHOPIC_OFFSET:
13556 putc ('-', file);
13557 machopic_output_function_base_name (file);
13558 break;
13559 #endif
13560 default:
13561 output_operand_lossage ("invalid UNSPEC as operand");
13562 break;
13563 }
13564 break;
13565
13566 default:
13567 output_operand_lossage ("invalid expression as operand");
13568 }
13569 }
13570
13571 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13572 We need to emit DTP-relative relocations. */
13573
13574 static void ATTRIBUTE_UNUSED
13575 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13576 {
13577 fputs (ASM_LONG, file);
13578 output_addr_const (file, x);
13579 fputs ("@dtpoff", file);
13580 switch (size)
13581 {
13582 case 4:
13583 break;
13584 case 8:
13585 fputs (", 0", file);
13586 break;
13587 default:
13588 gcc_unreachable ();
13589 }
13590 }
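/* For reference (symbol name hypothetical), the routine above emits
   assembly along the lines of ".long sym@dtpoff" for SIZE 4 and
   ".long sym@dtpoff, 0" for SIZE 8, assuming ASM_LONG expands to ".long". */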
13591
13592 /* Return true if X is a representation of the PIC register. This copes
13593 with calls from ix86_find_base_term, where the register might have
13594 been replaced by a cselib value. */
13595
13596 static bool
13597 ix86_pic_register_p (rtx x)
13598 {
13599 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13600 return (pic_offset_table_rtx
13601 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13602 else
13603 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13604 }
13605
13606 /* Helper function for ix86_delegitimize_address.
13607 Attempt to delegitimize TLS local-exec accesses. */
13608
13609 static rtx
13610 ix86_delegitimize_tls_address (rtx orig_x)
13611 {
13612 rtx x = orig_x, unspec;
13613 struct ix86_address addr;
13614
13615 if (!TARGET_TLS_DIRECT_SEG_REFS)
13616 return orig_x;
13617 if (MEM_P (x))
13618 x = XEXP (x, 0);
13619 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13620 return orig_x;
13621 if (ix86_decompose_address (x, &addr) == 0
13622 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13623 || addr.disp == NULL_RTX
13624 || GET_CODE (addr.disp) != CONST)
13625 return orig_x;
13626 unspec = XEXP (addr.disp, 0);
13627 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13628 unspec = XEXP (unspec, 0);
13629 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13630 return orig_x;
13631 x = XVECEXP (unspec, 0, 0);
13632 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13633 if (unspec != XEXP (addr.disp, 0))
13634 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13635 if (addr.index)
13636 {
13637 rtx idx = addr.index;
13638 if (addr.scale != 1)
13639 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13640 x = gen_rtx_PLUS (Pmode, idx, x);
13641 }
13642 if (addr.base)
13643 x = gen_rtx_PLUS (Pmode, addr.base, x);
13644 if (MEM_P (orig_x))
13645 x = replace_equiv_address_nv (orig_x, x);
13646 return x;
13647 }
13648
13649 /* In the name of slightly smaller debug output, and to cater to
13650 general assembler lossage, recognize PIC+GOTOFF and turn it back
13651 into a direct symbol reference.
13652
13653 On Darwin, this is necessary to avoid a crash, because Darwin
13654 has a different PIC label for each routine but the DWARF debugging
13655 information is not associated with any particular routine, so it's
13656 necessary to remove references to the PIC label from RTL stored by
13657 the DWARF output code. */
13658
13659 static rtx
13660 ix86_delegitimize_address (rtx x)
13661 {
13662 rtx orig_x = delegitimize_mem_from_attrs (x);
13663 /* addend is NULL or some rtx if x is something+GOTOFF where
13664 something doesn't include the PIC register. */
13665 rtx addend = NULL_RTX;
13666 /* reg_addend is NULL or a multiple of some register. */
13667 rtx reg_addend = NULL_RTX;
13668 /* const_addend is NULL or a const_int. */
13669 rtx const_addend = NULL_RTX;
13670 /* This is the result, or NULL. */
13671 rtx result = NULL_RTX;
13672
13673 x = orig_x;
13674
13675 if (MEM_P (x))
13676 x = XEXP (x, 0);
13677
13678 if (TARGET_64BIT)
13679 {
13680 if (GET_CODE (x) == CONST
13681 && GET_CODE (XEXP (x, 0)) == PLUS
13682 && GET_MODE (XEXP (x, 0)) == Pmode
13683 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13684 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13685 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13686 {
13687 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13688 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13689 if (MEM_P (orig_x))
13690 x = replace_equiv_address_nv (orig_x, x);
13691 return x;
13692 }
13693 if (GET_CODE (x) != CONST
13694 || GET_CODE (XEXP (x, 0)) != UNSPEC
13695 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13696 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13697 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13698 return ix86_delegitimize_tls_address (orig_x);
13699 x = XVECEXP (XEXP (x, 0), 0, 0);
13700 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13701 {
13702 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13703 GET_MODE (x), 0);
13704 if (x == NULL_RTX)
13705 return orig_x;
13706 }
13707 return x;
13708 }
13709
13710 if (GET_CODE (x) != PLUS
13711 || GET_CODE (XEXP (x, 1)) != CONST)
13712 return ix86_delegitimize_tls_address (orig_x);
13713
13714 if (ix86_pic_register_p (XEXP (x, 0)))
13715 /* %ebx + GOT/GOTOFF */
13716 ;
13717 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13718 {
13719 /* %ebx + %reg * scale + GOT/GOTOFF */
13720 reg_addend = XEXP (x, 0);
13721 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13722 reg_addend = XEXP (reg_addend, 1);
13723 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13724 reg_addend = XEXP (reg_addend, 0);
13725 else
13726 {
13727 reg_addend = NULL_RTX;
13728 addend = XEXP (x, 0);
13729 }
13730 }
13731 else
13732 addend = XEXP (x, 0);
13733
13734 x = XEXP (XEXP (x, 1), 0);
13735 if (GET_CODE (x) == PLUS
13736 && CONST_INT_P (XEXP (x, 1)))
13737 {
13738 const_addend = XEXP (x, 1);
13739 x = XEXP (x, 0);
13740 }
13741
13742 if (GET_CODE (x) == UNSPEC
13743 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13744 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13745 result = XVECEXP (x, 0, 0);
13746
13747 if (TARGET_MACHO && darwin_local_data_pic (x)
13748 && !MEM_P (orig_x))
13749 result = XVECEXP (x, 0, 0);
13750
13751 if (! result)
13752 return ix86_delegitimize_tls_address (orig_x);
13753
13754 if (const_addend)
13755 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13756 if (reg_addend)
13757 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13758 if (addend)
13759 {
13760 /* If the rest of original X doesn't involve the PIC register, add
13761 addend and subtract pic_offset_table_rtx. This can happen e.g.
13762 for code like:
13763 leal (%ebx, %ecx, 4), %ecx
13764 ...
13765 movl foo@GOTOFF(%ecx), %edx
13766 in which case we return (%ecx - %ebx) + foo. */
13767 if (pic_offset_table_rtx)
13768 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13769 pic_offset_table_rtx),
13770 result);
13771 else
13772 return orig_x;
13773 }
13774 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13775 {
13776 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13777 if (result == NULL_RTX)
13778 return orig_x;
13779 }
13780 return result;
13781 }
13782
13783 /* If X is a machine specific address (i.e. a symbol or label being
13784 referenced as a displacement from the GOT implemented using an
13785 UNSPEC), then return the base term. Otherwise return X. */
13786
13787 rtx
13788 ix86_find_base_term (rtx x)
13789 {
13790 rtx term;
13791
13792 if (TARGET_64BIT)
13793 {
13794 if (GET_CODE (x) != CONST)
13795 return x;
13796 term = XEXP (x, 0);
13797 if (GET_CODE (term) == PLUS
13798 && (CONST_INT_P (XEXP (term, 1))
13799 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13800 term = XEXP (term, 0);
13801 if (GET_CODE (term) != UNSPEC
13802 || (XINT (term, 1) != UNSPEC_GOTPCREL
13803 && XINT (term, 1) != UNSPEC_PCREL))
13804 return x;
13805
13806 return XVECEXP (term, 0, 0);
13807 }
13808
13809 return ix86_delegitimize_address (x);
13810 }
13811 \f
13812 static void
13813 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13814 bool fp, FILE *file)
13815 {
13816 const char *suffix;
13817
13818 if (mode == CCFPmode || mode == CCFPUmode)
13819 {
13820 code = ix86_fp_compare_code_to_integer (code);
13821 mode = CCmode;
13822 }
13823 if (reverse)
13824 code = reverse_condition (code);
13825
13826 switch (code)
13827 {
13828 case EQ:
13829 switch (mode)
13830 {
13831 case CCAmode:
13832 suffix = "a";
13833 break;
13834
13835 case CCCmode:
13836 suffix = "c";
13837 break;
13838
13839 case CCOmode:
13840 suffix = "o";
13841 break;
13842
13843 case CCSmode:
13844 suffix = "s";
13845 break;
13846
13847 default:
13848 suffix = "e";
13849 }
13850 break;
13851 case NE:
13852 switch (mode)
13853 {
13854 case CCAmode:
13855 suffix = "na";
13856 break;
13857
13858 case CCCmode:
13859 suffix = "nc";
13860 break;
13861
13862 case CCOmode:
13863 suffix = "no";
13864 break;
13865
13866 case CCSmode:
13867 suffix = "ns";
13868 break;
13869
13870 default:
13871 suffix = "ne";
13872 }
13873 break;
13874 case GT:
13875 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13876 suffix = "g";
13877 break;
13878 case GTU:
13879 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13880 Those same assemblers have the same but opposite lossage on cmov. */
13881 if (mode == CCmode)
13882 suffix = fp ? "nbe" : "a";
13883 else if (mode == CCCmode)
13884 suffix = "b";
13885 else
13886 gcc_unreachable ();
13887 break;
13888 case LT:
13889 switch (mode)
13890 {
13891 case CCNOmode:
13892 case CCGOCmode:
13893 suffix = "s";
13894 break;
13895
13896 case CCmode:
13897 case CCGCmode:
13898 suffix = "l";
13899 break;
13900
13901 default:
13902 gcc_unreachable ();
13903 }
13904 break;
13905 case LTU:
13906 gcc_assert (mode == CCmode || mode == CCCmode);
13907 suffix = "b";
13908 break;
13909 case GE:
13910 switch (mode)
13911 {
13912 case CCNOmode:
13913 case CCGOCmode:
13914 suffix = "ns";
13915 break;
13916
13917 case CCmode:
13918 case CCGCmode:
13919 suffix = "ge";
13920 break;
13921
13922 default:
13923 gcc_unreachable ();
13924 }
13925 break;
13926 case GEU:
13927 /* ??? As above. */
13928 gcc_assert (mode == CCmode || mode == CCCmode);
13929 suffix = fp ? "nb" : "ae";
13930 break;
13931 case LE:
13932 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13933 suffix = "le";
13934 break;
13935 case LEU:
13936 /* ??? As above. */
13937 if (mode == CCmode)
13938 suffix = "be";
13939 else if (mode == CCCmode)
13940 suffix = fp ? "nb" : "ae";
13941 else
13942 gcc_unreachable ();
13943 break;
13944 case UNORDERED:
13945 suffix = fp ? "u" : "p";
13946 break;
13947 case ORDERED:
13948 suffix = fp ? "nu" : "np";
13949 break;
13950 default:
13951 gcc_unreachable ();
13952 }
13953 fputs (suffix, file);
13954 }
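/* As a hypothetical illustration, CODE == GT with MODE == CCmode emits the
   suffix "g", so an output template such as "set%C0\t%0" would print
   "setg"; CODE == LTU emits "b", giving "setb".  */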
13955
13956 /* Print the name of register X to FILE based on its machine mode and number.
13957 If CODE is 'w', pretend the mode is HImode.
13958 If CODE is 'b', pretend the mode is QImode.
13959 If CODE is 'k', pretend the mode is SImode.
13960 If CODE is 'q', pretend the mode is DImode.
13961 If CODE is 'x', pretend the mode is V4SFmode.
13962 If CODE is 't', pretend the mode is V8SFmode.
13963 If CODE is 'h', pretend the reg is the 'high' byte register.
 13964 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13965 If CODE is 'd', duplicate the operand for AVX instruction.
13966 */
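/* Illustrative only: for the ax register, CODE 'q' prints "rax" (on 64-bit
   targets), 'k' prints "eax", 'w' prints "ax", 'b' prints "al" and 'h'
   prints "ah", each preceded by '%' in AT&T syntax.  */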
13967
13968 void
13969 print_reg (rtx x, int code, FILE *file)
13970 {
13971 const char *reg;
13972 bool duplicated = code == 'd' && TARGET_AVX;
13973
13974 gcc_assert (x == pc_rtx
13975 || (REGNO (x) != ARG_POINTER_REGNUM
13976 && REGNO (x) != FRAME_POINTER_REGNUM
13977 && REGNO (x) != FLAGS_REG
13978 && REGNO (x) != FPSR_REG
13979 && REGNO (x) != FPCR_REG));
13980
13981 if (ASSEMBLER_DIALECT == ASM_ATT)
13982 putc ('%', file);
13983
13984 if (x == pc_rtx)
13985 {
13986 gcc_assert (TARGET_64BIT);
13987 fputs ("rip", file);
13988 return;
13989 }
13990
13991 if (code == 'w' || MMX_REG_P (x))
13992 code = 2;
13993 else if (code == 'b')
13994 code = 1;
13995 else if (code == 'k')
13996 code = 4;
13997 else if (code == 'q')
13998 code = 8;
13999 else if (code == 'y')
14000 code = 3;
14001 else if (code == 'h')
14002 code = 0;
14003 else if (code == 'x')
14004 code = 16;
14005 else if (code == 't')
14006 code = 32;
14007 else
14008 code = GET_MODE_SIZE (GET_MODE (x));
14009
 14010 /* Irritatingly, AMD extended registers use a different naming convention
 14011 from the normal registers: "r%d[bwd]". */
14012 if (REX_INT_REG_P (x))
14013 {
14014 gcc_assert (TARGET_64BIT);
14015 putc ('r', file);
14016 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
14017 switch (code)
14018 {
14019 case 0:
14020 error ("extended registers have no high halves");
14021 break;
14022 case 1:
14023 putc ('b', file);
14024 break;
14025 case 2:
14026 putc ('w', file);
14027 break;
14028 case 4:
14029 putc ('d', file);
14030 break;
14031 case 8:
14032 /* no suffix */
14033 break;
14034 default:
14035 error ("unsupported operand size for extended register");
14036 break;
14037 }
14038 return;
14039 }
14040
14041 reg = NULL;
14042 switch (code)
14043 {
14044 case 3:
14045 if (STACK_TOP_P (x))
14046 {
14047 reg = "st(0)";
14048 break;
14049 }
14050 /* FALLTHRU */
14051 case 8:
14052 case 4:
14053 case 12:
14054 if (! ANY_FP_REG_P (x))
14055 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14056 /* FALLTHRU */
14057 case 16:
14058 case 2:
14059 normal:
14060 reg = hi_reg_name[REGNO (x)];
14061 break;
14062 case 1:
14063 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
14064 goto normal;
14065 reg = qi_reg_name[REGNO (x)];
14066 break;
14067 case 0:
14068 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
14069 goto normal;
14070 reg = qi_high_reg_name[REGNO (x)];
14071 break;
14072 case 32:
14073 if (SSE_REG_P (x))
14074 {
14075 gcc_assert (!duplicated);
14076 putc ('y', file);
14077 fputs (hi_reg_name[REGNO (x)] + 1, file);
14078 return;
14079 }
14080 break;
14081 default:
14082 gcc_unreachable ();
14083 }
14084
14085 fputs (reg, file);
14086 if (duplicated)
14087 {
14088 if (ASSEMBLER_DIALECT == ASM_ATT)
14089 fprintf (file, ", %%%s", reg);
14090 else
14091 fprintf (file, ", %s", reg);
14092 }
14093 }
14094
14095 /* Locate some local-dynamic symbol still in use by this function
14096 so that we can print its name in some tls_local_dynamic_base
14097 pattern. */
14098
14099 static int
14100 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14101 {
14102 rtx x = *px;
14103
14104 if (GET_CODE (x) == SYMBOL_REF
14105 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14106 {
14107 cfun->machine->some_ld_name = XSTR (x, 0);
14108 return 1;
14109 }
14110
14111 return 0;
14112 }
14113
14114 static const char *
14115 get_some_local_dynamic_name (void)
14116 {
14117 rtx insn;
14118
14119 if (cfun->machine->some_ld_name)
14120 return cfun->machine->some_ld_name;
14121
14122 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14123 if (NONDEBUG_INSN_P (insn)
14124 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14125 return cfun->machine->some_ld_name;
14126
14127 return NULL;
14128 }
14129
14130 /* Meaning of CODE:
14131 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14132 C -- print opcode suffix for set/cmov insn.
14133 c -- like C, but print reversed condition
14134 F,f -- likewise, but for floating-point.
14135 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14136 otherwise nothing
14137 R -- print the prefix for register names.
14138 z -- print the opcode suffix for the size of the current operand.
14139 Z -- likewise, with special suffixes for x87 instructions.
14140 * -- print a star (in certain assembler syntax)
14141 A -- print an absolute memory reference.
14142 E -- print address with DImode register names if TARGET_64BIT.
14143 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14144 s -- print a shift double count, followed by the assembler's argument
14145 delimiter.
14146 b -- print the QImode name of the register for the indicated operand.
14147 %b0 would print %al if operands[0] is reg 0.
14148 w -- likewise, print the HImode name of the register.
14149 k -- likewise, print the SImode name of the register.
14150 q -- likewise, print the DImode name of the register.
14151 x -- likewise, print the V4SFmode name of the register.
14152 t -- likewise, print the V8SFmode name of the register.
14153 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14154 y -- print "st(0)" instead of "st" as a register.
14155 d -- print duplicated register operand for an AVX instruction.
14156 D -- print condition for SSE cmp instruction.
14157 P -- if PIC, print an @PLT suffix.
14158 p -- print raw symbol name.
14159 X -- don't print any sort of PIC '@' suffix for a symbol.
14160 & -- print some in-use local-dynamic symbol name.
14161 H -- print a memory address offset by 8; used for sse high-parts
14162 Y -- print condition for XOP pcom* instruction.
14163 + -- print a branch hint as 'cs' or 'ds' prefix
14164 ; -- print a semicolon (after prefixes due to bug in older gas).
14165 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14166 @ -- print a segment register of thread base pointer load
14167 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14168 */
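/* Illustrative example: in a template such as "add%z0\t{%1, %0|%0, %1}",
   the 'z' code expands from the size of operands[0], so an SImode operand
   yields "addl" in AT&T syntax, while "%b0", "%w0" and "%k0" would print
   the QImode, HImode and SImode names of that register.  */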
14169
14170 void
14171 ix86_print_operand (FILE *file, rtx x, int code)
14172 {
14173 if (code)
14174 {
14175 switch (code)
14176 {
14177 case 'A':
14178 switch (ASSEMBLER_DIALECT)
14179 {
14180 case ASM_ATT:
14181 putc ('*', file);
14182 break;
14183
14184 case ASM_INTEL:
14185 /* Intel syntax. For absolute addresses, registers should not
14186 be surrounded by brackets.  */
14187 if (!REG_P (x))
14188 {
14189 putc ('[', file);
14190 ix86_print_operand (file, x, 0);
14191 putc (']', file);
14192 return;
14193 }
14194 break;
14195
14196 default:
14197 gcc_unreachable ();
14198 }
14199
14200 ix86_print_operand (file, x, 0);
14201 return;
14202
14203 case 'E':
14204 /* Wrap address in an UNSPEC to declare special handling. */
14205 if (TARGET_64BIT)
14206 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14207
14208 output_address (x);
14209 return;
14210
14211 case 'L':
14212 if (ASSEMBLER_DIALECT == ASM_ATT)
14213 putc ('l', file);
14214 return;
14215
14216 case 'W':
14217 if (ASSEMBLER_DIALECT == ASM_ATT)
14218 putc ('w', file);
14219 return;
14220
14221 case 'B':
14222 if (ASSEMBLER_DIALECT == ASM_ATT)
14223 putc ('b', file);
14224 return;
14225
14226 case 'Q':
14227 if (ASSEMBLER_DIALECT == ASM_ATT)
14228 putc ('l', file);
14229 return;
14230
14231 case 'S':
14232 if (ASSEMBLER_DIALECT == ASM_ATT)
14233 putc ('s', file);
14234 return;
14235
14236 case 'T':
14237 if (ASSEMBLER_DIALECT == ASM_ATT)
14238 putc ('t', file);
14239 return;
14240
14241 case 'O':
14242 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14243 if (ASSEMBLER_DIALECT != ASM_ATT)
14244 return;
14245
14246 switch (GET_MODE_SIZE (GET_MODE (x)))
14247 {
14248 case 2:
14249 putc ('w', file);
14250 break;
14251
14252 case 4:
14253 putc ('l', file);
14254 break;
14255
14256 case 8:
14257 putc ('q', file);
14258 break;
14259
14260 default:
14261 output_operand_lossage
14262 ("invalid operand size for operand code 'O'");
14263 return;
14264 }
14265
14266 putc ('.', file);
14267 #endif
14268 return;
14269
14270 case 'z':
14271 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14272 {
14273 /* Opcodes don't get size suffixes if using Intel syntax.  */
14274 if (ASSEMBLER_DIALECT == ASM_INTEL)
14275 return;
14276
14277 switch (GET_MODE_SIZE (GET_MODE (x)))
14278 {
14279 case 1:
14280 putc ('b', file);
14281 return;
14282
14283 case 2:
14284 putc ('w', file);
14285 return;
14286
14287 case 4:
14288 putc ('l', file);
14289 return;
14290
14291 case 8:
14292 putc ('q', file);
14293 return;
14294
14295 default:
14296 output_operand_lossage
14297 ("invalid operand size for operand code 'z'");
14298 return;
14299 }
14300 }
14301
14302 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14303 warning
14304 (0, "non-integer operand used with operand code 'z'");
14305 /* FALLTHRU */
14306
14307 case 'Z':
14308 /* 387 opcodes don't get size suffixes if using Intel syntax.  */
14309 if (ASSEMBLER_DIALECT == ASM_INTEL)
14310 return;
14311
14312 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14313 {
14314 switch (GET_MODE_SIZE (GET_MODE (x)))
14315 {
14316 case 2:
14317 #ifdef HAVE_AS_IX86_FILDS
14318 putc ('s', file);
14319 #endif
14320 return;
14321
14322 case 4:
14323 putc ('l', file);
14324 return;
14325
14326 case 8:
14327 #ifdef HAVE_AS_IX86_FILDQ
14328 putc ('q', file);
14329 #else
14330 fputs ("ll", file);
14331 #endif
14332 return;
14333
14334 default:
14335 break;
14336 }
14337 }
14338 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14339 {
14340 /* 387 opcodes don't get size suffixes
14341 if the operands are registers. */
14342 if (STACK_REG_P (x))
14343 return;
14344
14345 switch (GET_MODE_SIZE (GET_MODE (x)))
14346 {
14347 case 4:
14348 putc ('s', file);
14349 return;
14350
14351 case 8:
14352 putc ('l', file);
14353 return;
14354
14355 case 12:
14356 case 16:
14357 putc ('t', file);
14358 return;
14359
14360 default:
14361 break;
14362 }
14363 }
14364 else
14365 {
14366 output_operand_lossage
14367 ("invalid operand type used with operand code 'Z'");
14368 return;
14369 }
14370
14371 output_operand_lossage
14372 ("invalid operand size for operand code 'Z'");
14373 return;
14374
14375 case 'd':
14376 case 'b':
14377 case 'w':
14378 case 'k':
14379 case 'q':
14380 case 'h':
14381 case 't':
14382 case 'y':
14383 case 'x':
14384 case 'X':
14385 case 'P':
14386 case 'p':
14387 break;
14388
14389 case 's':
14390 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14391 {
14392 ix86_print_operand (file, x, 0);
14393 fputs (", ", file);
14394 }
14395 return;
14396
14397 case 'Y':
14398 switch (GET_CODE (x))
14399 {
14400 case NE:
14401 fputs ("neq", file);
14402 break;
14403 case EQ:
14404 fputs ("eq", file);
14405 break;
14406 case GE:
14407 case GEU:
14408 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14409 break;
14410 case GT:
14411 case GTU:
14412 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14413 break;
14414 case LE:
14415 case LEU:
14416 fputs ("le", file);
14417 break;
14418 case LT:
14419 case LTU:
14420 fputs ("lt", file);
14421 break;
14422 case UNORDERED:
14423 fputs ("unord", file);
14424 break;
14425 case ORDERED:
14426 fputs ("ord", file);
14427 break;
14428 case UNEQ:
14429 fputs ("ueq", file);
14430 break;
14431 case UNGE:
14432 fputs ("nlt", file);
14433 break;
14434 case UNGT:
14435 fputs ("nle", file);
14436 break;
14437 case UNLE:
14438 fputs ("ule", file);
14439 break;
14440 case UNLT:
14441 fputs ("ult", file);
14442 break;
14443 case LTGT:
14444 fputs ("une", file);
14445 break;
14446 default:
14447 output_operand_lossage ("operand is not a condition code, "
14448 "invalid operand code 'Y'");
14449 return;
14450 }
14451 return;
14452
14453 case 'D':
14454 /* Little bit of braindamage here. The SSE compare instructions
14455 use completely different names for the comparisons than the
14456 fp conditional moves do.  */
14457 switch (GET_CODE (x))
14458 {
14459 case UNEQ:
14460 if (TARGET_AVX)
14461 {
14462 fputs ("eq_us", file);
14463 break;
14464 }
14465 case EQ:
14466 fputs ("eq", file);
14467 break;
14468 case UNLT:
14469 if (TARGET_AVX)
14470 {
14471 fputs ("nge", file);
14472 break;
14473 }
14474 case LT:
14475 fputs ("lt", file);
14476 break;
14477 case UNLE:
14478 if (TARGET_AVX)
14479 {
14480 fputs ("ngt", file);
14481 break;
14482 }
14483 case LE:
14484 fputs ("le", file);
14485 break;
14486 case UNORDERED:
14487 fputs ("unord", file);
14488 break;
14489 case LTGT:
14490 if (TARGET_AVX)
14491 {
14492 fputs ("neq_oq", file);
14493 break;
14494 }
14495 case NE:
14496 fputs ("neq", file);
14497 break;
14498 case GE:
14499 if (TARGET_AVX)
14500 {
14501 fputs ("ge", file);
14502 break;
14503 }
14504 case UNGE:
14505 fputs ("nlt", file);
14506 break;
14507 case GT:
14508 if (TARGET_AVX)
14509 {
14510 fputs ("gt", file);
14511 break;
14512 }
14513 case UNGT:
14514 fputs ("nle", file);
14515 break;
14516 case ORDERED:
14517 fputs ("ord", file);
14518 break;
14519 default:
14520 output_operand_lossage ("operand is not a condition code, "
14521 "invalid operand code 'D'");
14522 return;
14523 }
14524 return;
14525
14526 case 'F':
14527 case 'f':
14528 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14529 if (ASSEMBLER_DIALECT == ASM_ATT)
14530 putc ('.', file);
14531 #endif
14532
14533 case 'C':
14534 case 'c':
14535 if (!COMPARISON_P (x))
14536 {
14537 output_operand_lossage ("operand is not a condition code, "
14538 "invalid operand code '%c'", code);
14539 return;
14540 }
14541 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14542 code == 'c' || code == 'f',
14543 code == 'F' || code == 'f',
14544 file);
14545 return;
14546
14547 case 'H':
14548 if (!offsettable_memref_p (x))
14549 {
14550 output_operand_lossage ("operand is not an offsettable memory "
14551 "reference, invalid operand code 'H'");
14552 return;
14553 }
14554 /* It doesn't actually matter what mode we use here, as we're
14555 only going to use this for printing. */
14556 x = adjust_address_nv (x, DImode, 8);
14557 break;
14558
14559 case 'K':
14560 gcc_assert (CONST_INT_P (x));
14561
14562 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14563 #ifdef HAVE_AS_IX86_HLE
14564 fputs ("xacquire ", file);
14565 #else
14566 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14567 #endif
14568 else if (INTVAL (x) & IX86_HLE_RELEASE)
14569 #ifdef HAVE_AS_IX86_HLE
14570 fputs ("xrelease ", file);
14571 #else
14572 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14573 #endif
14574 /* We do not want to print the value of the operand.  */
14575 return;
14576
14577 case '*':
14578 if (ASSEMBLER_DIALECT == ASM_ATT)
14579 putc ('*', file);
14580 return;
14581
14582 case '&':
14583 {
14584 const char *name = get_some_local_dynamic_name ();
14585 if (name == NULL)
14586 output_operand_lossage ("'%%&' used without any "
14587 "local dynamic TLS references");
14588 else
14589 assemble_name (file, name);
14590 return;
14591 }
14592
14593 case '+':
14594 {
14595 rtx x;
14596
14597 if (!optimize
14598 || optimize_function_for_size_p (cfun)
14599 || !TARGET_BRANCH_PREDICTION_HINTS)
14600 return;
14601
14602 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14603 if (x)
14604 {
14605 int pred_val = INTVAL (XEXP (x, 0));
14606
14607 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14608 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14609 {
14610 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14611 bool cputaken
14612 = final_forward_branch_p (current_output_insn) == 0;
14613
14614 /* Emit hints only in the case where default branch prediction
14615 heuristics would fail. */
14616 if (taken != cputaken)
14617 {
14618 /* We use 3e (DS) prefix for taken branches and
14619 2e (CS) prefix for not taken branches. */
14620 if (taken)
14621 fputs ("ds ; ", file);
14622 else
14623 fputs ("cs ; ", file);
14624 }
14625 }
14626 }
14627 return;
14628 }
14629
14630 case ';':
14631 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14632 putc (';', file);
14633 #endif
14634 return;
14635
14636 case '@':
14637 if (ASSEMBLER_DIALECT == ASM_ATT)
14638 putc ('%', file);
14639
14640 /* The kernel uses a different segment register for performance
14641 reasons; a system call would not have to trash the userspace
14642 segment register, which would be expensive. */
14643 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14644 fputs ("fs", file);
14645 else
14646 fputs ("gs", file);
14647 return;
14648
14649 case '~':
14650 putc (TARGET_AVX2 ? 'i' : 'f', file);
14651 return;
14652
14653 case '^':
14654 if (TARGET_64BIT && Pmode != word_mode)
14655 fputs ("addr32 ", file);
14656 return;
14657
14658 default:
14659 output_operand_lossage ("invalid operand code '%c'", code);
14660 }
14661 }
14662
14663 if (REG_P (x))
14664 print_reg (x, code, file);
14665
14666 else if (MEM_P (x))
14667 {
14668 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14669 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14670 && GET_MODE (x) != BLKmode)
14671 {
14672 const char * size;
14673 switch (GET_MODE_SIZE (GET_MODE (x)))
14674 {
14675 case 1: size = "BYTE"; break;
14676 case 2: size = "WORD"; break;
14677 case 4: size = "DWORD"; break;
14678 case 8: size = "QWORD"; break;
14679 case 12: size = "TBYTE"; break;
14680 case 16:
14681 if (GET_MODE (x) == XFmode)
14682 size = "TBYTE";
14683 else
14684 size = "XMMWORD";
14685 break;
14686 case 32: size = "YMMWORD"; break;
14687 default:
14688 gcc_unreachable ();
14689 }
14690
14691 /* Check for explicit size override (codes 'b', 'w', 'k',
14692 'q' and 'x') */
14693 if (code == 'b')
14694 size = "BYTE";
14695 else if (code == 'w')
14696 size = "WORD";
14697 else if (code == 'k')
14698 size = "DWORD";
14699 else if (code == 'q')
14700 size = "QWORD";
14701 else if (code == 'x')
14702 size = "XMMWORD";
14703
14704 fputs (size, file);
14705 fputs (" PTR ", file);
14706 }
14707
14708 x = XEXP (x, 0);
14709 /* Avoid (%rip) for call operands. */
14710 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14711 && !CONST_INT_P (x))
14712 output_addr_const (file, x);
14713 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14714 output_operand_lossage ("invalid constraints for operand");
14715 else
14716 output_address (x);
14717 }
14718
14719 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14720 {
14721 REAL_VALUE_TYPE r;
14722 long l;
14723
14724 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14725 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14726
14727 if (ASSEMBLER_DIALECT == ASM_ATT)
14728 putc ('$', file);
14729 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14730 if (code == 'q')
14731 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14732 else
14733 fprintf (file, "0x%08x", (unsigned int) l);
14734 }
14735
14736 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14737 {
14738 REAL_VALUE_TYPE r;
14739 long l[2];
14740
14741 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14742 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14743
14744 if (ASSEMBLER_DIALECT == ASM_ATT)
14745 putc ('$', file);
14746 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14747 }
14748
14749 /* These float cases don't actually occur as immediate operands. */
14750 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14751 {
14752 char dstr[30];
14753
14754 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14755 fputs (dstr, file);
14756 }
14757
14758 else
14759 {
14760 /* We have patterns that allow zero sets of memory, for instance.
14761 In 64-bit mode, we should probably support all 8-byte vectors,
14762 since we can in fact encode that into an immediate. */
14763 if (GET_CODE (x) == CONST_VECTOR)
14764 {
14765 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14766 x = const0_rtx;
14767 }
14768
14769 if (code != 'P' && code != 'p')
14770 {
14771 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14772 {
14773 if (ASSEMBLER_DIALECT == ASM_ATT)
14774 putc ('$', file);
14775 }
14776 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14777 || GET_CODE (x) == LABEL_REF)
14778 {
14779 if (ASSEMBLER_DIALECT == ASM_ATT)
14780 putc ('$', file);
14781 else
14782 fputs ("OFFSET FLAT:", file);
14783 }
14784 }
14785 if (CONST_INT_P (x))
14786 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14787 else if (flag_pic || MACHOPIC_INDIRECT)
14788 output_pic_addr_const (file, x, code);
14789 else
14790 output_addr_const (file, x);
14791 }
14792 }
14793
14794 static bool
14795 ix86_print_operand_punct_valid_p (unsigned char code)
14796 {
14797 return (code == '@' || code == '*' || code == '+' || code == '&'
14798 || code == ';' || code == '~' || code == '^');
14799 }
14800 \f
14801 /* Print a memory operand whose address is ADDR. */
14802
14803 static void
14804 ix86_print_operand_address (FILE *file, rtx addr)
14805 {
14806 struct ix86_address parts;
14807 rtx base, index, disp;
14808 int scale;
14809 int ok;
14810 bool vsib = false;
14811 int code = 0;
14812
14813 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14814 {
14815 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14816 gcc_assert (parts.index == NULL_RTX);
14817 parts.index = XVECEXP (addr, 0, 1);
14818 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14819 addr = XVECEXP (addr, 0, 0);
14820 vsib = true;
14821 }
14822 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14823 {
14824 gcc_assert (TARGET_64BIT);
14825 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14826 code = 'q';
14827 }
14828 else
14829 ok = ix86_decompose_address (addr, &parts);
14830
14831 gcc_assert (ok);
14832
14833 if (parts.base && GET_CODE (parts.base) == SUBREG)
14834 {
14835 rtx tmp = SUBREG_REG (parts.base);
14836 parts.base = simplify_subreg (GET_MODE (parts.base),
14837 tmp, GET_MODE (tmp), 0);
14838 gcc_assert (parts.base != NULL_RTX);
14839 }
14840
14841 if (parts.index && GET_CODE (parts.index) == SUBREG)
14842 {
14843 rtx tmp = SUBREG_REG (parts.index);
14844 parts.index = simplify_subreg (GET_MODE (parts.index),
14845 tmp, GET_MODE (tmp), 0);
14846 gcc_assert (parts.index != NULL_RTX);
14847 }
14848
14849 base = parts.base;
14850 index = parts.index;
14851 disp = parts.disp;
14852 scale = parts.scale;
14853
14854 switch (parts.seg)
14855 {
14856 case SEG_DEFAULT:
14857 break;
14858 case SEG_FS:
14859 case SEG_GS:
14860 if (ASSEMBLER_DIALECT == ASM_ATT)
14861 putc ('%', file);
14862 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14863 break;
14864 default:
14865 gcc_unreachable ();
14866 }
14867
14868 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14869 if (TARGET_64BIT && !base && !index)
14870 {
14871 rtx symbol = disp;
14872
14873 if (GET_CODE (disp) == CONST
14874 && GET_CODE (XEXP (disp, 0)) == PLUS
14875 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14876 symbol = XEXP (XEXP (disp, 0), 0);
14877
14878 if (GET_CODE (symbol) == LABEL_REF
14879 || (GET_CODE (symbol) == SYMBOL_REF
14880 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14881 base = pc_rtx;
14882 }
14883 if (!base && !index)
14884 {
14885 /* A displacement-only address requires special attention.  */
14886
14887 if (CONST_INT_P (disp))
14888 {
14889 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14890 fputs ("ds:", file);
14891 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14892 }
14893 else if (flag_pic)
14894 output_pic_addr_const (file, disp, 0);
14895 else
14896 output_addr_const (file, disp);
14897 }
14898 else
14899 {
14900 /* Print SImode register names to force addr32 prefix. */
14901 if (GET_CODE (addr) == SUBREG)
14902 {
14903 gcc_assert (TARGET_64BIT);
14904 gcc_assert (GET_MODE (addr) == SImode);
14905 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14906 gcc_assert (!code);
14907 code = 'l';
14908 }
14909 else if (GET_CODE (addr) == ZERO_EXTEND
14910 || GET_CODE (addr) == AND)
14911 {
14912 gcc_assert (TARGET_64BIT);
14913 gcc_assert (GET_MODE (addr) == DImode);
14914 gcc_assert (!code);
14915 code = 'l';
14916 }
14917
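      /* AT&T syntax writes the address as disp(base,index,scale), while
	 Intel syntax writes it as [base+index*scale+disp].  */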
14918 if (ASSEMBLER_DIALECT == ASM_ATT)
14919 {
14920 if (disp)
14921 {
14922 if (flag_pic)
14923 output_pic_addr_const (file, disp, 0);
14924 else if (GET_CODE (disp) == LABEL_REF)
14925 output_asm_label (disp);
14926 else
14927 output_addr_const (file, disp);
14928 }
14929
14930 putc ('(', file);
14931 if (base)
14932 print_reg (base, code, file);
14933 if (index)
14934 {
14935 putc (',', file);
14936 print_reg (index, vsib ? 0 : code, file);
14937 if (scale != 1 || vsib)
14938 fprintf (file, ",%d", scale);
14939 }
14940 putc (')', file);
14941 }
14942 else
14943 {
14944 rtx offset = NULL_RTX;
14945
14946 if (disp)
14947 {
14948 /* Pull out the offset of a symbol; print any symbol itself. */
14949 if (GET_CODE (disp) == CONST
14950 && GET_CODE (XEXP (disp, 0)) == PLUS
14951 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14952 {
14953 offset = XEXP (XEXP (disp, 0), 1);
14954 disp = gen_rtx_CONST (VOIDmode,
14955 XEXP (XEXP (disp, 0), 0));
14956 }
14957
14958 if (flag_pic)
14959 output_pic_addr_const (file, disp, 0);
14960 else if (GET_CODE (disp) == LABEL_REF)
14961 output_asm_label (disp);
14962 else if (CONST_INT_P (disp))
14963 offset = disp;
14964 else
14965 output_addr_const (file, disp);
14966 }
14967
14968 putc ('[', file);
14969 if (base)
14970 {
14971 print_reg (base, code, file);
14972 if (offset)
14973 {
14974 if (INTVAL (offset) >= 0)
14975 putc ('+', file);
14976 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14977 }
14978 }
14979 else if (offset)
14980 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14981 else
14982 putc ('0', file);
14983
14984 if (index)
14985 {
14986 putc ('+', file);
14987 print_reg (index, vsib ? 0 : code, file);
14988 if (scale != 1 || vsib)
14989 fprintf (file, "*%d", scale);
14990 }
14991 putc (']', file);
14992 }
14993 }
14994 }
14995
14996 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14997
14998 static bool
14999 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15000 {
15001 rtx op;
15002
15003 if (GET_CODE (x) != UNSPEC)
15004 return false;
15005
15006 op = XVECEXP (x, 0, 0);
15007 switch (XINT (x, 1))
15008 {
15009 case UNSPEC_GOTTPOFF:
15010 output_addr_const (file, op);
15011 /* FIXME: This might be @TPOFF in Sun ld. */
15012 fputs ("@gottpoff", file);
15013 break;
15014 case UNSPEC_TPOFF:
15015 output_addr_const (file, op);
15016 fputs ("@tpoff", file);
15017 break;
15018 case UNSPEC_NTPOFF:
15019 output_addr_const (file, op);
15020 if (TARGET_64BIT)
15021 fputs ("@tpoff", file);
15022 else
15023 fputs ("@ntpoff", file);
15024 break;
15025 case UNSPEC_DTPOFF:
15026 output_addr_const (file, op);
15027 fputs ("@dtpoff", file);
15028 break;
15029 case UNSPEC_GOTNTPOFF:
15030 output_addr_const (file, op);
15031 if (TARGET_64BIT)
15032 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15033 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15034 else
15035 fputs ("@gotntpoff", file);
15036 break;
15037 case UNSPEC_INDNTPOFF:
15038 output_addr_const (file, op);
15039 fputs ("@indntpoff", file);
15040 break;
15041 #if TARGET_MACHO
15042 case UNSPEC_MACHOPIC_OFFSET:
15043 output_addr_const (file, op);
15044 putc ('-', file);
15045 machopic_output_function_base_name (file);
15046 break;
15047 #endif
15048
15049 case UNSPEC_STACK_CHECK:
15050 {
15051 int offset;
15052
15053 gcc_assert (flag_split_stack);
15054
15055 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15056 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15057 #else
15058 gcc_unreachable ();
15059 #endif
15060
15061 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15062 }
15063 break;
15064
15065 default:
15066 return false;
15067 }
15068
15069 return true;
15070 }
15071 \f
15072 /* Split one or more double-mode RTL references into pairs of half-mode
15073 references. The RTL can be REG, offsettable MEM, integer constant, or
15074 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15075 split and "num" is its length. lo_half and hi_half are output arrays
15076 that parallel "operands". */
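/* For example, on this little-endian target a DImode operand is split
   into an SImode low half at byte offset 0 and an SImode high half at
   byte offset 4; a TImode operand splits into DImode halves at offsets
   0 and 8.  */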
15077
15078 void
15079 split_double_mode (enum machine_mode mode, rtx operands[],
15080 int num, rtx lo_half[], rtx hi_half[])
15081 {
15082 enum machine_mode half_mode;
15083 unsigned int byte;
15084
15085 switch (mode)
15086 {
15087 case TImode:
15088 half_mode = DImode;
15089 break;
15090 case DImode:
15091 half_mode = SImode;
15092 break;
15093 default:
15094 gcc_unreachable ();
15095 }
15096
15097 byte = GET_MODE_SIZE (half_mode);
15098
15099 while (num--)
15100 {
15101 rtx op = operands[num];
15102
15103 /* simplify_subreg refuses to split volatile memory addresses,
15104 but we still have to handle them.  */
15105 if (MEM_P (op))
15106 {
15107 lo_half[num] = adjust_address (op, half_mode, 0);
15108 hi_half[num] = adjust_address (op, half_mode, byte);
15109 }
15110 else
15111 {
15112 lo_half[num] = simplify_gen_subreg (half_mode, op,
15113 GET_MODE (op) == VOIDmode
15114 ? mode : GET_MODE (op), 0);
15115 hi_half[num] = simplify_gen_subreg (half_mode, op,
15116 GET_MODE (op) == VOIDmode
15117 ? mode : GET_MODE (op), byte);
15118 }
15119 }
15120 }
15121 \f
15122 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15123 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15124 is the expression of the binary operation. The output may either be
15125 emitted here, or returned to the caller, like all output_* functions.
15126
15127 There is no guarantee that the operands are the same mode, as they
15128 might be within FLOAT or FLOAT_EXTEND expressions. */
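/* Illustrative x87 case: for a PLUS with both operands on the register
   stack and the result in st(0), the routine returns the template
   "fadd\t{%y2, %0|%0, %y2}", which prints as "fadd %st(2), %st" in AT&T
   syntax when operands[2] is st(2).  */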
15129
15130 #ifndef SYSV386_COMPAT
15131 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15132 wants to fix the assemblers because that causes incompatibility
15133 with gcc. No-one wants to fix gcc because that causes
15134 incompatibility with assemblers... You can use the option of
15135 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15136 #define SYSV386_COMPAT 1
15137 #endif
15138
15139 const char *
15140 output_387_binary_op (rtx insn, rtx *operands)
15141 {
15142 static char buf[40];
15143 const char *p;
15144 const char *ssep;
15145 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15146
15147 #ifdef ENABLE_CHECKING
15148 /* Even if we do not want to check the inputs, this documents input
15149 constraints, which helps in understanding the following code.  */
15150 if (STACK_REG_P (operands[0])
15151 && ((REG_P (operands[1])
15152 && REGNO (operands[0]) == REGNO (operands[1])
15153 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15154 || (REG_P (operands[2])
15155 && REGNO (operands[0]) == REGNO (operands[2])
15156 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15157 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15158 ; /* ok */
15159 else
15160 gcc_assert (is_sse);
15161 #endif
15162
15163 switch (GET_CODE (operands[3]))
15164 {
15165 case PLUS:
15166 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15167 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15168 p = "fiadd";
15169 else
15170 p = "fadd";
15171 ssep = "vadd";
15172 break;
15173
15174 case MINUS:
15175 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15176 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15177 p = "fisub";
15178 else
15179 p = "fsub";
15180 ssep = "vsub";
15181 break;
15182
15183 case MULT:
15184 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15185 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15186 p = "fimul";
15187 else
15188 p = "fmul";
15189 ssep = "vmul";
15190 break;
15191
15192 case DIV:
15193 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15194 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15195 p = "fidiv";
15196 else
15197 p = "fdiv";
15198 ssep = "vdiv";
15199 break;
15200
15201 default:
15202 gcc_unreachable ();
15203 }
15204
15205 if (is_sse)
15206 {
15207 if (TARGET_AVX)
15208 {
15209 strcpy (buf, ssep);
15210 if (GET_MODE (operands[0]) == SFmode)
15211 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15212 else
15213 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15214 }
15215 else
15216 {
15217 strcpy (buf, ssep + 1);
15218 if (GET_MODE (operands[0]) == SFmode)
15219 strcat (buf, "ss\t{%2, %0|%0, %2}");
15220 else
15221 strcat (buf, "sd\t{%2, %0|%0, %2}");
15222 }
15223 return buf;
15224 }
15225 strcpy (buf, p);
15226
15227 switch (GET_CODE (operands[3]))
15228 {
15229 case MULT:
15230 case PLUS:
15231 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15232 {
15233 rtx temp = operands[2];
15234 operands[2] = operands[1];
15235 operands[1] = temp;
15236 }
15237
15238 /* We know operands[0] == operands[1].  */
15239
15240 if (MEM_P (operands[2]))
15241 {
15242 p = "%Z2\t%2";
15243 break;
15244 }
15245
15246 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15247 {
15248 if (STACK_TOP_P (operands[0]))
15249 /* How is it that we are storing to a dead operand[2]?
15250 Well, presumably operands[1] is dead too. We can't
15251 store the result to st(0) as st(0) gets popped on this
15252 instruction. Instead store to operands[2] (which I
15253 think has to be st(1)). st(1) will be popped later.
15254 gcc <= 2.8.1 didn't have this check and generated
15255 assembly code that the Unixware assembler rejected. */
15256 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15257 else
15258 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15259 break;
15260 }
15261
15262 if (STACK_TOP_P (operands[0]))
15263 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15264 else
15265 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15266 break;
15267
15268 case MINUS:
15269 case DIV:
15270 if (MEM_P (operands[1]))
15271 {
15272 p = "r%Z1\t%1";
15273 break;
15274 }
15275
15276 if (MEM_P (operands[2]))
15277 {
15278 p = "%Z2\t%2";
15279 break;
15280 }
15281
15282 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15283 {
15284 #if SYSV386_COMPAT
15285 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15286 derived assemblers, confusingly reverse the direction of
15287 the operation for fsub{r} and fdiv{r} when the
15288 destination register is not st(0). The Intel assembler
15289 doesn't have this brain damage. Read !SYSV386_COMPAT to
15290 figure out what the hardware really does. */
15291 if (STACK_TOP_P (operands[0]))
15292 p = "{p\t%0, %2|rp\t%2, %0}";
15293 else
15294 p = "{rp\t%2, %0|p\t%0, %2}";
15295 #else
15296 if (STACK_TOP_P (operands[0]))
15297 /* As above for fmul/fadd, we can't store to st(0). */
15298 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15299 else
15300 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15301 #endif
15302 break;
15303 }
15304
15305 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15306 {
15307 #if SYSV386_COMPAT
15308 if (STACK_TOP_P (operands[0]))
15309 p = "{rp\t%0, %1|p\t%1, %0}";
15310 else
15311 p = "{p\t%1, %0|rp\t%0, %1}";
15312 #else
15313 if (STACK_TOP_P (operands[0]))
15314 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15315 else
15316 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15317 #endif
15318 break;
15319 }
15320
15321 if (STACK_TOP_P (operands[0]))
15322 {
15323 if (STACK_TOP_P (operands[1]))
15324 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15325 else
15326 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15327 break;
15328 }
15329 else if (STACK_TOP_P (operands[1]))
15330 {
15331 #if SYSV386_COMPAT
15332 p = "{\t%1, %0|r\t%0, %1}";
15333 #else
15334 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15335 #endif
15336 }
15337 else
15338 {
15339 #if SYSV386_COMPAT
15340 p = "{r\t%2, %0|\t%0, %2}";
15341 #else
15342 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15343 #endif
15344 }
15345 break;
15346
15347 default:
15348 gcc_unreachable ();
15349 }
15350
15351 strcat (buf, p);
15352 return buf;
15353 }
15354
15355 /* Return needed mode for entity in optimize_mode_switching pass. */
15356
15357 int
15358 ix86_mode_needed (int entity, rtx insn)
15359 {
15360 enum attr_i387_cw mode;
15361
15362 /* The mode UNINITIALIZED is used to store the control word after a
15363 function call or ASM pattern.  The mode ANY specifies that the function
15364 has no requirements on the control word and makes no changes to the
15365 bits we are interested in.  */
15366
15367 if (CALL_P (insn)
15368 || (NONJUMP_INSN_P (insn)
15369 && (asm_noperands (PATTERN (insn)) >= 0
15370 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15371 return I387_CW_UNINITIALIZED;
15372
15373 if (recog_memoized (insn) < 0)
15374 return I387_CW_ANY;
15375
15376 mode = get_attr_i387_cw (insn);
15377
15378 switch (entity)
15379 {
15380 case I387_TRUNC:
15381 if (mode == I387_CW_TRUNC)
15382 return mode;
15383 break;
15384
15385 case I387_FLOOR:
15386 if (mode == I387_CW_FLOOR)
15387 return mode;
15388 break;
15389
15390 case I387_CEIL:
15391 if (mode == I387_CW_CEIL)
15392 return mode;
15393 break;
15394
15395 case I387_MASK_PM:
15396 if (mode == I387_CW_MASK_PM)
15397 return mode;
15398 break;
15399
15400 default:
15401 gcc_unreachable ();
15402 }
15403
15404 return I387_CW_ANY;
15405 }
15406
15407 /* Output code to initialize control word copies used by trunc?f?i and
15408 rounding patterns.  CURRENT_MODE is set to the current control word,
15409 while NEW_MODE is set to the new control word.  */
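/* For reference, the x87 control word keeps its rounding control in
   bits 10-11 (00 = to nearest, 01 = toward -inf, 10 = toward +inf,
   11 = truncate toward zero) and its precision exception mask in bit 5,
   which is why the constants 0x0c00, 0x0400, 0x0800 and 0x0020 appear
   below.  */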
15410
15411 void
15412 emit_i387_cw_initialization (int mode)
15413 {
15414 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15415 rtx new_mode;
15416
15417 enum ix86_stack_slot slot;
15418
15419 rtx reg = gen_reg_rtx (HImode);
15420
15421 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15422 emit_move_insn (reg, copy_rtx (stored_mode));
15423
15424 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15425 || optimize_function_for_size_p (cfun))
15426 {
15427 switch (mode)
15428 {
15429 case I387_CW_TRUNC:
15430 /* round toward zero (truncate) */
15431 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15432 slot = SLOT_CW_TRUNC;
15433 break;
15434
15435 case I387_CW_FLOOR:
15436 /* round down toward -oo */
15437 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15438 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15439 slot = SLOT_CW_FLOOR;
15440 break;
15441
15442 case I387_CW_CEIL:
15443 /* round up toward +oo */
15444 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15445 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15446 slot = SLOT_CW_CEIL;
15447 break;
15448
15449 case I387_CW_MASK_PM:
15450 /* mask precision exception for nearbyint() */
15451 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15452 slot = SLOT_CW_MASK_PM;
15453 break;
15454
15455 default:
15456 gcc_unreachable ();
15457 }
15458 }
15459 else
15460 {
15461 switch (mode)
15462 {
15463 case I387_CW_TRUNC:
15464 /* round toward zero (truncate) */
15465 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15466 slot = SLOT_CW_TRUNC;
15467 break;
15468
15469 case I387_CW_FLOOR:
15470 /* round down toward -oo */
15471 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15472 slot = SLOT_CW_FLOOR;
15473 break;
15474
15475 case I387_CW_CEIL:
15476 /* round up toward +oo */
15477 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15478 slot = SLOT_CW_CEIL;
15479 break;
15480
15481 case I387_CW_MASK_PM:
15482 /* mask precision exception for nearbyint() */
15483 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15484 slot = SLOT_CW_MASK_PM;
15485 break;
15486
15487 default:
15488 gcc_unreachable ();
15489 }
15490 }
15491
15492 gcc_assert (slot < MAX_386_STACK_LOCALS);
15493
15494 new_mode = assign_386_stack_local (HImode, slot);
15495 emit_move_insn (new_mode, reg);
15496 }
15497
15498 /* Output code for INSN to convert a float to a signed int. OPERANDS
15499 are the insn operands. The output may be [HSD]Imode and the input
15500 operand may be [SDX]Fmode. */
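/* For example, when the SSE3 fisttp form is available a DImode result is
   stored with a single truncating "fisttp"; otherwise the control word is
   temporarily switched to truncation with "fldcw" around the "fistp"
   store.  */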
15501
15502 const char *
15503 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15504 {
15505 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15506 int dimode_p = GET_MODE (operands[0]) == DImode;
15507 int round_mode = get_attr_i387_cw (insn);
15508
15509 /* Jump through a hoop or two for DImode, since the hardware has no
15510 non-popping instruction. We used to do this a different way, but
15511 that was somewhat fragile and broke with post-reload splitters. */
15512 if ((dimode_p || fisttp) && !stack_top_dies)
15513 output_asm_insn ("fld\t%y1", operands);
15514
15515 gcc_assert (STACK_TOP_P (operands[1]));
15516 gcc_assert (MEM_P (operands[0]));
15517 gcc_assert (GET_MODE (operands[1]) != TFmode);
15518
15519 if (fisttp)
15520 output_asm_insn ("fisttp%Z0\t%0", operands);
15521 else
15522 {
15523 if (round_mode != I387_CW_ANY)
15524 output_asm_insn ("fldcw\t%3", operands);
15525 if (stack_top_dies || dimode_p)
15526 output_asm_insn ("fistp%Z0\t%0", operands);
15527 else
15528 output_asm_insn ("fist%Z0\t%0", operands);
15529 if (round_mode != I387_CW_ANY)
15530 output_asm_insn ("fldcw\t%2", operands);
15531 }
15532
15533 return "";
15534 }
15535
15536 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15537 have the values zero or one, indicates the ffreep insn's operand
15538 from the OPERANDS array. */
15539
15540 static const char *
15541 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15542 {
15543 if (TARGET_USE_FFREEP)
15544 #ifdef HAVE_AS_IX86_FFREEP
15545 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15546 #else
15547 {
15548 static char retval[32];
15549 int regno = REGNO (operands[opno]);
15550
15551 gcc_assert (FP_REGNO_P (regno));
15552
15553 regno -= FIRST_STACK_REG;
15554
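      /* Emit the raw encoding: "ffreep %st(i)" is the two-byte opcode
	 0xdf 0xc0+i, written here as a little-endian 16-bit word.  */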
15555 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15556 return retval;
15557 }
15558 #endif
15559
15560 return opno ? "fstp\t%y1" : "fstp\t%y0";
15561 }
15562
15563
15564 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15565 should be used. UNORDERED_P is true when fucom should be used. */
15566
15567 const char *
15568 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15569 {
15570 int stack_top_dies;
15571 rtx cmp_op0, cmp_op1;
15572 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15573
15574 if (eflags_p)
15575 {
15576 cmp_op0 = operands[0];
15577 cmp_op1 = operands[1];
15578 }
15579 else
15580 {
15581 cmp_op0 = operands[1];
15582 cmp_op1 = operands[2];
15583 }
15584
15585 if (is_sse)
15586 {
15587 if (GET_MODE (operands[0]) == SFmode)
15588 if (unordered_p)
15589 return "%vucomiss\t{%1, %0|%0, %1}";
15590 else
15591 return "%vcomiss\t{%1, %0|%0, %1}";
15592 else
15593 if (unordered_p)
15594 return "%vucomisd\t{%1, %0|%0, %1}";
15595 else
15596 return "%vcomisd\t{%1, %0|%0, %1}";
15597 }
15598
15599 gcc_assert (STACK_TOP_P (cmp_op0));
15600
15601 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15602
15603 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15604 {
15605 if (stack_top_dies)
15606 {
15607 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15608 return output_387_ffreep (operands, 1);
15609 }
15610 else
15611 return "ftst\n\tfnstsw\t%0";
15612 }
15613
15614 if (STACK_REG_P (cmp_op1)
15615 && stack_top_dies
15616 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15617 && REGNO (cmp_op1) != FIRST_STACK_REG)
15618 {
15619 /* If the top of the 387 stack dies, and the other operand
15620 is also a stack register that dies, then this must be a
15621 `fcompp' float compare.  */
15622
15623 if (eflags_p)
15624 {
15625 /* There is no double popping fcomi variant. Fortunately,
15626 eflags is immune to the fstp's cc clobbering.  */
15627 if (unordered_p)
15628 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15629 else
15630 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15631 return output_387_ffreep (operands, 0);
15632 }
15633 else
15634 {
15635 if (unordered_p)
15636 return "fucompp\n\tfnstsw\t%0";
15637 else
15638 return "fcompp\n\tfnstsw\t%0";
15639 }
15640 }
15641 else
15642 {
15643 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
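      /* For instance, mask 0 selects the plain "fcom" + "fnstsw" pair,
	 while mask 9 (eflags_p set, stack top dies) selects "fcomip".  */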
15644
15645 static const char * const alt[16] =
15646 {
15647 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15648 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15649 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15650 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15651
15652 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15653 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15654 NULL,
15655 NULL,
15656
15657 "fcomi\t{%y1, %0|%0, %y1}",
15658 "fcomip\t{%y1, %0|%0, %y1}",
15659 "fucomi\t{%y1, %0|%0, %y1}",
15660 "fucomip\t{%y1, %0|%0, %y1}",
15661
15662 NULL,
15663 NULL,
15664 NULL,
15665 NULL
15666 };
15667
15668 int mask;
15669 const char *ret;
15670
15671 mask = eflags_p << 3;
15672 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15673 mask |= unordered_p << 1;
15674 mask |= stack_top_dies;
15675
15676 gcc_assert (mask < 16);
15677 ret = alt[mask];
15678 gcc_assert (ret);
15679
15680 return ret;
15681 }
15682 }
15683
15684 void
15685 ix86_output_addr_vec_elt (FILE *file, int value)
15686 {
15687 const char *directive = ASM_LONG;
15688
15689 #ifdef ASM_QUAD
15690 if (TARGET_LP64)
15691 directive = ASM_QUAD;
15692 #else
15693 gcc_assert (!TARGET_64BIT);
15694 #endif
15695
15696 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15697 }
15698
15699 void
15700 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15701 {
15702 const char *directive = ASM_LONG;
15703
15704 #ifdef ASM_QUAD
15705 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15706 directive = ASM_QUAD;
15707 #else
15708 gcc_assert (!TARGET_64BIT);
15709 #endif
15710 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15711 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15712 fprintf (file, "%s%s%d-%s%d\n",
15713 directive, LPREFIX, value, LPREFIX, rel);
15714 else if (HAVE_AS_GOTOFF_IN_DATA)
15715 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15716 #if TARGET_MACHO
15717 else if (TARGET_MACHO)
15718 {
15719 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15720 machopic_output_function_base_name (file);
15721 putc ('\n', file);
15722 }
15723 #endif
15724 else
15725 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15726 GOT_SYMBOL_NAME, LPREFIX, value);
15727 }
15728 \f
15729 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15730 for the target. */
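/* For example, clearing a 32-bit register when optimizing for speed
   emits the xor form with a flags-register clobber, i.e.
   "xorl %eax, %eax", rather than "movl $0, %eax".  */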
15731
15732 void
15733 ix86_expand_clear (rtx dest)
15734 {
15735 rtx tmp;
15736
15737 /* We play register width games, which are only valid after reload. */
15738 gcc_assert (reload_completed);
15739
15740 /* Avoid HImode and its attendant prefix byte. */
15741 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15742 dest = gen_rtx_REG (SImode, REGNO (dest));
15743 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15744
15745 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15746 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15747 {
15748 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15749 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15750 }
15751
15752 emit_insn (tmp);
15753 }
15754
15755 /* X is an unchanging MEM. If it is a constant pool reference, return
15756 the constant pool rtx, else NULL. */
15757
15758 rtx
15759 maybe_get_pool_constant (rtx x)
15760 {
15761 x = ix86_delegitimize_address (XEXP (x, 0));
15762
15763 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15764 return get_pool_constant (x);
15765
15766 return NULL_RTX;
15767 }
15768
15769 void
15770 ix86_expand_move (enum machine_mode mode, rtx operands[])
15771 {
15772 rtx op0, op1;
15773 enum tls_model model;
15774
15775 op0 = operands[0];
15776 op1 = operands[1];
15777
15778 if (GET_CODE (op1) == SYMBOL_REF)
15779 {
15780 model = SYMBOL_REF_TLS_MODEL (op1);
15781 if (model)
15782 {
15783 op1 = legitimize_tls_address (op1, model, true);
15784 op1 = force_operand (op1, op0);
15785 if (op1 == op0)
15786 return;
15787 if (GET_MODE (op1) != mode)
15788 op1 = convert_to_mode (mode, op1, 1);
15789 }
15790 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15791 && SYMBOL_REF_DLLIMPORT_P (op1))
15792 op1 = legitimize_dllimport_symbol (op1, false);
15793 }
15794 else if (GET_CODE (op1) == CONST
15795 && GET_CODE (XEXP (op1, 0)) == PLUS
15796 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15797 {
15798 rtx addend = XEXP (XEXP (op1, 0), 1);
15799 rtx symbol = XEXP (XEXP (op1, 0), 0);
15800 rtx tmp = NULL;
15801
15802 model = SYMBOL_REF_TLS_MODEL (symbol);
15803 if (model)
15804 tmp = legitimize_tls_address (symbol, model, true);
15805 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15806 && SYMBOL_REF_DLLIMPORT_P (symbol))
15807 tmp = legitimize_dllimport_symbol (symbol, true);
15808
15809 if (tmp)
15810 {
15811 tmp = force_operand (tmp, NULL);
15812 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15813 op0, 1, OPTAB_DIRECT);
15814 if (tmp == op0)
15815 return;
15816 if (GET_MODE (tmp) != mode)
15817 op1 = convert_to_mode (mode, tmp, 1);
15818 }
15819 }
15820
15821 if ((flag_pic || MACHOPIC_INDIRECT)
15822 && symbolic_operand (op1, mode))
15823 {
15824 if (TARGET_MACHO && !TARGET_64BIT)
15825 {
15826 #if TARGET_MACHO
15827 /* dynamic-no-pic */
15828 if (MACHOPIC_INDIRECT)
15829 {
15830 rtx temp = ((reload_in_progress
15831 || ((op0 && REG_P (op0))
15832 && mode == Pmode))
15833 ? op0 : gen_reg_rtx (Pmode));
15834 op1 = machopic_indirect_data_reference (op1, temp);
15835 if (MACHOPIC_PURE)
15836 op1 = machopic_legitimize_pic_address (op1, mode,
15837 temp == op1 ? 0 : temp);
15838 }
15839 if (op0 != op1 && GET_CODE (op0) != MEM)
15840 {
15841 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15842 emit_insn (insn);
15843 return;
15844 }
15845 if (GET_CODE (op0) == MEM)
15846 op1 = force_reg (Pmode, op1);
15847 else
15848 {
15849 rtx temp = op0;
15850 if (GET_CODE (temp) != REG)
15851 temp = gen_reg_rtx (Pmode);
15852 temp = legitimize_pic_address (op1, temp);
15853 if (temp == op0)
15854 return;
15855 op1 = temp;
15856 }
15857 /* dynamic-no-pic */
15858 #endif
15859 }
15860 else
15861 {
15862 if (MEM_P (op0))
15863 op1 = force_reg (mode, op1);
15864 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15865 {
15866 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15867 op1 = legitimize_pic_address (op1, reg);
15868 if (op0 == op1)
15869 return;
15870 if (GET_MODE (op1) != mode)
15871 op1 = convert_to_mode (mode, op1, 1);
15872 }
15873 }
15874 }
15875 else
15876 {
15877 if (MEM_P (op0)
15878 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15879 || !push_operand (op0, mode))
15880 && MEM_P (op1))
15881 op1 = force_reg (mode, op1);
15882
15883 if (push_operand (op0, mode)
15884 && ! general_no_elim_operand (op1, mode))
15885 op1 = copy_to_mode_reg (mode, op1);
15886
15887 /* Force large constants in 64bit compilation into a register
15888 to get them CSEed. */
15889 if (can_create_pseudo_p ()
15890 && (mode == DImode) && TARGET_64BIT
15891 && immediate_operand (op1, mode)
15892 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15893 && !register_operand (op0, mode)
15894 && optimize)
15895 op1 = copy_to_mode_reg (mode, op1);
15896
15897 if (can_create_pseudo_p ()
15898 && FLOAT_MODE_P (mode)
15899 && GET_CODE (op1) == CONST_DOUBLE)
15900 {
15901 /* If we are loading a floating point constant to a register,
15902 force the value to memory now, since we'll get better code
15903 out of the back end.  */
15904
15905 op1 = validize_mem (force_const_mem (mode, op1));
15906 if (!register_operand (op0, mode))
15907 {
15908 rtx temp = gen_reg_rtx (mode);
15909 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15910 emit_move_insn (op0, temp);
15911 return;
15912 }
15913 }
15914 }
15915
15916 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15917 }
15918
15919 void
15920 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15921 {
15922 rtx op0 = operands[0], op1 = operands[1];
15923 unsigned int align = GET_MODE_ALIGNMENT (mode);
15924
15925 /* Force constants other than zero into memory. We do not know how
15926 the instructions used to build constants modify the upper 64 bits
15927 of the register; once we have that information we may be able
15928 to handle some of them more efficiently. */
15929 if (can_create_pseudo_p ()
15930 && register_operand (op0, mode)
15931 && (CONSTANT_P (op1)
15932 || (GET_CODE (op1) == SUBREG
15933 && CONSTANT_P (SUBREG_REG (op1))))
15934 && !standard_sse_constant_p (op1))
15935 op1 = validize_mem (force_const_mem (mode, op1));
15936
15937 /* We need to check memory alignment for SSE modes since attributes
15938 can make operands unaligned. */
15939 if (can_create_pseudo_p ()
15940 && SSE_REG_MODE_P (mode)
15941 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15942 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15943 {
15944 rtx tmp[2];
15945
15946 /* ix86_expand_vector_move_misalign() does not like constants ... */
15947 if (CONSTANT_P (op1)
15948 || (GET_CODE (op1) == SUBREG
15949 && CONSTANT_P (SUBREG_REG (op1))))
15950 op1 = validize_mem (force_const_mem (mode, op1));
15951
15952 /* ... nor both arguments in memory. */
15953 if (!register_operand (op0, mode)
15954 && !register_operand (op1, mode))
15955 op1 = force_reg (mode, op1);
15956
15957 tmp[0] = op0; tmp[1] = op1;
15958 ix86_expand_vector_move_misalign (mode, tmp);
15959 return;
15960 }
15961
15962 /* Make operand1 a register if it isn't already. */
15963 if (can_create_pseudo_p ()
15964 && !register_operand (op0, mode)
15965 && !register_operand (op1, mode))
15966 {
15967 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15968 return;
15969 }
15970
15971 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15972 }
15973
15974 /* Split 32-byte AVX unaligned load and store if needed. */
15975
15976 static void
15977 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15978 {
15979 rtx m;
15980 rtx (*extract) (rtx, rtx, rtx);
15981 rtx (*move_unaligned) (rtx, rtx);
15982 enum machine_mode mode;
15983
15984 switch (GET_MODE (op0))
15985 {
15986 default:
15987 gcc_unreachable ();
15988 case V32QImode:
15989 extract = gen_avx_vextractf128v32qi;
15990 move_unaligned = gen_avx_movdqu256;
15991 mode = V16QImode;
15992 break;
15993 case V8SFmode:
15994 extract = gen_avx_vextractf128v8sf;
15995 move_unaligned = gen_avx_movups256;
15996 mode = V4SFmode;
15997 break;
15998 case V4DFmode:
15999 extract = gen_avx_vextractf128v4df;
16000 move_unaligned = gen_avx_movupd256;
16001 mode = V2DFmode;
16002 break;
16003 }
16004
16005 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16006 {
16007 rtx r = gen_reg_rtx (mode);
16008 m = adjust_address (op1, mode, 0);
16009 emit_move_insn (r, m);
16010 m = adjust_address (op1, mode, 16);
16011 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16012 emit_move_insn (op0, r);
16013 }
16014 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16015 {
16016 m = adjust_address (op0, mode, 0);
16017 emit_insn (extract (m, op1, const0_rtx));
16018 m = adjust_address (op0, mode, 16);
16019 emit_insn (extract (m, op1, const1_rtx));
16020 }
16021 else
16022 emit_insn (move_unaligned (op0, op1));
16023 }
16024
16025 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16026 straight to ix86_expand_vector_move. */
16027 /* Code generation for scalar reg-reg moves of single and double precision data:
16028 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16029 movaps reg, reg
16030 else
16031 movss reg, reg
16032 if (x86_sse_partial_reg_dependency == true)
16033 movapd reg, reg
16034 else
16035 movsd reg, reg
16036
16037 Code generation for scalar loads of double precision data:
16038 if (x86_sse_split_regs == true)
16039 movlpd mem, reg (gas syntax)
16040 else
16041 movsd mem, reg
16042
16043 Code generation for unaligned packed loads of single precision data
16044 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16045 if (x86_sse_unaligned_move_optimal)
16046 movups mem, reg
16047
16048 if (x86_sse_partial_reg_dependency == true)
16049 {
16050 xorps reg, reg
16051 movlps mem, reg
16052 movhps mem+8, reg
16053 }
16054 else
16055 {
16056 movlps mem, reg
16057 movhps mem+8, reg
16058 }
16059
16060 Code generation for unaligned packed loads of double precision data
16061 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16062 if (x86_sse_unaligned_move_optimal)
16063 movupd mem, reg
16064
16065 if (x86_sse_split_regs == true)
16066 {
16067 movlpd mem, reg
16068 movhpd mem+8, reg
16069 }
16070 else
16071 {
16072 movsd mem, reg
16073 movhpd mem+8, reg
16074 }
16075 */
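/* For 32-byte AVX modes, handled first below, the unaligned access is
   either emitted directly (vmovups/vmovupd/vmovdqu) or split into two
   16-byte halves, depending on the TARGET_AVX256_SPLIT_UNALIGNED_LOAD
   and TARGET_AVX256_SPLIT_UNALIGNED_STORE tuning flags; see
   ix86_avx256_split_vector_move_misalign above.  */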
16076
16077 void
16078 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16079 {
16080 rtx op0, op1, m;
16081
16082 op0 = operands[0];
16083 op1 = operands[1];
16084
16085 if (TARGET_AVX
16086 && GET_MODE_SIZE (mode) == 32)
16087 {
16088 switch (GET_MODE_CLASS (mode))
16089 {
16090 case MODE_VECTOR_INT:
16091 case MODE_INT:
16092 op0 = gen_lowpart (V32QImode, op0);
16093 op1 = gen_lowpart (V32QImode, op1);
16094 /* FALLTHRU */
16095
16096 case MODE_VECTOR_FLOAT:
16097 ix86_avx256_split_vector_move_misalign (op0, op1);
16098 break;
16099
16100 default:
16101 gcc_unreachable ();
16102 }
16103
16104 return;
16105 }
16106
16107 if (MEM_P (op1))
16108 {
16109 /* ??? If we have typed data, then it would appear that using
16110 movdqu is the only way to get unaligned data loaded with
16111 integer type. */
16112 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16113 {
16114 op0 = gen_lowpart (V16QImode, op0);
16115 op1 = gen_lowpart (V16QImode, op1);
16116 /* We will eventually emit movups based on insn attributes. */
16117 emit_insn (gen_sse2_movdqu (op0, op1));
16118 }
16119 else if (TARGET_SSE2 && mode == V2DFmode)
16120 {
16121 rtx zero;
16122
16123 if (TARGET_AVX
16124 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16125 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16126 || optimize_function_for_size_p (cfun))
16127 {
16128 /* We will eventually emit movups based on insn attributes. */
16129 emit_insn (gen_sse2_movupd (op0, op1));
16130 return;
16131 }
16132
16133 /* When SSE registers are split into halves, we can avoid
16134 writing to the top half twice. */
16135 if (TARGET_SSE_SPLIT_REGS)
16136 {
16137 emit_clobber (op0);
16138 zero = op0;
16139 }
16140 else
16141 {
16142 /* ??? Not sure about the best option for the Intel chips.
16143 The following would seem to satisfy; the register is
16144 entirely cleared, breaking the dependency chain. We
16145 then store to the upper half, with a dependency depth
16146 of one. A rumor has it that Intel recommends two movsd
16147 followed by an unpacklpd, but this is unconfirmed. And
16148 given that the dependency depth of the unpacklpd would
16149 still be one, I'm not sure why this would be better. */
16150 zero = CONST0_RTX (V2DFmode);
16151 }
16152
16153 m = adjust_address (op1, DFmode, 0);
16154 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16155 m = adjust_address (op1, DFmode, 8);
16156 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16157 }
16158 else
16159 {
16160 if (TARGET_AVX
16161 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16162 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16163 || optimize_function_for_size_p (cfun))
16164 {
16165 op0 = gen_lowpart (V4SFmode, op0);
16166 op1 = gen_lowpart (V4SFmode, op1);
16167 emit_insn (gen_sse_movups (op0, op1));
16168 return;
16169 }
16170
16171 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16172 emit_move_insn (op0, CONST0_RTX (mode));
16173 else
16174 emit_clobber (op0);
16175
16176 if (mode != V4SFmode)
16177 op0 = gen_lowpart (V4SFmode, op0);
16178
16179 m = adjust_address (op1, V2SFmode, 0);
16180 emit_insn (gen_sse_loadlps (op0, op0, m));
16181 m = adjust_address (op1, V2SFmode, 8);
16182 emit_insn (gen_sse_loadhps (op0, op0, m));
16183 }
16184 }
16185 else if (MEM_P (op0))
16186 {
16187 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16188 {
16189 op0 = gen_lowpart (V16QImode, op0);
16190 op1 = gen_lowpart (V16QImode, op1);
16191 /* We will eventually emit movups based on insn attributes. */
16192 emit_insn (gen_sse2_movdqu (op0, op1));
16193 }
16194 else if (TARGET_SSE2 && mode == V2DFmode)
16195 {
16196 if (TARGET_AVX
16197 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16198 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16199 || optimize_function_for_size_p (cfun))
16200 /* We will eventually emit movups based on insn attributes. */
16201 emit_insn (gen_sse2_movupd (op0, op1));
16202 else
16203 {
16204 m = adjust_address (op0, DFmode, 0);
16205 emit_insn (gen_sse2_storelpd (m, op1));
16206 m = adjust_address (op0, DFmode, 8);
16207 emit_insn (gen_sse2_storehpd (m, op1));
16208 }
16209 }
16210 else
16211 {
16212 if (mode != V4SFmode)
16213 op1 = gen_lowpart (V4SFmode, op1);
16214
16215 if (TARGET_AVX
16216 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16217 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16218 || optimize_function_for_size_p (cfun))
16219 {
16220 op0 = gen_lowpart (V4SFmode, op0);
16221 emit_insn (gen_sse_movups (op0, op1));
16222 }
16223 else
16224 {
16225 m = adjust_address (op0, V2SFmode, 0);
16226 emit_insn (gen_sse_storelps (m, op1));
16227 m = adjust_address (op0, V2SFmode, 8);
16228 emit_insn (gen_sse_storehps (m, op1));
16229 }
16230 }
16231 }
16232 else
16233 gcc_unreachable ();
16234 }
16235
16236 /* Expand a push in MODE. This is some mode for which we do not support
16237 proper push instructions, at least from the registers that we expect
16238 the value to live in. */
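        /* For example, pushing a 16-byte value in such a mode expands roughly to
              sp = sp - 16
              *(mode *) sp = x
           i.e. an explicit stack pointer adjustment followed by an ordinary
           store (a sketch; the exact RTL depends on MODE and the target).  */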
16239
16240 void
16241 ix86_expand_push (enum machine_mode mode, rtx x)
16242 {
16243 rtx tmp;
16244
16245 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16246 GEN_INT (-GET_MODE_SIZE (mode)),
16247 stack_pointer_rtx, 1, OPTAB_DIRECT);
16248 if (tmp != stack_pointer_rtx)
16249 emit_move_insn (stack_pointer_rtx, tmp);
16250
16251 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16252
 16253   /* When we push an operand onto the stack, it has to be aligned at least
 16254      at the function argument boundary.  However, since we don't have
 16255      the argument type, we can't determine the actual argument
 16256      boundary.  */
16257 emit_move_insn (tmp, x);
16258 }
16259
16260 /* Helper function of ix86_fixup_binary_operands to canonicalize
16261 operand order. Returns true if the operands should be swapped. */
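        /* For a commutative operation this prefers the form where operand 1
           matches the destination, so the two-address x86 encoding
           "op src2, dst" can be used without an extra move; e.g.
           dst = src2 + dst is swapped into dst = dst + src2.  Immediates and
           memory references are pushed towards the second operand.  */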
16262
16263 static bool
16264 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16265 rtx operands[])
16266 {
16267 rtx dst = operands[0];
16268 rtx src1 = operands[1];
16269 rtx src2 = operands[2];
16270
16271 /* If the operation is not commutative, we can't do anything. */
16272 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16273 return false;
16274
16275 /* Highest priority is that src1 should match dst. */
16276 if (rtx_equal_p (dst, src1))
16277 return false;
16278 if (rtx_equal_p (dst, src2))
16279 return true;
16280
16281 /* Next highest priority is that immediate constants come second. */
16282 if (immediate_operand (src2, mode))
16283 return false;
16284 if (immediate_operand (src1, mode))
16285 return true;
16286
16287 /* Lowest priority is that memory references should come second. */
16288 if (MEM_P (src2))
16289 return false;
16290 if (MEM_P (src1))
16291 return true;
16292
16293 return false;
16294 }
16295
16296
16297 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16298 destination to use for the operation. If different from the true
16299 destination in operands[0], a copy operation will be required. */
16300
16301 rtx
16302 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16303 rtx operands[])
16304 {
16305 rtx dst = operands[0];
16306 rtx src1 = operands[1];
16307 rtx src2 = operands[2];
16308
16309 /* Canonicalize operand order. */
16310 if (ix86_swap_binary_operands_p (code, mode, operands))
16311 {
16312 rtx temp;
16313
16314 /* It is invalid to swap operands of different modes. */
16315 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16316
16317 temp = src1;
16318 src1 = src2;
16319 src2 = temp;
16320 }
16321
16322 /* Both source operands cannot be in memory. */
16323 if (MEM_P (src1) && MEM_P (src2))
16324 {
16325 /* Optimization: Only read from memory once. */
16326 if (rtx_equal_p (src1, src2))
16327 {
16328 src2 = force_reg (mode, src2);
16329 src1 = src2;
16330 }
16331 else
16332 src2 = force_reg (mode, src2);
16333 }
16334
16335 /* If the destination is memory, and we do not have matching source
16336 operands, do things in registers. */
16337 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16338 dst = gen_reg_rtx (mode);
16339
16340 /* Source 1 cannot be a constant. */
16341 if (CONSTANT_P (src1))
16342 src1 = force_reg (mode, src1);
16343
16344 /* Source 1 cannot be a non-matching memory. */
16345 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16346 src1 = force_reg (mode, src1);
16347
16348 /* Improve address combine. */
16349 if (code == PLUS
16350 && GET_MODE_CLASS (mode) == MODE_INT
16351 && MEM_P (src2))
16352 src2 = force_reg (mode, src2);
16353
16354 operands[1] = src1;
16355 operands[2] = src2;
16356 return dst;
16357 }
16358
16359 /* Similarly, but assume that the destination has already been
16360 set up properly. */
16361
16362 void
16363 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16364 enum machine_mode mode, rtx operands[])
16365 {
16366 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16367 gcc_assert (dst == operands[0]);
16368 }
16369
 16370 /* Attempt to expand a binary operator.  Make the expansion closer to the
 16371    actual machine than just general_operand, which would allow 3 separate
 16372    memory references (one output, two inputs) in a single insn.  */
16373
16374 void
16375 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16376 rtx operands[])
16377 {
16378 rtx src1, src2, dst, op, clob;
16379
16380 dst = ix86_fixup_binary_operands (code, mode, operands);
16381 src1 = operands[1];
16382 src2 = operands[2];
16383
16384 /* Emit the instruction. */
16385
16386 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16387 if (reload_in_progress)
16388 {
16389 /* Reload doesn't know about the flags register, and doesn't know that
16390 it doesn't want to clobber it. We can only do this with PLUS. */
16391 gcc_assert (code == PLUS);
16392 emit_insn (op);
16393 }
16394 else if (reload_completed
16395 && code == PLUS
16396 && !rtx_equal_p (dst, src1))
16397 {
16398 /* This is going to be an LEA; avoid splitting it later. */
16399 emit_insn (op);
16400 }
16401 else
16402 {
16403 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16404 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16405 }
16406
16407 /* Fix up the destination if needed. */
16408 if (dst != operands[0])
16409 emit_move_insn (operands[0], dst);
16410 }
16411
16412 /* Return TRUE or FALSE depending on whether the binary operator meets the
16413 appropriate constraints. */
16414
16415 bool
16416 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16417 rtx operands[3])
16418 {
16419 rtx dst = operands[0];
16420 rtx src1 = operands[1];
16421 rtx src2 = operands[2];
16422
16423 /* Both source operands cannot be in memory. */
16424 if (MEM_P (src1) && MEM_P (src2))
16425 return false;
16426
16427 /* Canonicalize operand order for commutative operators. */
16428 if (ix86_swap_binary_operands_p (code, mode, operands))
16429 {
16430 rtx temp = src1;
16431 src1 = src2;
16432 src2 = temp;
16433 }
16434
16435 /* If the destination is memory, we must have a matching source operand. */
16436 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16437 return false;
16438
16439 /* Source 1 cannot be a constant. */
16440 if (CONSTANT_P (src1))
16441 return false;
16442
16443 /* Source 1 cannot be a non-matching memory. */
16444 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16445 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16446 return (code == AND
16447 && (mode == HImode
16448 || mode == SImode
16449 || (TARGET_64BIT && mode == DImode))
16450 && satisfies_constraint_L (src2));
16451
16452 return true;
16453 }
16454
 16455 /* Attempt to expand a unary operator.  Make the expansion closer to the
 16456    actual machine than just general_operand, which would allow 2 separate
 16457    memory references (one output, one input) in a single insn.  */
16458
16459 void
16460 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16461 rtx operands[])
16462 {
16463 int matching_memory;
16464 rtx src, dst, op, clob;
16465
16466 dst = operands[0];
16467 src = operands[1];
16468
16469 /* If the destination is memory, and we do not have matching source
16470 operands, do things in registers. */
16471 matching_memory = 0;
16472 if (MEM_P (dst))
16473 {
16474 if (rtx_equal_p (dst, src))
16475 matching_memory = 1;
16476 else
16477 dst = gen_reg_rtx (mode);
16478 }
16479
16480 /* When source operand is memory, destination must match. */
16481 if (MEM_P (src) && !matching_memory)
16482 src = force_reg (mode, src);
16483
16484 /* Emit the instruction. */
16485
16486 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16487 if (reload_in_progress || code == NOT)
16488 {
16489 /* Reload doesn't know about the flags register, and doesn't know that
16490 it doesn't want to clobber it. */
16491 gcc_assert (code == NOT);
16492 emit_insn (op);
16493 }
16494 else
16495 {
16496 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16497 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16498 }
16499
16500 /* Fix up the destination if needed. */
16501 if (dst != operands[0])
16502 emit_move_insn (operands[0], dst);
16503 }
16504
16505 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16506 divisor are within the range [0-255]. */
16507
16508 void
16509 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16510 bool signed_p)
16511 {
16512 rtx end_label, qimode_label;
16513 rtx insn, div, mod;
16514 rtx scratch, tmp0, tmp1, tmp2;
16515 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16516 rtx (*gen_zero_extend) (rtx, rtx);
16517 rtx (*gen_test_ccno_1) (rtx, rtx);
16518
16519 switch (mode)
16520 {
16521 case SImode:
16522 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16523 gen_test_ccno_1 = gen_testsi_ccno_1;
16524 gen_zero_extend = gen_zero_extendqisi2;
16525 break;
16526 case DImode:
16527 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16528 gen_test_ccno_1 = gen_testdi_ccno_1;
16529 gen_zero_extend = gen_zero_extendqidi2;
16530 break;
16531 default:
16532 gcc_unreachable ();
16533 }
16534
16535 end_label = gen_label_rtx ();
16536 qimode_label = gen_label_rtx ();
16537
16538 scratch = gen_reg_rtx (mode);
16539
 16540   /* Use 8bit unsigned divmod if dividend and divisor are within
16541 the range [0-255]. */
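          /* The test below ORs the dividend and the divisor and compares the
             result against -0x100 (i.e. ~0xff): the flags reflect
             (dividend | divisor) & ~0xff, which is zero exactly when both
             values fit in 8 bits.  */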
16542 emit_move_insn (scratch, operands[2]);
16543 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16544 scratch, 1, OPTAB_DIRECT);
16545 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16546 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16547 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16548 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16549 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16550 pc_rtx);
16551 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16552 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16553 JUMP_LABEL (insn) = qimode_label;
16554
 16555   /* Generate original signed/unsigned divmod.  */
16556 div = gen_divmod4_1 (operands[0], operands[1],
16557 operands[2], operands[3]);
16558 emit_insn (div);
16559
16560 /* Branch to the end. */
16561 emit_jump_insn (gen_jump (end_label));
16562 emit_barrier ();
16563
16564 /* Generate 8bit unsigned divide. */
16565 emit_label (qimode_label);
16566 /* Don't use operands[0] for result of 8bit divide since not all
16567 registers support QImode ZERO_EXTRACT. */
16568 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16569 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16570 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16571 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16572
16573 if (signed_p)
16574 {
16575 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16576 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16577 }
16578 else
16579 {
16580 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16581 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16582 }
16583
16584 /* Extract remainder from AH. */
16585 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16586 if (REG_P (operands[1]))
16587 insn = emit_move_insn (operands[1], tmp1);
16588 else
16589 {
16590 /* Need a new scratch register since the old one has result
16591 of 8bit divide. */
16592 scratch = gen_reg_rtx (mode);
16593 emit_move_insn (scratch, tmp1);
16594 insn = emit_move_insn (operands[1], scratch);
16595 }
16596 set_unique_reg_note (insn, REG_EQUAL, mod);
16597
16598 /* Zero extend quotient from AL. */
16599 tmp1 = gen_lowpart (QImode, tmp0);
16600 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16601 set_unique_reg_note (insn, REG_EQUAL, div);
16602
16603 emit_label (end_label);
16604 }
16605
16606 #define LEA_MAX_STALL (3)
16607 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
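        /* LEA_MAX_STALL is a bound in whole cycles (the distances returned by
           distance_non_agu_define and distance_agu_use), while the in-block
           searches below accumulate DISTANCE in half-cycles; hence
           LEA_SEARCH_THRESHOLD is the same limit expressed in half-cycles.  */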
16608
16609 /* Increase given DISTANCE in half-cycles according to
16610 dependencies between PREV and NEXT instructions.
16611 Add 1 half-cycle if there is no dependency and
 16612    go to the next cycle if there is some dependency.  */
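        /* Concretely, distance + (distance & 1) rounds DISTANCE up to an even
           number of half-cycles (a cycle boundary), and the extra 2 then
           advances one full cycle when the instructions depend on each
           other.  */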
16613
16614 static unsigned int
16615 increase_distance (rtx prev, rtx next, unsigned int distance)
16616 {
16617 df_ref *use_rec;
16618 df_ref *def_rec;
16619
16620 if (!prev || !next)
16621 return distance + (distance & 1) + 2;
16622
16623 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16624 return distance + 1;
16625
16626 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16627 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16628 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16629 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16630 return distance + (distance & 1) + 2;
16631
16632 return distance + 1;
16633 }
16634
16635 /* Function checks if instruction INSN defines register number
16636 REGNO1 or REGNO2. */
16637
16638 static bool
16639 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16640 rtx insn)
16641 {
16642 df_ref *def_rec;
16643
16644 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16645 if (DF_REF_REG_DEF_P (*def_rec)
16646 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16647 && (regno1 == DF_REF_REGNO (*def_rec)
16648 || regno2 == DF_REF_REGNO (*def_rec)))
16649 {
16650 return true;
16651 }
16652
16653 return false;
16654 }
16655
16656 /* Function checks if instruction INSN uses register number
16657 REGNO as a part of address expression. */
16658
16659 static bool
16660 insn_uses_reg_mem (unsigned int regno, rtx insn)
16661 {
16662 df_ref *use_rec;
16663
16664 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16665 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16666 return true;
16667
16668 return false;
16669 }
16670
 16671 /* Search backward for a non-agu definition of register number REGNO1
 16672    or register number REGNO2 in the basic block, starting from instruction
 16673    START up to the head of the basic block or instruction INSN.
 16674 
 16675    Put true into *FOUND if a definition was found
 16676    and false otherwise.
 16677 
 16678    The distance in half-cycles between START and the found instruction or
 16679    the head of the BB is added to DISTANCE and returned.  */
16680
16681 static int
16682 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16683 rtx insn, int distance,
16684 rtx start, bool *found)
16685 {
16686 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16687 rtx prev = start;
16688 rtx next = NULL;
16689
16690 *found = false;
16691
16692 while (prev
16693 && prev != insn
16694 && distance < LEA_SEARCH_THRESHOLD)
16695 {
16696 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16697 {
16698 distance = increase_distance (prev, next, distance);
16699 if (insn_defines_reg (regno1, regno2, prev))
16700 {
16701 if (recog_memoized (prev) < 0
16702 || get_attr_type (prev) != TYPE_LEA)
16703 {
16704 *found = true;
16705 return distance;
16706 }
16707 }
16708
16709 next = prev;
16710 }
16711 if (prev == BB_HEAD (bb))
16712 break;
16713
16714 prev = PREV_INSN (prev);
16715 }
16716
16717 return distance;
16718 }
16719
16720 /* Search backward for non-agu definition of register number REGNO1
16721 or register number REGNO2 in INSN's basic block until
16722 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16723 2. Reach neighbour BBs boundary, or
16724 3. Reach agu definition.
16725 Returns the distance between the non-agu definition point and INSN.
16726 If no definition point, returns -1. */
16727
16728 static int
16729 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16730 rtx insn)
16731 {
16732 basic_block bb = BLOCK_FOR_INSN (insn);
16733 int distance = 0;
16734 bool found = false;
16735
16736 if (insn != BB_HEAD (bb))
16737 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16738 distance, PREV_INSN (insn),
16739 &found);
16740
16741 if (!found && distance < LEA_SEARCH_THRESHOLD)
16742 {
16743 edge e;
16744 edge_iterator ei;
16745 bool simple_loop = false;
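          /* If the basic block is its own predecessor (a simple loop), continue
             the backward scan from the end of this same block; otherwise take
             the shortest distance found over all predecessor blocks.  */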
16746
16747 FOR_EACH_EDGE (e, ei, bb->preds)
16748 if (e->src == bb)
16749 {
16750 simple_loop = true;
16751 break;
16752 }
16753
16754 if (simple_loop)
16755 distance = distance_non_agu_define_in_bb (regno1, regno2,
16756 insn, distance,
16757 BB_END (bb), &found);
16758 else
16759 {
16760 int shortest_dist = -1;
16761 bool found_in_bb = false;
16762
16763 FOR_EACH_EDGE (e, ei, bb->preds)
16764 {
16765 int bb_dist
16766 = distance_non_agu_define_in_bb (regno1, regno2,
16767 insn, distance,
16768 BB_END (e->src),
16769 &found_in_bb);
16770 if (found_in_bb)
16771 {
16772 if (shortest_dist < 0)
16773 shortest_dist = bb_dist;
16774 else if (bb_dist > 0)
16775 shortest_dist = MIN (bb_dist, shortest_dist);
16776
16777 found = true;
16778 }
16779 }
16780
16781 distance = shortest_dist;
16782 }
16783 }
16784
16785 /* get_attr_type may modify recog data. We want to make sure
16786 that recog data is valid for instruction INSN, on which
16787 distance_non_agu_define is called. INSN is unchanged here. */
16788 extract_insn_cached (insn);
16789
16790 if (!found)
16791 return -1;
16792
16793 return distance >> 1;
16794 }
16795
 16796 /* Return DISTANCE plus the distance in half-cycles between INSN and the
 16797    next insn that uses register number REGNO in a memory address.  Return
 16798    -1 if REGNO is set before such a use is found.
 16799 
 16800    Put true into *FOUND if a register usage was found and
 16801    false otherwise.
 16802    Put true into *REDEFINED if a register redefinition was
 16803    found and false otherwise.  */
16804
16805 static int
16806 distance_agu_use_in_bb (unsigned int regno,
16807 rtx insn, int distance, rtx start,
16808 bool *found, bool *redefined)
16809 {
16810 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16811 rtx next = start;
16812 rtx prev = NULL;
16813
16814 *found = false;
16815 *redefined = false;
16816
16817 while (next
16818 && next != insn
16819 && distance < LEA_SEARCH_THRESHOLD)
16820 {
16821 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16822 {
16823 distance = increase_distance(prev, next, distance);
16824 if (insn_uses_reg_mem (regno, next))
16825 {
 16826           /* Return DISTANCE if REGNO is used in a memory
 16827              address in NEXT.  */
16828 *found = true;
16829 return distance;
16830 }
16831
16832 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16833 {
 16834           /* Return -1 if REGNO is set in NEXT.  */
16835 *redefined = true;
16836 return -1;
16837 }
16838
16839 prev = next;
16840 }
16841
16842 if (next == BB_END (bb))
16843 break;
16844
16845 next = NEXT_INSN (next);
16846 }
16847
16848 return distance;
16849 }
16850
 16851 /* Return the distance between INSN and the next insn that uses
 16852    register number REGNO0 in a memory address.  Return -1 if no such
 16853    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
16854
16855 static int
16856 distance_agu_use (unsigned int regno0, rtx insn)
16857 {
16858 basic_block bb = BLOCK_FOR_INSN (insn);
16859 int distance = 0;
16860 bool found = false;
16861 bool redefined = false;
16862
16863 if (insn != BB_END (bb))
16864 distance = distance_agu_use_in_bb (regno0, insn, distance,
16865 NEXT_INSN (insn),
16866 &found, &redefined);
16867
16868 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16869 {
16870 edge e;
16871 edge_iterator ei;
16872 bool simple_loop = false;
16873
16874 FOR_EACH_EDGE (e, ei, bb->succs)
16875 if (e->dest == bb)
16876 {
16877 simple_loop = true;
16878 break;
16879 }
16880
16881 if (simple_loop)
16882 distance = distance_agu_use_in_bb (regno0, insn,
16883 distance, BB_HEAD (bb),
16884 &found, &redefined);
16885 else
16886 {
16887 int shortest_dist = -1;
16888 bool found_in_bb = false;
16889 bool redefined_in_bb = false;
16890
16891 FOR_EACH_EDGE (e, ei, bb->succs)
16892 {
16893 int bb_dist
16894 = distance_agu_use_in_bb (regno0, insn,
16895 distance, BB_HEAD (e->dest),
16896 &found_in_bb, &redefined_in_bb);
16897 if (found_in_bb)
16898 {
16899 if (shortest_dist < 0)
16900 shortest_dist = bb_dist;
16901 else if (bb_dist > 0)
16902 shortest_dist = MIN (bb_dist, shortest_dist);
16903
16904 found = true;
16905 }
16906 }
16907
16908 distance = shortest_dist;
16909 }
16910 }
16911
16912 if (!found || redefined)
16913 return -1;
16914
16915 return distance >> 1;
16916 }
16917
 16918 /* Define this macro to tune LEA priority vs ADD; it takes effect when
 16919    there is a dilemma between choosing LEA or ADD.
 16920    Negative value: ADD is preferred over LEA.
 16921    Zero: Neutral.
 16922    Positive value: LEA is preferred over ADD.  */
16923 #define IX86_LEA_PRIORITY 0
16924
 16925 /* Return true if using the lea INSN has a performance advantage
 16926    over a sequence of instructions.  The instruction sequence has
 16927    SPLIT_COST cycles higher latency than the lea latency.  */
16928
16929 static bool
16930 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16931 unsigned int regno2, int split_cost)
16932 {
16933 int dist_define, dist_use;
16934
16935 dist_define = distance_non_agu_define (regno1, regno2, insn);
16936 dist_use = distance_agu_use (regno0, insn);
16937
16938 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16939 {
 16940       /* If there is no non-AGU operand definition, no AGU
 16941          operand usage and the split cost is 0, then both the lea
 16942          and non-lea variants have the same priority.  Currently
 16943          we prefer lea for 64-bit code and the non-lea variant
 16944          for 32-bit code.  */
16945 if (dist_use < 0 && split_cost == 0)
16946 return TARGET_64BIT || IX86_LEA_PRIORITY;
16947 else
16948 return true;
16949 }
16950
 16951   /* The longer the definition distance, the more preferable lea becomes.
 16952      Here we adjust it to take the splitting cost and lea priority
 16953      into account.  */
16954 dist_define += split_cost + IX86_LEA_PRIORITY;
16955
 16956   /* If there is no use in a memory address then we just check
 16957      that the split cost exceeds the AGU stall.  */
16958 if (dist_use < 0)
16959 return dist_define > LEA_MAX_STALL;
16960
 16961   /* If this insn has both a backward non-agu dependence and a forward
 16962      agu dependence, the one with the shorter distance takes effect.  */
16963 return dist_define >= dist_use;
16964 }
16965
 16966 /* Return true if it is legal for INSN to clobber the flags register,
 16967    and false otherwise.  */
16968
16969 static bool
16970 ix86_ok_to_clobber_flags (rtx insn)
16971 {
16972 basic_block bb = BLOCK_FOR_INSN (insn);
16973 df_ref *use;
16974 bitmap live;
16975
16976 while (insn)
16977 {
16978 if (NONDEBUG_INSN_P (insn))
16979 {
16980 for (use = DF_INSN_USES (insn); *use; use++)
16981 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16982 return false;
16983
16984 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16985 return true;
16986 }
16987
16988 if (insn == BB_END (bb))
16989 break;
16990
16991 insn = NEXT_INSN (insn);
16992 }
16993
16994 live = df_get_live_out(bb);
16995 return !REGNO_REG_SET_P (live, FLAGS_REG);
16996 }
16997
16998 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16999 move and add to avoid AGU stalls. */
17000
17001 bool
17002 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17003 {
17004 unsigned int regno0, regno1, regno2;
17005
17006 /* Check if we need to optimize. */
17007 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17008 return false;
17009
17010 /* Check it is correct to split here. */
17011 if (!ix86_ok_to_clobber_flags(insn))
17012 return false;
17013
17014 regno0 = true_regnum (operands[0]);
17015 regno1 = true_regnum (operands[1]);
17016 regno2 = true_regnum (operands[2]);
17017
 17018   /* We need to split only adds with a non-destructive
 17019      destination operand.  */
17020 if (regno0 == regno1 || regno0 == regno2)
17021 return false;
17022 else
17023 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17024 }
17025
17026 /* Return true if we should emit lea instruction instead of mov
17027 instruction. */
17028
17029 bool
17030 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17031 {
17032 unsigned int regno0, regno1;
17033
17034 /* Check if we need to optimize. */
17035 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17036 return false;
17037
17038 /* Use lea for reg to reg moves only. */
17039 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17040 return false;
17041
17042 regno0 = true_regnum (operands[0]);
17043 regno1 = true_regnum (operands[1]);
17044
17045 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17046 }
17047
17048 /* Return true if we need to split lea into a sequence of
17049 instructions to avoid AGU stalls. */
17050
17051 bool
17052 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17053 {
17054 unsigned int regno0, regno1, regno2;
17055 int split_cost;
17056 struct ix86_address parts;
17057 int ok;
17058
17059 /* Check we need to optimize. */
17060 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17061 return false;
17062
17063 /* Check it is correct to split here. */
17064 if (!ix86_ok_to_clobber_flags(insn))
17065 return false;
17066
17067 ok = ix86_decompose_address (operands[1], &parts);
17068 gcc_assert (ok);
17069
17070 /* There should be at least two components in the address. */
17071 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17072 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17073 return false;
17074
 17075   /* We should not split into add if a non-legitimate pic
 17076      operand is used as the displacement.  */
17077 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17078 return false;
17079
 17080   regno0 = true_regnum (operands[0]);
17081 regno1 = INVALID_REGNUM;
17082 regno2 = INVALID_REGNUM;
17083
17084 if (parts.base)
17085 regno1 = true_regnum (parts.base);
17086 if (parts.index)
17087 regno2 = true_regnum (parts.index);
17088
17089 split_cost = 0;
17090
 17091   /* Compute how many cycles we will add to the execution time
 17092      if we split the lea into a sequence of instructions.  */
17093 if (parts.base || parts.index)
17094 {
 17095       /* Have to use a mov instruction if the non-destructive
 17096          destination form is used.  */
17097 if (regno1 != regno0 && regno2 != regno0)
17098 split_cost += 1;
17099
17100 /* Have to add index to base if both exist. */
17101 if (parts.base && parts.index)
17102 split_cost += 1;
17103
17104 /* Have to use shift and adds if scale is 2 or greater. */
17105 if (parts.scale > 1)
17106 {
17107 if (regno0 != regno1)
17108 split_cost += 1;
17109 else if (regno2 == regno0)
17110 split_cost += 4;
17111 else
17112 split_cost += parts.scale;
17113 }
17114
 17115       /* Have to use an add instruction with an immediate if
 17116          disp is nonzero.  */
17117 if (parts.disp && parts.disp != const0_rtx)
17118 split_cost += 1;
17119
17120 /* Subtract the price of lea. */
17121 split_cost -= 1;
17122 }
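          /* For example, for "lea 0x4(%rbx,%rcx,2), %rax" the split form needs
             a mov, a shift, an add of the base and an add of the displacement,
             giving split_cost = 1 + 1 + 1 + 1 - 1 = 3 extra cycles compared
             to the single lea.  */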
17123
17124 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17125 }
17126
 17127 /* Emit x86 binary operator CODE in mode MODE, where the first operand
 17128    matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
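        /* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
              (parallel [(set dst (plus:SI dst src))
                         (clobber (reg:CC FLAGS_REG))])
           as a single insn.  */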
17129
17130 static void
17131 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17132 rtx dst, rtx src)
17133 {
17134 rtx op, clob;
17135
17136 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17137 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17138
17139 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17140 }
17141
 17142 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2.  */
17143
17144 static bool
17145 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17146 {
17147 rtx prev = insn;
17148 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17149
17150 if (insn == start)
17151 return false;
17152 while (prev && prev != start)
17153 {
17154 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17155 {
17156 prev = PREV_INSN (prev);
17157 continue;
17158 }
17159 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17160 return true;
17161 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17162 return false;
17163 prev = PREV_INSN (prev);
17164 }
17165
17166 /* None of the regs is defined in the bb. */
17167 return false;
17168 }
17169
 17170 /* Split lea instructions into a sequence of instructions
 17171    which are executed on the ALU to avoid AGU stalls.
 17172    It is assumed that it is allowed to clobber the flags register
 17173    at the lea position.  */
17174
17175 void
17176 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17177 {
17178 unsigned int regno0, regno1, regno2;
17179 struct ix86_address parts;
17180 rtx target, tmp;
17181 int ok, adds;
17182
17183 ok = ix86_decompose_address (operands[1], &parts);
17184 gcc_assert (ok);
17185
17186 target = gen_lowpart (mode, operands[0]);
17187
17188 regno0 = true_regnum (target);
17189 regno1 = INVALID_REGNUM;
17190 regno2 = INVALID_REGNUM;
17191
17192 if (parts.base)
17193 {
17194 parts.base = gen_lowpart (mode, parts.base);
17195 regno1 = true_regnum (parts.base);
17196 }
17197
17198 if (parts.index)
17199 {
17200 parts.index = gen_lowpart (mode, parts.index);
17201 regno2 = true_regnum (parts.index);
17202 }
17203
17204 if (parts.disp)
17205 parts.disp = gen_lowpart (mode, parts.disp);
17206
17207 if (parts.scale > 1)
17208 {
17209 /* Case r1 = r1 + ... */
17210 if (regno1 == regno0)
17211 {
 17212           /* If we have a case r1 = r1 + C * r1 then we
 17213              would have to use multiplication, which is very
 17214              expensive.  Assume the cost model is wrong if we
 17215              end up with such a case here.  */
17216 gcc_assert (regno2 != regno0);
17217
17218 for (adds = parts.scale; adds > 0; adds--)
17219 ix86_emit_binop (PLUS, mode, target, parts.index);
17220 }
17221 else
17222 {
17223 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17224 if (regno0 != regno2)
17225 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17226
17227 /* Use shift for scaling. */
17228 ix86_emit_binop (ASHIFT, mode, target,
17229 GEN_INT (exact_log2 (parts.scale)));
17230
17231 if (parts.base)
17232 ix86_emit_binop (PLUS, mode, target, parts.base);
17233
17234 if (parts.disp && parts.disp != const0_rtx)
17235 ix86_emit_binop (PLUS, mode, target, parts.disp);
17236 }
17237 }
17238 else if (!parts.base && !parts.index)
17239 {
17240 gcc_assert(parts.disp);
17241 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17242 }
17243 else
17244 {
17245 if (!parts.base)
17246 {
17247 if (regno0 != regno2)
17248 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17249 }
17250 else if (!parts.index)
17251 {
17252 if (regno0 != regno1)
17253 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17254 }
17255 else
17256 {
17257 if (regno0 == regno1)
17258 tmp = parts.index;
17259 else if (regno0 == regno2)
17260 tmp = parts.base;
17261 else
17262 {
17263 rtx tmp1;
17264
17265 /* Find better operand for SET instruction, depending
17266 on which definition is farther from the insn. */
17267 if (find_nearest_reg_def (insn, regno1, regno2))
17268 tmp = parts.index, tmp1 = parts.base;
17269 else
17270 tmp = parts.base, tmp1 = parts.index;
17271
17272 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17273
17274 if (parts.disp && parts.disp != const0_rtx)
17275 ix86_emit_binop (PLUS, mode, target, parts.disp);
17276
17277 ix86_emit_binop (PLUS, mode, target, tmp1);
17278 return;
17279 }
17280
17281 ix86_emit_binop (PLUS, mode, target, tmp);
17282 }
17283
17284 if (parts.disp && parts.disp != const0_rtx)
17285 ix86_emit_binop (PLUS, mode, target, parts.disp);
17286 }
17287 }
17288
 17289 /* Return true if it is ok to optimize an ADD operation to an LEA
 17290    operation to avoid flag register consumption.  For most processors,
 17291    ADD is faster than LEA.  For processors like ATOM, if the
 17292    destination register of the LEA holds an actual address which will
 17293    be used soon, LEA is better; otherwise ADD is better.  */
17294
17295 bool
17296 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17297 {
17298 unsigned int regno0 = true_regnum (operands[0]);
17299 unsigned int regno1 = true_regnum (operands[1]);
17300 unsigned int regno2 = true_regnum (operands[2]);
17301
 17302   /* If a = b + c, (a != b && a != c), we must use the lea form.  */
17303 if (regno0 != regno1 && regno0 != regno2)
17304 return true;
17305
17306 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17307 return false;
17308
17309 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17310 }
17311
17312 /* Return true if destination reg of SET_BODY is shift count of
17313 USE_BODY. */
17314
17315 static bool
17316 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17317 {
17318 rtx set_dest;
17319 rtx shift_rtx;
17320 int i;
17321
17322 /* Retrieve destination of SET_BODY. */
17323 switch (GET_CODE (set_body))
17324 {
17325 case SET:
17326 set_dest = SET_DEST (set_body);
17327 if (!set_dest || !REG_P (set_dest))
17328 return false;
17329 break;
17330 case PARALLEL:
17331 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17332 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17333 use_body))
17334 return true;
17335 default:
17336 return false;
17337 break;
17338 }
17339
17340 /* Retrieve shift count of USE_BODY. */
17341 switch (GET_CODE (use_body))
17342 {
17343 case SET:
17344 shift_rtx = XEXP (use_body, 1);
17345 break;
17346 case PARALLEL:
17347 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17348 if (ix86_dep_by_shift_count_body (set_body,
17349 XVECEXP (use_body, 0, i)))
17350 return true;
17351 default:
17352 return false;
17353 break;
17354 }
17355
17356 if (shift_rtx
17357 && (GET_CODE (shift_rtx) == ASHIFT
17358 || GET_CODE (shift_rtx) == LSHIFTRT
17359 || GET_CODE (shift_rtx) == ASHIFTRT
17360 || GET_CODE (shift_rtx) == ROTATE
17361 || GET_CODE (shift_rtx) == ROTATERT))
17362 {
17363 rtx shift_count = XEXP (shift_rtx, 1);
17364
17365 /* Return true if shift count is dest of SET_BODY. */
17366 if (REG_P (shift_count)
17367 && true_regnum (set_dest) == true_regnum (shift_count))
17368 return true;
17369 }
17370
17371 return false;
17372 }
17373
17374 /* Return true if destination reg of SET_INSN is shift count of
17375 USE_INSN. */
17376
17377 bool
17378 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17379 {
17380 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17381 PATTERN (use_insn));
17382 }
17383
17384 /* Return TRUE or FALSE depending on whether the unary operator meets the
17385 appropriate constraints. */
17386
17387 bool
17388 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17389 enum machine_mode mode ATTRIBUTE_UNUSED,
17390 rtx operands[2] ATTRIBUTE_UNUSED)
17391 {
17392 /* If one of operands is memory, source and destination must match. */
17393 if ((MEM_P (operands[0])
17394 || MEM_P (operands[1]))
17395 && ! rtx_equal_p (operands[0], operands[1]))
17396 return false;
17397 return true;
17398 }
17399
17400 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17401 are ok, keeping in mind the possible movddup alternative. */
17402
17403 bool
17404 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17405 {
17406 if (MEM_P (operands[0]))
17407 return rtx_equal_p (operands[0], operands[1 + high]);
17408 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17409 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17410 return true;
17411 }
17412
17413 /* Post-reload splitter for converting an SF or DFmode value in an
17414 SSE register into an unsigned SImode. */
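        /* This computes (unsigned) x as
              x < 0x1p31 ? (int) x : ((int) (x - 0x1p31)) ^ 0x80000000
           branchlessly: a compare produces a mask, the mask selects the 0x1p31
           subtrahend, and the same mask, shifted into the sign bit position,
           is XORed into the truncated result.  */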
17415
17416 void
17417 ix86_split_convert_uns_si_sse (rtx operands[])
17418 {
17419 enum machine_mode vecmode;
17420 rtx value, large, zero_or_two31, input, two31, x;
17421
17422 large = operands[1];
17423 zero_or_two31 = operands[2];
17424 input = operands[3];
17425 two31 = operands[4];
17426 vecmode = GET_MODE (large);
17427 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17428
17429 /* Load up the value into the low element. We must ensure that the other
17430 elements are valid floats -- zero is the easiest such value. */
17431 if (MEM_P (input))
17432 {
17433 if (vecmode == V4SFmode)
17434 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17435 else
17436 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17437 }
17438 else
17439 {
17440 input = gen_rtx_REG (vecmode, REGNO (input));
17441 emit_move_insn (value, CONST0_RTX (vecmode));
17442 if (vecmode == V4SFmode)
17443 emit_insn (gen_sse_movss (value, value, input));
17444 else
17445 emit_insn (gen_sse2_movsd (value, value, input));
17446 }
17447
17448 emit_move_insn (large, two31);
17449 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17450
17451 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17452 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17453
17454 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17455 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17456
17457 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17458 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17459
17460 large = gen_rtx_REG (V4SImode, REGNO (large));
17461 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17462
17463 x = gen_rtx_REG (V4SImode, REGNO (value));
17464 if (vecmode == V4SFmode)
17465 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17466 else
17467 emit_insn (gen_sse2_cvttpd2dq (x, value));
17468 value = x;
17469
17470 emit_insn (gen_xorv4si3 (value, value, large));
17471 }
17472
17473 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17474 Expects the 64-bit DImode to be supplied in a pair of integral
17475 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17476 -mfpmath=sse, !optimize_size only. */
17477
17478 void
17479 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17480 {
17481 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17482 rtx int_xmm, fp_xmm;
17483 rtx biases, exponents;
17484 rtx x;
17485
17486 int_xmm = gen_reg_rtx (V4SImode);
17487 if (TARGET_INTER_UNIT_MOVES)
17488 emit_insn (gen_movdi_to_sse (int_xmm, input));
17489 else if (TARGET_SSE_SPLIT_REGS)
17490 {
17491 emit_clobber (int_xmm);
17492 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17493 }
17494 else
17495 {
17496 x = gen_reg_rtx (V2DImode);
17497 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17498 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17499 }
17500
17501 x = gen_rtx_CONST_VECTOR (V4SImode,
17502 gen_rtvec (4, GEN_INT (0x43300000UL),
17503 GEN_INT (0x45300000UL),
17504 const0_rtx, const0_rtx));
17505 exponents = validize_mem (force_const_mem (V4SImode, x));
17506
17507 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17508 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17509
17510 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17511 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17512 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17513 (0x1.0p84 + double(fp_value_hi_xmm)).
17514 Note these exponents differ by 32. */
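          /* Worked example: for the input 0x0000000100000005 (hi = 1, lo = 5)
             the two doubles are 0x1.0p52 + 5 and 0x1.0p84 + 0x1.0p32; after
             subtracting the biases and adding the two halves we get
             0x1.0p32 + 5, i.e. the original unsigned value.  */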
17515
17516 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17517
17518 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17519 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17520 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17521 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17522 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17523 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17524 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17525 biases = validize_mem (force_const_mem (V2DFmode, biases));
17526 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17527
17528 /* Add the upper and lower DFmode values together. */
17529 if (TARGET_SSE3)
17530 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17531 else
17532 {
17533 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17534 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17535 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17536 }
17537
17538 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17539 }
17540
17541 /* Not used, but eases macroization of patterns. */
17542 void
17543 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17544 rtx input ATTRIBUTE_UNUSED)
17545 {
17546 gcc_unreachable ();
17547 }
17548
17549 /* Convert an unsigned SImode value into a DFmode. Only currently used
17550 for SSE, but applicable anywhere. */
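        /* The value is converted by first adding -0x80000000, so the unsigned
           input becomes a signed SImode quantity, converting that with a signed
           int-to-double conversion, and then adding 0x1.0p31 back: for any
           u in [0, 2**32) this yields (double) (u - 2**31) + 2**31 == u
           exactly.  */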
17551
17552 void
17553 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17554 {
17555 REAL_VALUE_TYPE TWO31r;
17556 rtx x, fp;
17557
17558 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17559 NULL, 1, OPTAB_DIRECT);
17560
17561 fp = gen_reg_rtx (DFmode);
17562 emit_insn (gen_floatsidf2 (fp, x));
17563
17564 real_ldexp (&TWO31r, &dconst1, 31);
17565 x = const_double_from_real_value (TWO31r, DFmode);
17566
17567 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17568 if (x != target)
17569 emit_move_insn (target, x);
17570 }
17571
17572 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17573 32-bit mode; otherwise we have a direct convert instruction. */
17574
17575 void
17576 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17577 {
17578 REAL_VALUE_TYPE TWO32r;
17579 rtx fp_lo, fp_hi, x;
17580
17581 fp_lo = gen_reg_rtx (DFmode);
17582 fp_hi = gen_reg_rtx (DFmode);
17583
17584 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17585
17586 real_ldexp (&TWO32r, &dconst1, 32);
17587 x = const_double_from_real_value (TWO32r, DFmode);
17588 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17589
17590 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17591
17592 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17593 0, OPTAB_DIRECT);
17594 if (x != target)
17595 emit_move_insn (target, x);
17596 }
17597
 17598 /* Convert an unsigned SImode value into an SFmode, using only SSE.
17599 For x86_32, -mfpmath=sse, !optimize_size only. */
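        /* The value is split into 16-bit halves and recombined as
              (float) (val >> 16) * 0x1p16 + (float) (val & 0xffff),
           so both int-to-float conversions are exact and only the final
           addition rounds.  */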
17600 void
17601 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17602 {
17603 REAL_VALUE_TYPE ONE16r;
17604 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17605
17606 real_ldexp (&ONE16r, &dconst1, 16);
17607 x = const_double_from_real_value (ONE16r, SFmode);
17608 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17609 NULL, 0, OPTAB_DIRECT);
17610 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17611 NULL, 0, OPTAB_DIRECT);
17612 fp_hi = gen_reg_rtx (SFmode);
17613 fp_lo = gen_reg_rtx (SFmode);
17614 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17615 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17616 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17617 0, OPTAB_DIRECT);
17618 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17619 0, OPTAB_DIRECT);
17620 if (!rtx_equal_p (target, fp_hi))
17621 emit_move_insn (target, fp_hi);
17622 }
17623
17624 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17625 a vector of unsigned ints VAL to vector of floats TARGET. */
17626
17627 void
17628 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17629 {
17630 rtx tmp[8];
17631 REAL_VALUE_TYPE TWO16r;
17632 enum machine_mode intmode = GET_MODE (val);
17633 enum machine_mode fltmode = GET_MODE (target);
17634 rtx (*cvt) (rtx, rtx);
17635
17636 if (intmode == V4SImode)
17637 cvt = gen_floatv4siv4sf2;
17638 else
17639 cvt = gen_floatv8siv8sf2;
17640 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17641 tmp[0] = force_reg (intmode, tmp[0]);
17642 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17643 OPTAB_DIRECT);
17644 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17645 NULL_RTX, 1, OPTAB_DIRECT);
17646 tmp[3] = gen_reg_rtx (fltmode);
17647 emit_insn (cvt (tmp[3], tmp[1]));
17648 tmp[4] = gen_reg_rtx (fltmode);
17649 emit_insn (cvt (tmp[4], tmp[2]));
17650 real_ldexp (&TWO16r, &dconst1, 16);
17651 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17652 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17653 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17654 OPTAB_DIRECT);
17655 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17656 OPTAB_DIRECT);
17657 if (tmp[7] != target)
17658 emit_move_insn (target, tmp[7]);
17659 }
17660
17661 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17662 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17663 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17664 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17665
17666 rtx
17667 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17668 {
17669 REAL_VALUE_TYPE TWO31r;
17670 rtx two31r, tmp[4];
17671 enum machine_mode mode = GET_MODE (val);
17672 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17673 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17674 rtx (*cmp) (rtx, rtx, rtx, rtx);
17675 int i;
17676
17677 for (i = 0; i < 3; i++)
17678 tmp[i] = gen_reg_rtx (mode);
17679 real_ldexp (&TWO31r, &dconst1, 31);
17680 two31r = const_double_from_real_value (TWO31r, scalarmode);
17681 two31r = ix86_build_const_vector (mode, 1, two31r);
17682 two31r = force_reg (mode, two31r);
17683 switch (mode)
17684 {
17685 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17686 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17687 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17688 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17689 default: gcc_unreachable ();
17690 }
17691 tmp[3] = gen_rtx_LE (mode, two31r, val);
17692 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17693 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17694 0, OPTAB_DIRECT);
17695 if (intmode == V4SImode || TARGET_AVX2)
17696 *xorp = expand_simple_binop (intmode, ASHIFT,
17697 gen_lowpart (intmode, tmp[0]),
17698 GEN_INT (31), NULL_RTX, 0,
17699 OPTAB_DIRECT);
17700 else
17701 {
17702 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17703 two31 = ix86_build_const_vector (intmode, 1, two31);
17704 *xorp = expand_simple_binop (intmode, AND,
17705 gen_lowpart (intmode, tmp[0]),
17706 two31, NULL_RTX, 0,
17707 OPTAB_DIRECT);
17708 }
17709 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17710 0, OPTAB_DIRECT);
17711 }
17712
17713 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17714 then replicate the value for all elements of the vector
17715 register. */
17716
17717 rtx
17718 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17719 {
17720 int i, n_elt;
17721 rtvec v;
17722 enum machine_mode scalar_mode;
17723
17724 switch (mode)
17725 {
17726 case V32QImode:
17727 case V16QImode:
17728 case V16HImode:
17729 case V8HImode:
17730 case V8SImode:
17731 case V4SImode:
17732 case V4DImode:
17733 case V2DImode:
17734 gcc_assert (vect);
17735 case V8SFmode:
17736 case V4SFmode:
17737 case V4DFmode:
17738 case V2DFmode:
17739 n_elt = GET_MODE_NUNITS (mode);
17740 v = rtvec_alloc (n_elt);
17741 scalar_mode = GET_MODE_INNER (mode);
17742
17743 RTVEC_ELT (v, 0) = value;
17744
17745 for (i = 1; i < n_elt; ++i)
17746 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17747
17748 return gen_rtx_CONST_VECTOR (mode, v);
17749
17750 default:
17751 gcc_unreachable ();
17752 }
17753 }
17754
17755 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17756 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17757 for an SSE register. If VECT is true, then replicate the mask for
17758 all elements of the vector register. If INVERT is true, then create
17759 a mask excluding the sign bit. */
17760
17761 rtx
17762 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17763 {
17764 enum machine_mode vec_mode, imode;
17765 HOST_WIDE_INT hi, lo;
17766 int shift = 63;
17767 rtx v;
17768 rtx mask;
17769
17770 /* Find the sign bit, sign extended to 2*HWI. */
17771 switch (mode)
17772 {
17773 case V8SImode:
17774 case V4SImode:
17775 case V8SFmode:
17776 case V4SFmode:
17777 vec_mode = mode;
17778 mode = GET_MODE_INNER (mode);
17779 imode = SImode;
17780 lo = 0x80000000, hi = lo < 0;
17781 break;
17782
17783 case V4DImode:
17784 case V2DImode:
17785 case V4DFmode:
17786 case V2DFmode:
17787 vec_mode = mode;
17788 mode = GET_MODE_INNER (mode);
17789 imode = DImode;
17790 if (HOST_BITS_PER_WIDE_INT >= 64)
17791 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17792 else
17793 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17794 break;
17795
17796 case TImode:
17797 case TFmode:
17798 vec_mode = VOIDmode;
17799 if (HOST_BITS_PER_WIDE_INT >= 64)
17800 {
17801 imode = TImode;
17802 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17803 }
17804 else
17805 {
17806 rtvec vec;
17807
17808 imode = DImode;
17809 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17810
17811 if (invert)
17812 {
17813 lo = ~lo, hi = ~hi;
17814 v = constm1_rtx;
17815 }
17816 else
17817 v = const0_rtx;
17818
17819 mask = immed_double_const (lo, hi, imode);
17820
17821 vec = gen_rtvec (2, v, mask);
17822 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17823 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17824
17825 return v;
17826 }
17827 break;
17828
17829 default:
17830 gcc_unreachable ();
17831 }
17832
17833 if (invert)
17834 lo = ~lo, hi = ~hi;
17835
17836 /* Force this value into the low part of a fp vector constant. */
17837 mask = immed_double_const (lo, hi, imode);
17838 mask = gen_lowpart (mode, mask);
17839
17840 if (vec_mode == VOIDmode)
17841 return force_reg (mode, mask);
17842
17843 v = ix86_build_const_vector (vec_mode, vect, mask);
17844 return force_reg (vec_mode, v);
17845 }
17846
17847 /* Generate code for floating point ABS or NEG. */
17848
17849 void
17850 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17851 rtx operands[])
17852 {
17853 rtx mask, set, dst, src;
17854 bool use_sse = false;
17855 bool vector_mode = VECTOR_MODE_P (mode);
17856 enum machine_mode vmode = mode;
17857
17858 if (vector_mode)
17859 use_sse = true;
17860 else if (mode == TFmode)
17861 use_sse = true;
17862 else if (TARGET_SSE_MATH)
17863 {
17864 use_sse = SSE_FLOAT_MODE_P (mode);
17865 if (mode == SFmode)
17866 vmode = V4SFmode;
17867 else if (mode == DFmode)
17868 vmode = V2DFmode;
17869 }
17870
17871 /* NEG and ABS performed with SSE use bitwise mask operations.
17872 Create the appropriate mask now. */
17873 if (use_sse)
17874 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17875 else
17876 mask = NULL_RTX;
17877
17878 dst = operands[0];
17879 src = operands[1];
17880
17881 set = gen_rtx_fmt_e (code, mode, src);
17882 set = gen_rtx_SET (VOIDmode, dst, set);
17883
17884 if (mask)
17885 {
17886 rtx use, clob;
17887 rtvec par;
17888
17889 use = gen_rtx_USE (VOIDmode, mask);
17890 if (vector_mode)
17891 par = gen_rtvec (2, set, use);
17892 else
17893 {
17894 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17895 par = gen_rtvec (3, set, use, clob);
17896 }
17897 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17898 }
17899 else
17900 emit_insn (set);
17901 }
17902
17903 /* Expand a copysign operation. Special case operand 0 being a constant. */
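        /* Conceptually, copysign (op0, op1) is computed with bit masks as
              (op0 & ~SIGNMASK) | (op1 & SIGNMASK)
           where SIGNMASK has only the sign bit of each element set; the
           ix86_split_copysign_const/_var routines below emit the AND/IOR
           sequence using that mask and its complement, with a constant op0
           reduced to its absolute value up front.  */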
17904
17905 void
17906 ix86_expand_copysign (rtx operands[])
17907 {
17908 enum machine_mode mode, vmode;
17909 rtx dest, op0, op1, mask, nmask;
17910
17911 dest = operands[0];
17912 op0 = operands[1];
17913 op1 = operands[2];
17914
17915 mode = GET_MODE (dest);
17916
17917 if (mode == SFmode)
17918 vmode = V4SFmode;
17919 else if (mode == DFmode)
17920 vmode = V2DFmode;
17921 else
17922 vmode = mode;
17923
17924 if (GET_CODE (op0) == CONST_DOUBLE)
17925 {
17926 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17927
17928 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17929 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17930
17931 if (mode == SFmode || mode == DFmode)
17932 {
17933 if (op0 == CONST0_RTX (mode))
17934 op0 = CONST0_RTX (vmode);
17935 else
17936 {
17937 rtx v = ix86_build_const_vector (vmode, false, op0);
17938
17939 op0 = force_reg (vmode, v);
17940 }
17941 }
17942 else if (op0 != CONST0_RTX (mode))
17943 op0 = force_reg (mode, op0);
17944
17945 mask = ix86_build_signbit_mask (vmode, 0, 0);
17946
17947 if (mode == SFmode)
17948 copysign_insn = gen_copysignsf3_const;
17949 else if (mode == DFmode)
17950 copysign_insn = gen_copysigndf3_const;
17951 else
17952 copysign_insn = gen_copysigntf3_const;
17953
17954 emit_insn (copysign_insn (dest, op0, op1, mask));
17955 }
17956 else
17957 {
17958 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17959
17960 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17961 mask = ix86_build_signbit_mask (vmode, 0, 0);
17962
17963 if (mode == SFmode)
17964 copysign_insn = gen_copysignsf3_var;
17965 else if (mode == DFmode)
17966 copysign_insn = gen_copysigndf3_var;
17967 else
17968 copysign_insn = gen_copysigntf3_var;
17969
17970 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17971 }
17972 }
17973
17974 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17975 be a constant, and so has already been expanded into a vector constant. */
17976
17977 void
17978 ix86_split_copysign_const (rtx operands[])
17979 {
17980 enum machine_mode mode, vmode;
17981 rtx dest, op0, mask, x;
17982
17983 dest = operands[0];
17984 op0 = operands[1];
17985 mask = operands[3];
17986
17987 mode = GET_MODE (dest);
17988 vmode = GET_MODE (mask);
17989
17990 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17991 x = gen_rtx_AND (vmode, dest, mask);
17992 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17993
17994 if (op0 != CONST0_RTX (vmode))
17995 {
17996 x = gen_rtx_IOR (vmode, dest, op0);
17997 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17998 }
17999 }
18000
18001 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18002 so we have to do two masks. */
18003
18004 void
18005 ix86_split_copysign_var (rtx operands[])
18006 {
18007 enum machine_mode mode, vmode;
18008 rtx dest, scratch, op0, op1, mask, nmask, x;
18009
18010 dest = operands[0];
18011 scratch = operands[1];
18012 op0 = operands[2];
18013 op1 = operands[3];
18014 nmask = operands[4];
18015 mask = operands[5];
18016
18017 mode = GET_MODE (dest);
18018 vmode = GET_MODE (mask);
18019
18020 if (rtx_equal_p (op0, op1))
18021 {
18022 /* Shouldn't happen often (it's useless, obviously), but when it does
18023 we'd generate incorrect code if we continue below. */
18024 emit_move_insn (dest, op0);
18025 return;
18026 }
18027
18028 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18029 {
18030 gcc_assert (REGNO (op1) == REGNO (scratch));
18031
18032 x = gen_rtx_AND (vmode, scratch, mask);
18033 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18034
18035 dest = mask;
18036 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18037 x = gen_rtx_NOT (vmode, dest);
18038 x = gen_rtx_AND (vmode, x, op0);
18039 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18040 }
18041 else
18042 {
18043 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18044 {
18045 x = gen_rtx_AND (vmode, scratch, mask);
18046 }
18047 else /* alternative 2,4 */
18048 {
18049 gcc_assert (REGNO (mask) == REGNO (scratch));
18050 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18051 x = gen_rtx_AND (vmode, scratch, op1);
18052 }
18053 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18054
18055 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18056 {
18057 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18058 x = gen_rtx_AND (vmode, dest, nmask);
18059 }
18060 else /* alternative 3,4 */
18061 {
18062 gcc_assert (REGNO (nmask) == REGNO (dest));
18063 dest = nmask;
18064 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18065 x = gen_rtx_AND (vmode, dest, op0);
18066 }
18067 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18068 }
18069
18070 x = gen_rtx_IOR (vmode, dest, scratch);
18071 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18072 }
18073
18074 /* Return TRUE or FALSE depending on whether the first SET in INSN
18075 has source and destination with matching CC modes, and whether the
18076 CC mode is at least as constrained as REQ_MODE. */
18077
18078 bool
18079 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18080 {
18081 rtx set;
18082 enum machine_mode set_mode;
18083
18084 set = PATTERN (insn);
18085 if (GET_CODE (set) == PARALLEL)
18086 set = XVECEXP (set, 0, 0);
18087 gcc_assert (GET_CODE (set) == SET);
18088 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18089
18090 set_mode = GET_MODE (SET_DEST (set));
18091 switch (set_mode)
18092 {
18093 case CCNOmode:
18094 if (req_mode != CCNOmode
18095 && (req_mode != CCmode
18096 || XEXP (SET_SRC (set), 1) != const0_rtx))
18097 return false;
18098 break;
18099 case CCmode:
18100 if (req_mode == CCGCmode)
18101 return false;
18102 /* FALLTHRU */
18103 case CCGCmode:
18104 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18105 return false;
18106 /* FALLTHRU */
18107 case CCGOCmode:
18108 if (req_mode == CCZmode)
18109 return false;
18110 /* FALLTHRU */
18111 case CCZmode:
18112 break;
18113
18114 case CCAmode:
18115 case CCCmode:
18116 case CCOmode:
18117 case CCSmode:
18118 if (set_mode != req_mode)
18119 return false;
18120 break;
18121
18122 default:
18123 gcc_unreachable ();
18124 }
18125
18126 return GET_MODE (SET_SRC (set)) == set_mode;
18127 }
18128
18129 /* Generate insn patterns to do an integer compare of OPERANDS. */
18130
18131 static rtx
18132 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18133 {
18134 enum machine_mode cmpmode;
18135 rtx tmp, flags;
18136
18137 cmpmode = SELECT_CC_MODE (code, op0, op1);
18138 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18139
18140 /* This is very simple, but making the interface the same as in the
18141 FP case makes the rest of the code easier. */
18142 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18143 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18144
18145 /* Return the test that should be put into the flags user, i.e.
18146 the bcc, scc, or cmov instruction. */
18147 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18148 }
18149
18150 /* Figure out whether to use ordered or unordered fp comparisons.
18151 Return the appropriate mode to use. */
18152
18153 enum machine_mode
18154 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18155 {
18156 /* ??? In order to make all comparisons reversible, we do all comparisons
18157 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18158 all forms of trapping and nontrapping comparisons, we can make inequality
18159 comparisons trapping again, since it results in better code when using
18160 FCOM based compares. */
18161 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18162 }
18163
18164 enum machine_mode
18165 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18166 {
18167 enum machine_mode mode = GET_MODE (op0);
18168
18169 if (SCALAR_FLOAT_MODE_P (mode))
18170 {
18171 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18172 return ix86_fp_compare_mode (code);
18173 }
18174
18175 switch (code)
18176 {
18177 /* Only zero flag is needed. */
18178 case EQ: /* ZF=0 */
18179 case NE: /* ZF!=0 */
18180 return CCZmode;
18181 /* Codes needing carry flag. */
18182 case GEU: /* CF=0 */
18183 case LTU: /* CF=1 */
18184 /* Detect overflow checks. They need just the carry flag. */
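/* That is, the comparison is (a + b) against a; the sum is unsigned-less
than a exactly when the addition carried, so only CF is needed. */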
18185 if (GET_CODE (op0) == PLUS
18186 && rtx_equal_p (op1, XEXP (op0, 0)))
18187 return CCCmode;
18188 else
18189 return CCmode;
18190 case GTU: /* CF=0 & ZF=0 */
18191 case LEU: /* CF=1 | ZF=1 */
18192 /* Detect overflow checks. They need just the carry flag. */
18193 if (GET_CODE (op0) == MINUS
18194 && rtx_equal_p (op1, XEXP (op0, 0)))
18195 return CCCmode;
18196 else
18197 return CCmode;
18198 /* Codes possibly doable only with sign flag when
18199 comparing against zero. */
18200 case GE: /* SF=OF or SF=0 */
18201 case LT: /* SF<>OF or SF=1 */
18202 if (op1 == const0_rtx)
18203 return CCGOCmode;
18204 else
18205 /* For other cases the carry flag is not required. */
18206 return CCGCmode;
18207 /* Codes doable only with sign flag when comparing
18208 against zero, but we lack a jump instruction for it,
18209 so we need to use relational tests against overflow,
18210 which thus needs to be zero. */
18211 case GT: /* ZF=0 & SF=OF */
18212 case LE: /* ZF=1 | SF<>OF */
18213 if (op1 == const0_rtx)
18214 return CCNOmode;
18215 else
18216 return CCGCmode;
18217 /* The strcmp pattern does (use flags), and combine may ask us for the
18218 proper mode. */
18219 case USE:
18220 return CCmode;
18221 default:
18222 gcc_unreachable ();
18223 }
18224 }
18225
18226 /* Return the fixed registers used for condition codes. */
18227
18228 static bool
18229 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18230 {
18231 *p1 = FLAGS_REG;
18232 *p2 = FPSR_REG;
18233 return true;
18234 }
18235
18236 /* If two condition code modes are compatible, return a condition code
18237 mode which is compatible with both. Otherwise, return
18238 VOIDmode. */
18239
18240 static enum machine_mode
18241 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18242 {
18243 if (m1 == m2)
18244 return m1;
18245
18246 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18247 return VOIDmode;
18248
18249 if ((m1 == CCGCmode && m2 == CCGOCmode)
18250 || (m1 == CCGOCmode && m2 == CCGCmode))
18251 return CCGCmode;
18252
18253 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18254 return m2;
18255 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18256 return m1;
18257
18258 switch (m1)
18259 {
18260 default:
18261 gcc_unreachable ();
18262
18263 case CCmode:
18264 case CCGCmode:
18265 case CCGOCmode:
18266 case CCNOmode:
18267 case CCAmode:
18268 case CCCmode:
18269 case CCOmode:
18270 case CCSmode:
18271 case CCZmode:
18272 switch (m2)
18273 {
18274 default:
18275 return VOIDmode;
18276
18277 case CCmode:
18278 case CCGCmode:
18279 case CCGOCmode:
18280 case CCNOmode:
18281 case CCAmode:
18282 case CCCmode:
18283 case CCOmode:
18284 case CCSmode:
18285 case CCZmode:
18286 return CCmode;
18287 }
18288
18289 case CCFPmode:
18290 case CCFPUmode:
18291 /* These are only compatible with themselves, which we already
18292 checked above. */
18293 return VOIDmode;
18294 }
18295 }
18296
18297
18298 /* Return a comparison we can do that is equivalent to
18299 swap_condition (code), apart possibly from orderedness.
18300 But never change orderedness if TARGET_IEEE_FP, returning
18301 UNKNOWN in that case if necessary. */
18302
18303 static enum rtx_code
18304 ix86_fp_swap_condition (enum rtx_code code)
18305 {
18306 switch (code)
18307 {
18308 case GT: /* GTU - CF=0 & ZF=0 */
18309 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18310 case GE: /* GEU - CF=0 */
18311 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18312 case UNLT: /* LTU - CF=1 */
18313 return TARGET_IEEE_FP ? UNKNOWN : GT;
18314 case UNLE: /* LEU - CF=1 | ZF=1 */
18315 return TARGET_IEEE_FP ? UNKNOWN : GE;
18316 default:
18317 return swap_condition (code);
18318 }
18319 }
18320
18321 /* Return the cost of comparison CODE using the best strategy for performance.
18322 All of the following functions use the number of instructions as a cost metric.
18323 In the future this should be tweaked to compute bytes for optimize_size and
18324 to take into account the performance of various instructions on various CPUs. */
18325
18326 static int
18327 ix86_fp_comparison_cost (enum rtx_code code)
18328 {
18329 int arith_cost;
18330
18331 /* The cost of code using bit-twiddling on %ah. */
18332 switch (code)
18333 {
18334 case UNLE:
18335 case UNLT:
18336 case LTGT:
18337 case GT:
18338 case GE:
18339 case UNORDERED:
18340 case ORDERED:
18341 case UNEQ:
18342 arith_cost = 4;
18343 break;
18344 case LT:
18345 case NE:
18346 case EQ:
18347 case UNGE:
18348 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18349 break;
18350 case LE:
18351 case UNGT:
18352 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18353 break;
18354 default:
18355 gcc_unreachable ();
18356 }
18357
18358 switch (ix86_fp_comparison_strategy (code))
18359 {
18360 case IX86_FPCMP_COMI:
18361 return arith_cost > 4 ? 3 : 2;
18362 case IX86_FPCMP_SAHF:
18363 return arith_cost > 4 ? 4 : 3;
18364 default:
18365 return arith_cost;
18366 }
18367 }
18368
18369 /* Return the strategy to use for floating-point comparisons. We assume that
18370 fcomi is always preferable where available, since that is also true when
18371 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18372
18373 enum ix86_fpcmp_strategy
18374 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18375 {
18376 /* Do fcomi/sahf based test when profitable. */
18377
18378 if (TARGET_CMOVE)
18379 return IX86_FPCMP_COMI;
18380
18381 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18382 return IX86_FPCMP_SAHF;
18383
18384 return IX86_FPCMP_ARITH;
18385 }
18386
18387 /* Swap, force into registers, or otherwise massage the two operands
18388 to a fp comparison. The operands are updated in place; the new
18389 comparison code is returned. */
18390
18391 static enum rtx_code
18392 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18393 {
18394 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18395 rtx op0 = *pop0, op1 = *pop1;
18396 enum machine_mode op_mode = GET_MODE (op0);
18397 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18398
18399 /* All of the unordered compare instructions only work on registers.
18400 The same is true of the fcomi compare instructions. The XFmode
18401 compare instructions require registers except when comparing
18402 against zero or when converting operand 1 from fixed point to
18403 floating point. */
18404
18405 if (!is_sse
18406 && (fpcmp_mode == CCFPUmode
18407 || (op_mode == XFmode
18408 && ! (standard_80387_constant_p (op0) == 1
18409 || standard_80387_constant_p (op1) == 1)
18410 && GET_CODE (op1) != FLOAT)
18411 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18412 {
18413 op0 = force_reg (op_mode, op0);
18414 op1 = force_reg (op_mode, op1);
18415 }
18416 else
18417 {
18418 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18419 things around if they appear profitable, otherwise force op0
18420 into a register. */
18421
18422 if (standard_80387_constant_p (op0) == 0
18423 || (MEM_P (op0)
18424 && ! (standard_80387_constant_p (op1) == 0
18425 || MEM_P (op1))))
18426 {
18427 enum rtx_code new_code = ix86_fp_swap_condition (code);
18428 if (new_code != UNKNOWN)
18429 {
18430 rtx tmp;
18431 tmp = op0, op0 = op1, op1 = tmp;
18432 code = new_code;
18433 }
18434 }
18435
18436 if (!REG_P (op0))
18437 op0 = force_reg (op_mode, op0);
18438
18439 if (CONSTANT_P (op1))
18440 {
18441 int tmp = standard_80387_constant_p (op1);
18442 if (tmp == 0)
18443 op1 = validize_mem (force_const_mem (op_mode, op1));
18444 else if (tmp == 1)
18445 {
18446 if (TARGET_CMOVE)
18447 op1 = force_reg (op_mode, op1);
18448 }
18449 else
18450 op1 = force_reg (op_mode, op1);
18451 }
18452 }
18453
18454 /* Try to rearrange the comparison to make it cheaper. */
18455 if (ix86_fp_comparison_cost (code)
18456 > ix86_fp_comparison_cost (swap_condition (code))
18457 && (REG_P (op1) || can_create_pseudo_p ()))
18458 {
18459 rtx tmp;
18460 tmp = op0, op0 = op1, op1 = tmp;
18461 code = swap_condition (code);
18462 if (!REG_P (op0))
18463 op0 = force_reg (op_mode, op0);
18464 }
18465
18466 *pop0 = op0;
18467 *pop1 = op1;
18468 return code;
18469 }
18470
18471 /* Convert the comparison codes we use to represent FP comparisons to the
18472 integer code that will result in a proper branch. Return UNKNOWN if no such code
18473 is available. */
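/* After fcomi or fnstsw/sahf the x87 result flags C0/C2/C3 end up in
CF/PF/ZF, so the FP results map onto the unsigned integer conditions:
e.g. an ordered greater-than shows up as "above" (GTU). */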
18474
18475 enum rtx_code
18476 ix86_fp_compare_code_to_integer (enum rtx_code code)
18477 {
18478 switch (code)
18479 {
18480 case GT:
18481 return GTU;
18482 case GE:
18483 return GEU;
18484 case ORDERED:
18485 case UNORDERED:
18486 return code;
18487 break;
18488 case UNEQ:
18489 return EQ;
18490 break;
18491 case UNLT:
18492 return LTU;
18493 break;
18494 case UNLE:
18495 return LEU;
18496 break;
18497 case LTGT:
18498 return NE;
18499 break;
18500 default:
18501 return UNKNOWN;
18502 }
18503 }
18504
18505 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18506
18507 static rtx
18508 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18509 {
18510 enum machine_mode fpcmp_mode, intcmp_mode;
18511 rtx tmp, tmp2;
18512
18513 fpcmp_mode = ix86_fp_compare_mode (code);
18514 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18515
18516 /* Do fcomi/sahf based test when profitable. */
18517 switch (ix86_fp_comparison_strategy (code))
18518 {
18519 case IX86_FPCMP_COMI:
18520 intcmp_mode = fpcmp_mode;
18521 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18522 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18523 tmp);
18524 emit_insn (tmp);
18525 break;
18526
18527 case IX86_FPCMP_SAHF:
18528 intcmp_mode = fpcmp_mode;
18529 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18530 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18531 tmp);
18532
18533 if (!scratch)
18534 scratch = gen_reg_rtx (HImode);
18535 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18536 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18537 break;
18538
18539 case IX86_FPCMP_ARITH:
18540 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18541 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18542 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18543 if (!scratch)
18544 scratch = gen_reg_rtx (HImode);
18545 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18546
18547 /* In the unordered case, we have to check C2 for NaNs, which
18548 doesn't happen to work out to anything nice combination-wise.
18549 So do some bit twiddling on the value we've got in AH to come
18550 up with an appropriate set of condition codes. */
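/* After fnstsw the status flags C0, C2 and C3 sit in bits 0, 2 and 6
of AH (masks 0x01, 0x04 and 0x40, so 0x45 tests all three); after
sahf they become CF, PF and ZF respectively. */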
18551
18552 intcmp_mode = CCNOmode;
18553 switch (code)
18554 {
18555 case GT:
18556 case UNGT:
18557 if (code == GT || !TARGET_IEEE_FP)
18558 {
18559 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18560 code = EQ;
18561 }
18562 else
18563 {
18564 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18565 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18566 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18567 intcmp_mode = CCmode;
18568 code = GEU;
18569 }
18570 break;
18571 case LT:
18572 case UNLT:
18573 if (code == LT && TARGET_IEEE_FP)
18574 {
18575 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18576 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18577 intcmp_mode = CCmode;
18578 code = EQ;
18579 }
18580 else
18581 {
18582 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18583 code = NE;
18584 }
18585 break;
18586 case GE:
18587 case UNGE:
18588 if (code == GE || !TARGET_IEEE_FP)
18589 {
18590 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18591 code = EQ;
18592 }
18593 else
18594 {
18595 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18596 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18597 code = NE;
18598 }
18599 break;
18600 case LE:
18601 case UNLE:
18602 if (code == LE && TARGET_IEEE_FP)
18603 {
18604 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18605 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18606 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18607 intcmp_mode = CCmode;
18608 code = LTU;
18609 }
18610 else
18611 {
18612 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18613 code = NE;
18614 }
18615 break;
18616 case EQ:
18617 case UNEQ:
18618 if (code == EQ && TARGET_IEEE_FP)
18619 {
18620 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18621 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18622 intcmp_mode = CCmode;
18623 code = EQ;
18624 }
18625 else
18626 {
18627 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18628 code = NE;
18629 }
18630 break;
18631 case NE:
18632 case LTGT:
18633 if (code == NE && TARGET_IEEE_FP)
18634 {
18635 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18636 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18637 GEN_INT (0x40)));
18638 code = NE;
18639 }
18640 else
18641 {
18642 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18643 code = EQ;
18644 }
18645 break;
18646
18647 case UNORDERED:
18648 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18649 code = NE;
18650 break;
18651 case ORDERED:
18652 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18653 code = EQ;
18654 break;
18655
18656 default:
18657 gcc_unreachable ();
18658 }
18659 break;
18660
18661 default:
18662 gcc_unreachable ();
18663 }
18664
18665 /* Return the test that should be put into the flags user, i.e.
18666 the bcc, scc, or cmov instruction. */
18667 return gen_rtx_fmt_ee (code, VOIDmode,
18668 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18669 const0_rtx);
18670 }
18671
18672 static rtx
18673 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18674 {
18675 rtx ret;
18676
18677 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18678 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18679
18680 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18681 {
18682 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18683 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18684 }
18685 else
18686 ret = ix86_expand_int_compare (code, op0, op1);
18687
18688 return ret;
18689 }
18690
18691 void
18692 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18693 {
18694 enum machine_mode mode = GET_MODE (op0);
18695 rtx tmp;
18696
18697 switch (mode)
18698 {
18699 case SFmode:
18700 case DFmode:
18701 case XFmode:
18702 case QImode:
18703 case HImode:
18704 case SImode:
18705 simple:
18706 tmp = ix86_expand_compare (code, op0, op1);
18707 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18708 gen_rtx_LABEL_REF (VOIDmode, label),
18709 pc_rtx);
18710 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18711 return;
18712
18713 case DImode:
18714 if (TARGET_64BIT)
18715 goto simple;
18716 case TImode:
18717 /* Expand DImode branch into multiple compare+branch. */
18718 {
18719 rtx lo[2], hi[2], label2;
18720 enum rtx_code code1, code2, code3;
18721 enum machine_mode submode;
18722
18723 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18724 {
18725 tmp = op0, op0 = op1, op1 = tmp;
18726 code = swap_condition (code);
18727 }
18728
18729 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18730 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18731
18732 submode = mode == DImode ? SImode : DImode;
18733
18734 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18735 avoid two branches. This costs one extra insn, so disable when
18736 optimizing for size. */
18737
18738 if ((code == EQ || code == NE)
18739 && (!optimize_insn_for_size_p ()
18740 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18741 {
18742 rtx xor0, xor1;
18743
18744 xor1 = hi[0];
18745 if (hi[1] != const0_rtx)
18746 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18747 NULL_RTX, 0, OPTAB_WIDEN);
18748
18749 xor0 = lo[0];
18750 if (lo[1] != const0_rtx)
18751 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18752 NULL_RTX, 0, OPTAB_WIDEN);
18753
18754 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18755 NULL_RTX, 0, OPTAB_WIDEN);
18756
18757 ix86_expand_branch (code, tmp, const0_rtx, label);
18758 return;
18759 }
18760
18761 /* Otherwise, if we are doing a less-than or greater-or-equal-than
18762 comparison, op1 is a constant and its low word is zero, then we can just
18763 examine the high word. Similarly for a low word of -1 and
18764 less-or-equal-than or greater-than. */
18765
18766 if (CONST_INT_P (hi[1]))
18767 switch (code)
18768 {
18769 case LT: case LTU: case GE: case GEU:
18770 if (lo[1] == const0_rtx)
18771 {
18772 ix86_expand_branch (code, hi[0], hi[1], label);
18773 return;
18774 }
18775 break;
18776 case LE: case LEU: case GT: case GTU:
18777 if (lo[1] == constm1_rtx)
18778 {
18779 ix86_expand_branch (code, hi[0], hi[1], label);
18780 return;
18781 }
18782 break;
18783 default:
18784 break;
18785 }
18786
18787 /* Otherwise, we need two or three jumps. */
18788
18789 label2 = gen_label_rtx ();
18790
18791 code1 = code;
18792 code2 = swap_condition (code);
18793 code3 = unsigned_condition (code);
18794
18795 switch (code)
18796 {
18797 case LT: case GT: case LTU: case GTU:
18798 break;
18799
18800 case LE: code1 = LT; code2 = GT; break;
18801 case GE: code1 = GT; code2 = LT; break;
18802 case LEU: code1 = LTU; code2 = GTU; break;
18803 case GEU: code1 = GTU; code2 = LTU; break;
18804
18805 case EQ: code1 = UNKNOWN; code2 = NE; break;
18806 case NE: code2 = UNKNOWN; break;
18807
18808 default:
18809 gcc_unreachable ();
18810 }
18811
18812 /*
18813 * a < b =>
18814 * if (hi(a) < hi(b)) goto true;
18815 * if (hi(a) > hi(b)) goto false;
18816 * if (lo(a) < lo(b)) goto true;
18817 * false:
18818 */
18819
18820 if (code1 != UNKNOWN)
18821 ix86_expand_branch (code1, hi[0], hi[1], label);
18822 if (code2 != UNKNOWN)
18823 ix86_expand_branch (code2, hi[0], hi[1], label2);
18824
18825 ix86_expand_branch (code3, lo[0], lo[1], label);
18826
18827 if (code2 != UNKNOWN)
18828 emit_label (label2);
18829 return;
18830 }
18831
18832 default:
18833 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18834 goto simple;
18835 }
18836 }
18837
18838 /* Split a branch based on a floating-point condition. */
18839 void
18840 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18841 rtx target1, rtx target2, rtx tmp, rtx pushed)
18842 {
18843 rtx condition;
18844 rtx i;
18845
18846 if (target2 != pc_rtx)
18847 {
18848 rtx tmp = target2;
18849 code = reverse_condition_maybe_unordered (code);
18850 target2 = target1;
18851 target1 = tmp;
18852 }
18853
18854 condition = ix86_expand_fp_compare (code, op1, op2,
18855 tmp);
18856
18857 /* Remove pushed operand from stack. */
18858 if (pushed)
18859 ix86_free_from_memory (GET_MODE (pushed));
18860
18861 i = emit_jump_insn (gen_rtx_SET
18862 (VOIDmode, pc_rtx,
18863 gen_rtx_IF_THEN_ELSE (VOIDmode,
18864 condition, target1, target2)));
18865 if (split_branch_probability >= 0)
18866 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18867 }
18868
18869 void
18870 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18871 {
18872 rtx ret;
18873
18874 gcc_assert (GET_MODE (dest) == QImode);
18875
18876 ret = ix86_expand_compare (code, op0, op1);
18877 PUT_MODE (ret, QImode);
18878 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18879 }
18880
18881 /* Expand a comparison setting or clearing the carry flag. Return true when
18882 successful, and set *POP to the comparison operation. */
18883 static bool
18884 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18885 {
18886 enum machine_mode mode =
18887 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18888
18889 /* Do not handle double-mode compares that go through a special path. */
18890 if (mode == (TARGET_64BIT ? TImode : DImode))
18891 return false;
18892
18893 if (SCALAR_FLOAT_MODE_P (mode))
18894 {
18895 rtx compare_op, compare_seq;
18896
18897 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18898
18899 /* Shortcut: the following common codes never translate
18900 into carry-flag compares. */
18901 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18902 || code == ORDERED || code == UNORDERED)
18903 return false;
18904
18905 /* These comparisons require the zero flag; swap the operands so they won't. */
18906 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18907 && !TARGET_IEEE_FP)
18908 {
18909 rtx tmp = op0;
18910 op0 = op1;
18911 op1 = tmp;
18912 code = swap_condition (code);
18913 }
18914
18915 /* Try to expand the comparison and verify that we end up with
18916 a carry-flag-based comparison. This fails only when we decide
18917 to expand the comparison using arithmetic, which is not a
18918 common scenario. */
18919 start_sequence ();
18920 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18921 compare_seq = get_insns ();
18922 end_sequence ();
18923
18924 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18925 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18926 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18927 else
18928 code = GET_CODE (compare_op);
18929
18930 if (code != LTU && code != GEU)
18931 return false;
18932
18933 emit_insn (compare_seq);
18934 *pop = compare_op;
18935 return true;
18936 }
18937
18938 if (!INTEGRAL_MODE_P (mode))
18939 return false;
18940
18941 switch (code)
18942 {
18943 case LTU:
18944 case GEU:
18945 break;
18946
18947 /* Convert a==0 into (unsigned)a<1. */
18948 case EQ:
18949 case NE:
18950 if (op1 != const0_rtx)
18951 return false;
18952 op1 = const1_rtx;
18953 code = (code == EQ ? LTU : GEU);
18954 break;
18955
18956 /* Convert a>b into b<a or a>=b+1. */
18957 case GTU:
18958 case LEU:
18959 if (CONST_INT_P (op1))
18960 {
18961 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18962 /* Bail out on overflow. We could still swap the operands, but that
18963 would force loading of the constant into a register. */
18964 if (op1 == const0_rtx
18965 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18966 return false;
18967 code = (code == GTU ? GEU : LTU);
18968 }
18969 else
18970 {
18971 rtx tmp = op1;
18972 op1 = op0;
18973 op0 = tmp;
18974 code = (code == GTU ? LTU : GEU);
18975 }
18976 break;
18977
18978 /* Convert a>=0 into (unsigned)a<0x80000000. */
18979 case LT:
18980 case GE:
18981 if (mode == DImode || op1 != const0_rtx)
18982 return false;
18983 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18984 code = (code == LT ? GEU : LTU);
18985 break;
18986 case LE:
18987 case GT:
18988 if (mode == DImode || op1 != constm1_rtx)
18989 return false;
18990 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18991 code = (code == LE ? GEU : LTU);
18992 break;
18993
18994 default:
18995 return false;
18996 }
18997 /* Swapping operands may cause a constant to appear as the first operand. */
18998 if (!nonimmediate_operand (op0, VOIDmode))
18999 {
19000 if (!can_create_pseudo_p ())
19001 return false;
19002 op0 = force_reg (mode, op0);
19003 }
19004 *pop = ix86_expand_compare (code, op0, op1);
19005 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19006 return true;
19007 }
19008
19009 bool
19010 ix86_expand_int_movcc (rtx operands[])
19011 {
19012 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19013 rtx compare_seq, compare_op;
19014 enum machine_mode mode = GET_MODE (operands[0]);
19015 bool sign_bit_compare_p = false;
19016 rtx op0 = XEXP (operands[1], 0);
19017 rtx op1 = XEXP (operands[1], 1);
19018
19019 if (GET_MODE (op0) == TImode
19020 || (GET_MODE (op0) == DImode
19021 && !TARGET_64BIT))
19022 return false;
19023
19024 start_sequence ();
19025 compare_op = ix86_expand_compare (code, op0, op1);
19026 compare_seq = get_insns ();
19027 end_sequence ();
19028
19029 compare_code = GET_CODE (compare_op);
19030
19031 if ((op1 == const0_rtx && (code == GE || code == LT))
19032 || (op1 == constm1_rtx && (code == GT || code == LE)))
19033 sign_bit_compare_p = true;
19034
19035 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19036 HImode insns, we'd be swallowed in word prefix ops. */
19037
19038 if ((mode != HImode || TARGET_FAST_PREFIX)
19039 && (mode != (TARGET_64BIT ? TImode : DImode))
19040 && CONST_INT_P (operands[2])
19041 && CONST_INT_P (operands[3]))
19042 {
19043 rtx out = operands[0];
19044 HOST_WIDE_INT ct = INTVAL (operands[2]);
19045 HOST_WIDE_INT cf = INTVAL (operands[3]);
19046 HOST_WIDE_INT diff;
19047
19048 diff = ct - cf;
19049 /* Sign-bit compares are better done using shifts than by using
19050 sbb. */
19051 if (sign_bit_compare_p
19052 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19053 {
19054 /* Detect overlap between destination and compare sources. */
19055 rtx tmp = out;
19056
19057 if (!sign_bit_compare_p)
19058 {
19059 rtx flags;
19060 bool fpcmp = false;
19061
19062 compare_code = GET_CODE (compare_op);
19063
19064 flags = XEXP (compare_op, 0);
19065
19066 if (GET_MODE (flags) == CCFPmode
19067 || GET_MODE (flags) == CCFPUmode)
19068 {
19069 fpcmp = true;
19070 compare_code
19071 = ix86_fp_compare_code_to_integer (compare_code);
19072 }
19073
19074 /* To simplify the rest of the code, restrict to the GEU case. */
19075 if (compare_code == LTU)
19076 {
19077 HOST_WIDE_INT tmp = ct;
19078 ct = cf;
19079 cf = tmp;
19080 compare_code = reverse_condition (compare_code);
19081 code = reverse_condition (code);
19082 }
19083 else
19084 {
19085 if (fpcmp)
19086 PUT_CODE (compare_op,
19087 reverse_condition_maybe_unordered
19088 (GET_CODE (compare_op)));
19089 else
19090 PUT_CODE (compare_op,
19091 reverse_condition (GET_CODE (compare_op)));
19092 }
19093 diff = ct - cf;
19094
19095 if (reg_overlap_mentioned_p (out, op0)
19096 || reg_overlap_mentioned_p (out, op1))
19097 tmp = gen_reg_rtx (mode);
19098
19099 if (mode == DImode)
19100 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19101 else
19102 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19103 flags, compare_op));
19104 }
19105 else
19106 {
19107 if (code == GT || code == GE)
19108 code = reverse_condition (code);
19109 else
19110 {
19111 HOST_WIDE_INT tmp = ct;
19112 ct = cf;
19113 cf = tmp;
19114 diff = ct - cf;
19115 }
19116 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19117 }
19118
19119 if (diff == 1)
19120 {
19121 /*
19122 * cmpl op0,op1
19123 * sbbl dest,dest
19124 * [addl dest, ct]
19125 *
19126 * Size 5 - 8.
19127 */
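/* TMP is 0 or all ones here, so adding CT yields either CT or
CT - 1, i.e. CF (since diff == ct - cf == 1). */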
19128 if (ct)
19129 tmp = expand_simple_binop (mode, PLUS,
19130 tmp, GEN_INT (ct),
19131 copy_rtx (tmp), 1, OPTAB_DIRECT);
19132 }
19133 else if (cf == -1)
19134 {
19135 /*
19136 * cmpl op0,op1
19137 * sbbl dest,dest
19138 * orl $ct, dest
19139 *
19140 * Size 8.
19141 */
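/* OR-ing CT into the 0/-1 value yields either CT or -1 == CF. */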
19142 tmp = expand_simple_binop (mode, IOR,
19143 tmp, GEN_INT (ct),
19144 copy_rtx (tmp), 1, OPTAB_DIRECT);
19145 }
19146 else if (diff == -1 && ct)
19147 {
19148 /*
19149 * cmpl op0,op1
19150 * sbbl dest,dest
19151 * notl dest
19152 * [addl dest, cf]
19153 *
19154 * Size 8 - 11.
19155 */
19156 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19157 if (cf)
19158 tmp = expand_simple_binop (mode, PLUS,
19159 copy_rtx (tmp), GEN_INT (cf),
19160 copy_rtx (tmp), 1, OPTAB_DIRECT);
19161 }
19162 else
19163 {
19164 /*
19165 * cmpl op0,op1
19166 * sbbl dest,dest
19167 * [notl dest]
19168 * andl cf - ct, dest
19169 * [addl dest, ct]
19170 *
19171 * Size 8 - 11.
19172 */
19173
19174 if (cf == 0)
19175 {
19176 cf = ct;
19177 ct = 0;
19178 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19179 }
19180
19181 tmp = expand_simple_binop (mode, AND,
19182 copy_rtx (tmp),
19183 gen_int_mode (cf - ct, mode),
19184 copy_rtx (tmp), 1, OPTAB_DIRECT);
19185 if (ct)
19186 tmp = expand_simple_binop (mode, PLUS,
19187 copy_rtx (tmp), GEN_INT (ct),
19188 copy_rtx (tmp), 1, OPTAB_DIRECT);
19189 }
19190
19191 if (!rtx_equal_p (tmp, out))
19192 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19193
19194 return true;
19195 }
19196
19197 if (diff < 0)
19198 {
19199 enum machine_mode cmp_mode = GET_MODE (op0);
19200
19201 HOST_WIDE_INT tmp;
19202 tmp = ct, ct = cf, cf = tmp;
19203 diff = -diff;
19204
19205 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19206 {
19207 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19208
19209 /* We may be reversing an unordered compare to a normal compare, which
19210 is not valid in general (we may convert a non-trapping condition
19211 to a trapping one); however, on i386 we currently emit all
19212 comparisons unordered. */
19213 compare_code = reverse_condition_maybe_unordered (compare_code);
19214 code = reverse_condition_maybe_unordered (code);
19215 }
19216 else
19217 {
19218 compare_code = reverse_condition (compare_code);
19219 code = reverse_condition (code);
19220 }
19221 }
19222
19223 compare_code = UNKNOWN;
19224 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19225 && CONST_INT_P (op1))
19226 {
19227 if (op1 == const0_rtx
19228 && (code == LT || code == GE))
19229 compare_code = code;
19230 else if (op1 == constm1_rtx)
19231 {
19232 if (code == LE)
19233 compare_code = LT;
19234 else if (code == GT)
19235 compare_code = GE;
19236 }
19237 }
19238
19239 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19240 if (compare_code != UNKNOWN
19241 && GET_MODE (op0) == GET_MODE (out)
19242 && (cf == -1 || ct == -1))
19243 {
19244 /* If the lea code below could be used, only optimize
19245 if it results in a 2-insn sequence. */
19246
19247 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19248 || diff == 3 || diff == 5 || diff == 9)
19249 || (compare_code == LT && ct == -1)
19250 || (compare_code == GE && cf == -1))
19251 {
19252 /*
19253 * notl op1 (if necessary)
19254 * sarl $31, op1
19255 * orl cf, op1
19256 */
19257 if (ct != -1)
19258 {
19259 cf = ct;
19260 ct = -1;
19261 code = reverse_condition (code);
19262 }
19263
19264 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19265
19266 out = expand_simple_binop (mode, IOR,
19267 out, GEN_INT (cf),
19268 out, 1, OPTAB_DIRECT);
19269 if (out != operands[0])
19270 emit_move_insn (operands[0], out);
19271
19272 return true;
19273 }
19274 }
19275
19276
19277 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19278 || diff == 3 || diff == 5 || diff == 9)
19279 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19280 && (mode != DImode
19281 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19282 {
19283 /*
19284 * xorl dest,dest
19285 * cmpl op1,op2
19286 * setcc dest
19287 * lea cf(dest*(ct-cf)),dest
19288 *
19289 * Size 14.
19290 *
19291 * This also catches the degenerate setcc-only case.
19292 */
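/* OUT is 0 or 1 after the setcc, and DIFF is one of 1, 2, 3, 4, 5, 8
or 9, so OUT * DIFF + CF fits in a single lea and selects between
CF and CT. */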
19293
19294 rtx tmp;
19295 int nops;
19296
19297 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19298
19299 nops = 0;
19300 /* On x86_64 the lea instruction operates on Pmode, so we need
19301 to get the arithmetic done in the proper mode to match. */
19302 if (diff == 1)
19303 tmp = copy_rtx (out);
19304 else
19305 {
19306 rtx out1;
19307 out1 = copy_rtx (out);
19308 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19309 nops++;
19310 if (diff & 1)
19311 {
19312 tmp = gen_rtx_PLUS (mode, tmp, out1);
19313 nops++;
19314 }
19315 }
19316 if (cf != 0)
19317 {
19318 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19319 nops++;
19320 }
19321 if (!rtx_equal_p (tmp, out))
19322 {
19323 if (nops == 1)
19324 out = force_operand (tmp, copy_rtx (out));
19325 else
19326 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19327 }
19328 if (!rtx_equal_p (out, operands[0]))
19329 emit_move_insn (operands[0], copy_rtx (out));
19330
19331 return true;
19332 }
19333
19334 /*
19335 * General case: Jumpful:
19336 * xorl dest,dest cmpl op1, op2
19337 * cmpl op1, op2 movl ct, dest
19338 * setcc dest jcc 1f
19339 * decl dest movl cf, dest
19340 * andl (cf-ct),dest 1:
19341 * addl ct,dest
19342 *
19343 * Size 20. Size 14.
19344 *
19345 * This is reasonably steep, but branch mispredict costs are
19346 * high on modern cpus, so consider failing only if optimizing
19347 * for space.
19348 */
19349
19350 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19351 && BRANCH_COST (optimize_insn_for_speed_p (),
19352 false) >= 2)
19353 {
19354 if (cf == 0)
19355 {
19356 enum machine_mode cmp_mode = GET_MODE (op0);
19357
19358 cf = ct;
19359 ct = 0;
19360
19361 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19362 {
19363 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19364
19365 /* We may be reversing an unordered compare to a normal compare,
19366 which is not valid in general (we may convert a non-trapping
19367 condition to a trapping one); however, on i386 we currently
19368 emit all comparisons unordered. */
19369 code = reverse_condition_maybe_unordered (code);
19370 }
19371 else
19372 {
19373 code = reverse_condition (code);
19374 if (compare_code != UNKNOWN)
19375 compare_code = reverse_condition (compare_code);
19376 }
19377 }
19378
19379 if (compare_code != UNKNOWN)
19380 {
19381 /* notl op1 (if needed)
19382 sarl $31, op1
19383 andl (cf-ct), op1
19384 addl ct, op1
19385
19386 For x < 0 (resp. x <= -1) there will be no notl,
19387 so if possible swap the constants to get rid of the
19388 complement.
19389 True/false will be -1/0 while code below (store flag
19390 followed by decrement) is 0/-1, so the constants need
19391 to be exchanged once more. */
19392
19393 if (compare_code == GE || !cf)
19394 {
19395 code = reverse_condition (code);
19396 compare_code = LT;
19397 }
19398 else
19399 {
19400 HOST_WIDE_INT tmp = cf;
19401 cf = ct;
19402 ct = tmp;
19403 }
19404
19405 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19406 }
19407 else
19408 {
19409 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19410
19411 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19412 constm1_rtx,
19413 copy_rtx (out), 1, OPTAB_DIRECT);
19414 }
19415
19416 out = expand_simple_binop (mode, AND, copy_rtx (out),
19417 gen_int_mode (cf - ct, mode),
19418 copy_rtx (out), 1, OPTAB_DIRECT);
19419 if (ct)
19420 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19421 copy_rtx (out), 1, OPTAB_DIRECT);
19422 if (!rtx_equal_p (out, operands[0]))
19423 emit_move_insn (operands[0], copy_rtx (out));
19424
19425 return true;
19426 }
19427 }
19428
19429 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19430 {
19431 /* Try a few more things with specific constants and a variable. */
19432
19433 optab op;
19434 rtx var, orig_out, out, tmp;
19435
19436 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19437 return false;
19438
19439 /* If one of the two operands is an interesting constant, load a
19440 constant with the above and mask it in with a logical operation. */
19441
19442 if (CONST_INT_P (operands[2]))
19443 {
19444 var = operands[3];
19445 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19446 operands[3] = constm1_rtx, op = and_optab;
19447 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19448 operands[3] = const0_rtx, op = ior_optab;
19449 else
19450 return false;
19451 }
19452 else if (CONST_INT_P (operands[3]))
19453 {
19454 var = operands[2];
19455 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19456 operands[2] = constm1_rtx, op = and_optab;
19457 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19458 operands[2] = const0_rtx, op = ior_optab;
19459 else
19460 return false;
19461 }
19462 else
19463 return false;
19464
19465 orig_out = operands[0];
19466 tmp = gen_reg_rtx (mode);
19467 operands[0] = tmp;
19468
19469 /* Recurse to get the constant loaded. */
19470 if (ix86_expand_int_movcc (operands) == 0)
19471 return false;
19472
19473 /* Mask in the interesting variable. */
19474 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19475 OPTAB_WIDEN);
19476 if (!rtx_equal_p (out, orig_out))
19477 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19478
19479 return true;
19480 }
19481
19482 /*
19483 * For comparison with above,
19484 *
19485 * movl cf,dest
19486 * movl ct,tmp
19487 * cmpl op1,op2
19488 * cmovcc tmp,dest
19489 *
19490 * Size 15.
19491 */
19492
19493 if (! nonimmediate_operand (operands[2], mode))
19494 operands[2] = force_reg (mode, operands[2]);
19495 if (! nonimmediate_operand (operands[3], mode))
19496 operands[3] = force_reg (mode, operands[3]);
19497
19498 if (! register_operand (operands[2], VOIDmode)
19499 && (mode == QImode
19500 || ! register_operand (operands[3], VOIDmode)))
19501 operands[2] = force_reg (mode, operands[2]);
19502
19503 if (mode == QImode
19504 && ! register_operand (operands[3], VOIDmode))
19505 operands[3] = force_reg (mode, operands[3]);
19506
19507 emit_insn (compare_seq);
19508 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19509 gen_rtx_IF_THEN_ELSE (mode,
19510 compare_op, operands[2],
19511 operands[3])));
19512 return true;
19513 }
19514
19515 /* Swap, force into registers, or otherwise massage the two operands
19516 to an sse comparison with a mask result. Thus we differ a bit from
19517 ix86_prepare_fp_compare_args which expects to produce a flags result.
19518
19519 The DEST operand exists to help determine whether to commute commutative
19520 operators. The POP0/POP1 operands are updated in place. The new
19521 comparison code is returned, or UNKNOWN if not implementable. */
19522
19523 static enum rtx_code
19524 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19525 rtx *pop0, rtx *pop1)
19526 {
19527 rtx tmp;
19528
19529 switch (code)
19530 {
19531 case LTGT:
19532 case UNEQ:
19533 /* AVX supports all the needed comparisons. */
19534 if (TARGET_AVX)
19535 break;
19536 /* We have no LTGT as an operator. We could implement it with
19537 NE & ORDERED, but this requires an extra temporary. It's
19538 not clear that it's worth it. */
19539 return UNKNOWN;
19540
19541 case LT:
19542 case LE:
19543 case UNGT:
19544 case UNGE:
19545 /* These are supported directly. */
19546 break;
19547
19548 case EQ:
19549 case NE:
19550 case UNORDERED:
19551 case ORDERED:
19552 /* AVX has 3-operand comparisons; no need to swap anything. */
19553 if (TARGET_AVX)
19554 break;
19555 /* For commutative operators, try to canonicalize the destination
19556 operand to be first in the comparison - this helps reload to
19557 avoid extra moves. */
19558 if (!dest || !rtx_equal_p (dest, *pop1))
19559 break;
19560 /* FALLTHRU */
19561
19562 case GE:
19563 case GT:
19564 case UNLE:
19565 case UNLT:
19566 /* These are not supported directly before AVX, and furthermore
19567 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19568 comparison operands to transform into something that is
19569 supported. */
19570 tmp = *pop0;
19571 *pop0 = *pop1;
19572 *pop1 = tmp;
19573 code = swap_condition (code);
19574 break;
19575
19576 default:
19577 gcc_unreachable ();
19578 }
19579
19580 return code;
19581 }
19582
19583 /* Detect conditional moves that exactly match min/max operational
19584 semantics. Note that this is IEEE safe, as long as we don't
19585 interchange the operands.
19586
19587 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19588 and TRUE if the operation is successful and instructions are emitted. */
19589
19590 static bool
19591 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19592 rtx cmp_op1, rtx if_true, rtx if_false)
19593 {
19594 enum machine_mode mode;
19595 bool is_min;
19596 rtx tmp;
19597
19598 if (code == LT)
19599 ;
19600 else if (code == UNGE)
19601 {
19602 tmp = if_true;
19603 if_true = if_false;
19604 if_false = tmp;
19605 }
19606 else
19607 return false;
19608
19609 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19610 is_min = true;
19611 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19612 is_min = false;
19613 else
19614 return false;
19615
19616 mode = GET_MODE (dest);
19617
19618 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19619 but MODE may be a vector mode and thus not appropriate. */
19620 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19621 {
19622 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19623 rtvec v;
19624
19625 if_true = force_reg (mode, if_true);
19626 v = gen_rtvec (2, if_true, if_false);
19627 tmp = gen_rtx_UNSPEC (mode, v, u);
19628 }
19629 else
19630 {
19631 code = is_min ? SMIN : SMAX;
19632 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19633 }
19634
19635 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19636 return true;
19637 }
19638
19639 /* Expand an sse vector comparison. Return the register with the result. */
19640
19641 static rtx
19642 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19643 rtx op_true, rtx op_false)
19644 {
19645 enum machine_mode mode = GET_MODE (dest);
19646 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19647 rtx x;
19648
19649 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19650 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19651 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19652
19653 if (optimize
19654 || reg_overlap_mentioned_p (dest, op_true)
19655 || reg_overlap_mentioned_p (dest, op_false))
19656 dest = gen_reg_rtx (mode);
19657
19658 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19659 if (cmp_mode != mode)
19660 {
19661 x = force_reg (cmp_mode, x);
19662 convert_move (dest, x, false);
19663 }
19664 else
19665 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19666
19667 return dest;
19668 }
19669
19670 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19671 operations. This is used for both scalar and vector conditional moves. */
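/* The generic fallback at the bottom computes
DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE); the earlier cases handle
arms that are all-zeros or all-ones, and targets where a blend
instruction (SSE4.1/AVX blendv*, AVX2 vpblendvb, XOP vpcmov) can do
the select directly. */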
19672
19673 static void
19674 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19675 {
19676 enum machine_mode mode = GET_MODE (dest);
19677 rtx t2, t3, x;
19678
19679 if (vector_all_ones_operand (op_true, mode)
19680 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19681 {
19682 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19683 }
19684 else if (op_false == CONST0_RTX (mode))
19685 {
19686 op_true = force_reg (mode, op_true);
19687 x = gen_rtx_AND (mode, cmp, op_true);
19688 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19689 }
19690 else if (op_true == CONST0_RTX (mode))
19691 {
19692 op_false = force_reg (mode, op_false);
19693 x = gen_rtx_NOT (mode, cmp);
19694 x = gen_rtx_AND (mode, x, op_false);
19695 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19696 }
19697 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19698 {
19699 op_false = force_reg (mode, op_false);
19700 x = gen_rtx_IOR (mode, cmp, op_false);
19701 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19702 }
19703 else if (TARGET_XOP)
19704 {
19705 op_true = force_reg (mode, op_true);
19706
19707 if (!nonimmediate_operand (op_false, mode))
19708 op_false = force_reg (mode, op_false);
19709
19710 emit_insn (gen_rtx_SET (mode, dest,
19711 gen_rtx_IF_THEN_ELSE (mode, cmp,
19712 op_true,
19713 op_false)));
19714 }
19715 else
19716 {
19717 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19718
19719 if (!nonimmediate_operand (op_true, mode))
19720 op_true = force_reg (mode, op_true);
19721
19722 op_false = force_reg (mode, op_false);
19723
19724 switch (mode)
19725 {
19726 case V4SFmode:
19727 if (TARGET_SSE4_1)
19728 gen = gen_sse4_1_blendvps;
19729 break;
19730 case V2DFmode:
19731 if (TARGET_SSE4_1)
19732 gen = gen_sse4_1_blendvpd;
19733 break;
19734 case V16QImode:
19735 case V8HImode:
19736 case V4SImode:
19737 case V2DImode:
19738 if (TARGET_SSE4_1)
19739 {
19740 gen = gen_sse4_1_pblendvb;
19741 dest = gen_lowpart (V16QImode, dest);
19742 op_false = gen_lowpart (V16QImode, op_false);
19743 op_true = gen_lowpart (V16QImode, op_true);
19744 cmp = gen_lowpart (V16QImode, cmp);
19745 }
19746 break;
19747 case V8SFmode:
19748 if (TARGET_AVX)
19749 gen = gen_avx_blendvps256;
19750 break;
19751 case V4DFmode:
19752 if (TARGET_AVX)
19753 gen = gen_avx_blendvpd256;
19754 break;
19755 case V32QImode:
19756 case V16HImode:
19757 case V8SImode:
19758 case V4DImode:
19759 if (TARGET_AVX2)
19760 {
19761 gen = gen_avx2_pblendvb;
19762 dest = gen_lowpart (V32QImode, dest);
19763 op_false = gen_lowpart (V32QImode, op_false);
19764 op_true = gen_lowpart (V32QImode, op_true);
19765 cmp = gen_lowpart (V32QImode, cmp);
19766 }
19767 break;
19768 default:
19769 break;
19770 }
19771
19772 if (gen != NULL)
19773 emit_insn (gen (dest, op_false, op_true, cmp));
19774 else
19775 {
19776 op_true = force_reg (mode, op_true);
19777
19778 t2 = gen_reg_rtx (mode);
19779 if (optimize)
19780 t3 = gen_reg_rtx (mode);
19781 else
19782 t3 = dest;
19783
19784 x = gen_rtx_AND (mode, op_true, cmp);
19785 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19786
19787 x = gen_rtx_NOT (mode, cmp);
19788 x = gen_rtx_AND (mode, x, op_false);
19789 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19790
19791 x = gen_rtx_IOR (mode, t3, t2);
19792 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19793 }
19794 }
19795 }
19796
19797 /* Expand a floating-point conditional move. Return true if successful. */
19798
19799 bool
19800 ix86_expand_fp_movcc (rtx operands[])
19801 {
19802 enum machine_mode mode = GET_MODE (operands[0]);
19803 enum rtx_code code = GET_CODE (operands[1]);
19804 rtx tmp, compare_op;
19805 rtx op0 = XEXP (operands[1], 0);
19806 rtx op1 = XEXP (operands[1], 1);
19807
19808 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19809 {
19810 enum machine_mode cmode;
19811
19812 /* Since we have no cmove for SSE registers, don't force bad register
19813 allocation just to gain access to it. Deny movcc when the
19814 comparison mode doesn't match the move mode. */
19815 cmode = GET_MODE (op0);
19816 if (cmode == VOIDmode)
19817 cmode = GET_MODE (op1);
19818 if (cmode != mode)
19819 return false;
19820
19821 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19822 if (code == UNKNOWN)
19823 return false;
19824
19825 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19826 operands[2], operands[3]))
19827 return true;
19828
19829 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19830 operands[2], operands[3]);
19831 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19832 return true;
19833 }
19834
19835 /* The floating point conditional move instructions don't directly
19836 support conditions resulting from a signed integer comparison. */
19837
19838 compare_op = ix86_expand_compare (code, op0, op1);
19839 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19840 {
19841 tmp = gen_reg_rtx (QImode);
19842 ix86_expand_setcc (tmp, code, op0, op1);
19843
19844 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19845 }
19846
19847 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19848 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19849 operands[2], operands[3])));
19850
19851 return true;
19852 }
19853
19854 /* Expand a floating-point vector conditional move; a vcond operation
19855 rather than a movcc operation. */
19856
19857 bool
19858 ix86_expand_fp_vcond (rtx operands[])
19859 {
19860 enum rtx_code code = GET_CODE (operands[3]);
19861 rtx cmp;
19862
19863 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19864 &operands[4], &operands[5]);
19865 if (code == UNKNOWN)
19866 {
19867 rtx temp;
19868 switch (GET_CODE (operands[3]))
19869 {
19870 case LTGT:
19871 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19872 operands[5], operands[0], operands[0]);
19873 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19874 operands[5], operands[1], operands[2]);
19875 code = AND;
19876 break;
19877 case UNEQ:
19878 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19879 operands[5], operands[0], operands[0]);
19880 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19881 operands[5], operands[1], operands[2]);
19882 code = IOR;
19883 break;
19884 default:
19885 gcc_unreachable ();
19886 }
19887 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19888 OPTAB_DIRECT);
19889 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19890 return true;
19891 }
19892
19893 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19894 operands[5], operands[1], operands[2]))
19895 return true;
19896
19897 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19898 operands[1], operands[2]);
19899 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19900 return true;
19901 }
19902
19903 /* Expand a signed/unsigned integral vector conditional move. */
19904
19905 bool
19906 ix86_expand_int_vcond (rtx operands[])
19907 {
19908 enum machine_mode data_mode = GET_MODE (operands[0]);
19909 enum machine_mode mode = GET_MODE (operands[4]);
19910 enum rtx_code code = GET_CODE (operands[3]);
19911 bool negate = false;
19912 rtx x, cop0, cop1;
19913
19914 cop0 = operands[4];
19915 cop1 = operands[5];
19916
19917 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19918 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
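/* For LT the arm that must be zero is operands[2] and NEGOP below is
operands[1]; for GE it is the other way round. */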
19919 if ((code == LT || code == GE)
19920 && data_mode == mode
19921 && cop1 == CONST0_RTX (mode)
19922 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19923 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19924 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19925 && (GET_MODE_SIZE (data_mode) == 16
19926 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19927 {
19928 rtx negop = operands[2 - (code == LT)];
19929 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19930 if (negop == CONST1_RTX (data_mode))
19931 {
19932 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19933 operands[0], 1, OPTAB_DIRECT);
19934 if (res != operands[0])
19935 emit_move_insn (operands[0], res);
19936 return true;
19937 }
19938 else if (GET_MODE_INNER (data_mode) != DImode
19939 && vector_all_ones_operand (negop, data_mode))
19940 {
19941 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19942 operands[0], 0, OPTAB_DIRECT);
19943 if (res != operands[0])
19944 emit_move_insn (operands[0], res);
19945 return true;
19946 }
19947 }
19948
19949 if (!nonimmediate_operand (cop1, mode))
19950 cop1 = force_reg (mode, cop1);
19951 if (!general_operand (operands[1], data_mode))
19952 operands[1] = force_reg (data_mode, operands[1]);
19953 if (!general_operand (operands[2], data_mode))
19954 operands[2] = force_reg (data_mode, operands[2]);
19955
19956 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19957 if (TARGET_XOP
19958 && (mode == V16QImode || mode == V8HImode
19959 || mode == V4SImode || mode == V2DImode))
19960 ;
19961 else
19962 {
19963 /* Canonicalize the comparison to EQ, GT, GTU. */
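/* NE, LE and LEU become the reversed comparison with the movcc arms
swapped (NEGATE); LT and LTU swap the comparison operands; GE and
GEU do both. */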
19964 switch (code)
19965 {
19966 case EQ:
19967 case GT:
19968 case GTU:
19969 break;
19970
19971 case NE:
19972 case LE:
19973 case LEU:
19974 code = reverse_condition (code);
19975 negate = true;
19976 break;
19977
19978 case GE:
19979 case GEU:
19980 code = reverse_condition (code);
19981 negate = true;
19982 /* FALLTHRU */
19983
19984 case LT:
19985 case LTU:
19986 code = swap_condition (code);
19987 x = cop0, cop0 = cop1, cop1 = x;
19988 break;
19989
19990 default:
19991 gcc_unreachable ();
19992 }
19993
19994 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19995 if (mode == V2DImode)
19996 {
19997 switch (code)
19998 {
19999 case EQ:
20000 /* SSE4.1 supports EQ. */
20001 if (!TARGET_SSE4_1)
20002 return false;
20003 break;
20004
20005 case GT:
20006 case GTU:
20007 /* SSE4.2 supports GT/GTU. */
20008 if (!TARGET_SSE4_2)
20009 return false;
20010 break;
20011
20012 default:
20013 gcc_unreachable ();
20014 }
20015 }
20016
20017 /* Unsigned parallel compare is not supported by the hardware.
20018 Play some tricks to turn this into a signed comparison
20019 against 0. */
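/* For full-width elements we bias both operands by the sign-bit mask
(a >u b  <=>  (a - 0x80..0) >s (b - 0x80..0)); for QI/HI elements we
use unsigned saturating subtraction and compare the result with zero. */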
20020 if (code == GTU)
20021 {
20022 cop0 = force_reg (mode, cop0);
20023
20024 switch (mode)
20025 {
20026 case V8SImode:
20027 case V4DImode:
20028 case V4SImode:
20029 case V2DImode:
20030 {
20031 rtx t1, t2, mask;
20032 rtx (*gen_sub3) (rtx, rtx, rtx);
20033
20034 switch (mode)
20035 {
20036 case V8SImode: gen_sub3 = gen_subv8si3; break;
20037 case V4DImode: gen_sub3 = gen_subv4di3; break;
20038 case V4SImode: gen_sub3 = gen_subv4si3; break;
20039 case V2DImode: gen_sub3 = gen_subv2di3; break;
20040 default:
20041 gcc_unreachable ();
20042 }
20043 /* Subtract (-(INT MAX) - 1) from both operands to make
20044 them signed. */
20045 mask = ix86_build_signbit_mask (mode, true, false);
20046 t1 = gen_reg_rtx (mode);
20047 emit_insn (gen_sub3 (t1, cop0, mask));
20048
20049 t2 = gen_reg_rtx (mode);
20050 emit_insn (gen_sub3 (t2, cop1, mask));
20051
20052 cop0 = t1;
20053 cop1 = t2;
20054 code = GT;
20055 }
20056 break;
20057
20058 case V32QImode:
20059 case V16HImode:
20060 case V16QImode:
20061 case V8HImode:
20062 /* Perform a parallel unsigned saturating subtraction. */
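/* a -us b is zero exactly when a <=u b, so compare the result
against zero with EQ and flip NEGATE to recover GTU. */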
20063 x = gen_reg_rtx (mode);
20064 emit_insn (gen_rtx_SET (VOIDmode, x,
20065 gen_rtx_US_MINUS (mode, cop0, cop1)));
20066
20067 cop0 = x;
20068 cop1 = CONST0_RTX (mode);
20069 code = EQ;
20070 negate = !negate;
20071 break;
20072
20073 default:
20074 gcc_unreachable ();
20075 }
20076 }
20077 }
20078
20079 /* Allow the comparison to be done in one mode, but the movcc to
20080 happen in another mode. */
20081 if (data_mode == mode)
20082 {
20083 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20084 operands[1+negate], operands[2-negate]);
20085 }
20086 else
20087 {
20088 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20089 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20090 code, cop0, cop1,
20091 operands[1+negate], operands[2-negate]);
20092 x = gen_lowpart (data_mode, x);
20093 }
20094
20095 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20096 operands[2-negate]);
20097 return true;
20098 }
20099
20100 /* Expand a variable vector permutation. */
20101
20102 void
20103 ix86_expand_vec_perm (rtx operands[])
20104 {
20105 rtx target = operands[0];
20106 rtx op0 = operands[1];
20107 rtx op1 = operands[2];
20108 rtx mask = operands[3];
20109 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20110 enum machine_mode mode = GET_MODE (op0);
20111 enum machine_mode maskmode = GET_MODE (mask);
20112 int w, e, i;
20113 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20114
20115 /* Number of elements in the vector. */
20116 w = GET_MODE_NUNITS (mode);
20117 e = GET_MODE_UNIT_SIZE (mode);
20118 gcc_assert (w <= 32);
20119
20120 if (TARGET_AVX2)
20121 {
20122 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20123 {
20124 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20125 	     a constant shuffle operand.  With a tiny bit of effort we can
20126 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
20127 	     unfortunate but there's no avoiding it.
20128 	     Similarly, for V16HImode we don't have instructions for variable
20129 	     shuffling, while for V32QImode we can, after preparing suitable
20130 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
20131
20132 if (mode == V16HImode)
20133 {
20134 maskmode = mode = V32QImode;
20135 w = 32;
20136 e = 1;
20137 }
20138 else
20139 {
20140 maskmode = mode = V8SImode;
20141 w = 8;
20142 e = 4;
20143 }
20144 t1 = gen_reg_rtx (maskmode);
20145
20146 /* Replicate the low bits of the V4DImode mask into V8SImode:
20147 mask = { A B C D }
20148 t1 = { A A B B C C D D }. */
20149 for (i = 0; i < w / 2; ++i)
20150 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20151 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20152 vt = force_reg (maskmode, vt);
20153 mask = gen_lowpart (maskmode, mask);
20154 if (maskmode == V8SImode)
20155 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20156 else
20157 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20158
20159 	  /* Multiply the shuffle indices by two.  */
20160 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20161 OPTAB_DIRECT);
20162
20163 	  /* Add one to the odd shuffle indices:
20164 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20165 for (i = 0; i < w / 2; ++i)
20166 {
20167 vec[i * 2] = const0_rtx;
20168 vec[i * 2 + 1] = const1_rtx;
20169 }
20170 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20171 vt = force_const_mem (maskmode, vt);
20172 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20173 OPTAB_DIRECT);
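	  /* E.g. for the V4DImode case a mask of { 1 3 0 2 } has now become
	     the V8SImode index vector { 2 3 6 7 0 1 4 5 }, selecting both
	     32-bit halves of each requested 64-bit element.  */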
20174
20175 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20176 operands[3] = mask = t1;
20177 target = gen_lowpart (mode, target);
20178 op0 = gen_lowpart (mode, op0);
20179 op1 = gen_lowpart (mode, op1);
20180 }
20181
20182 switch (mode)
20183 {
20184 case V8SImode:
20185 /* The VPERMD and VPERMPS instructions already properly ignore
20186 the high bits of the shuffle elements. No need for us to
20187 perform an AND ourselves. */
20188 if (one_operand_shuffle)
20189 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20190 else
20191 {
20192 t1 = gen_reg_rtx (V8SImode);
20193 t2 = gen_reg_rtx (V8SImode);
20194 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20195 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20196 goto merge_two;
20197 }
20198 return;
20199
20200 case V8SFmode:
20201 mask = gen_lowpart (V8SFmode, mask);
20202 if (one_operand_shuffle)
20203 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20204 else
20205 {
20206 t1 = gen_reg_rtx (V8SFmode);
20207 t2 = gen_reg_rtx (V8SFmode);
20208 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20209 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20210 goto merge_two;
20211 }
20212 return;
20213
20214 case V4SImode:
20215 /* By combining the two 128-bit input vectors into one 256-bit
20216 input vector, we can use VPERMD and VPERMPS for the full
20217 two-operand shuffle. */
20218 t1 = gen_reg_rtx (V8SImode);
20219 t2 = gen_reg_rtx (V8SImode);
20220 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20221 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20222 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20223 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20224 return;
20225
20226 case V4SFmode:
20227 t1 = gen_reg_rtx (V8SFmode);
20228 t2 = gen_reg_rtx (V8SImode);
20229 mask = gen_lowpart (V4SImode, mask);
20230 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20231 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20232 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20233 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20234 return;
20235
20236 case V32QImode:
20237 t1 = gen_reg_rtx (V32QImode);
20238 t2 = gen_reg_rtx (V32QImode);
20239 t3 = gen_reg_rtx (V32QImode);
20240 vt2 = GEN_INT (128);
20241 for (i = 0; i < 32; i++)
20242 vec[i] = vt2;
20243 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20244 vt = force_reg (V32QImode, vt);
20245 for (i = 0; i < 32; i++)
20246 vec[i] = i < 16 ? vt2 : const0_rtx;
20247 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20248 vt2 = force_reg (V32QImode, vt2);
20249 /* From mask create two adjusted masks, which contain the same
20250 bits as mask in the low 7 bits of each vector element.
20251 The first mask will have the most significant bit clear
20252 if it requests element from the same 128-bit lane
20253 and MSB set if it requests element from the other 128-bit lane.
20254 The second mask will have the opposite values of the MSB,
20255 and additionally will have its 128-bit lanes swapped.
20256 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20257 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20258 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20259 	     stands for the other 12 bytes.  */
20260 	  /* Bit 4 of each mask element tells whether the element comes from
20261 	     the same lane or the other lane, so shift it up by 3 into the
	     MSB position.  */
20262 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20263 gen_lowpart (V4DImode, mask),
20264 GEN_INT (3)));
20265 /* Clear MSB bits from the mask just in case it had them set. */
20266 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20267 /* After this t1 will have MSB set for elements from other lane. */
20268 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20269 /* Clear bits other than MSB. */
20270 emit_insn (gen_andv32qi3 (t1, t1, vt));
20271 /* Or in the lower bits from mask into t3. */
20272 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20273 /* And invert MSB bits in t1, so MSB is set for elements from the same
20274 lane. */
20275 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20276 /* Swap 128-bit lanes in t3. */
20277 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20278 gen_lowpart (V4DImode, t3),
20279 const2_rtx, GEN_INT (3),
20280 const0_rtx, const1_rtx));
20281 /* And or in the lower bits from mask into t1. */
20282 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20283 if (one_operand_shuffle)
20284 {
20285 /* Each of these shuffles will put 0s in places where
20286 element from the other 128-bit lane is needed, otherwise
20287 will shuffle in the requested value. */
20288 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20289 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20290 /* For t3 the 128-bit lanes are swapped again. */
20291 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20292 gen_lowpart (V4DImode, t3),
20293 const2_rtx, GEN_INT (3),
20294 const0_rtx, const1_rtx));
20295 	      /* ORing both together yields the result.  */
20296 emit_insn (gen_iorv32qi3 (target, t1, t3));
20297 return;
20298 }
20299
20300 t4 = gen_reg_rtx (V32QImode);
20301 	  /* Similar to the one_operand_shuffle code above,
20302 	     but repeated for each of the two operands.  The merge_two:
20303 	     code below will merge the two results together.  */
20304 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20305 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20306 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20307 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20308 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20309 gen_lowpart (V4DImode, t4),
20310 const2_rtx, GEN_INT (3),
20311 const0_rtx, const1_rtx));
20312 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20313 gen_lowpart (V4DImode, t3),
20314 const2_rtx, GEN_INT (3),
20315 const0_rtx, const1_rtx));
20316 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20317 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20318 t1 = t4;
20319 t2 = t3;
20320 goto merge_two;
20321
20322 default:
20323 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20324 break;
20325 }
20326 }
20327
20328 if (TARGET_XOP)
20329 {
20330 /* The XOP VPPERM insn supports three inputs. By ignoring the
20331 one_operand_shuffle special case, we avoid creating another
20332 set of constant vectors in memory. */
20333 one_operand_shuffle = false;
20334
20335 /* mask = mask & {2*w-1, ...} */
20336 vt = GEN_INT (2*w - 1);
20337 }
20338 else
20339 {
20340 /* mask = mask & {w-1, ...} */
20341 vt = GEN_INT (w - 1);
20342 }
20343
20344 for (i = 0; i < w; i++)
20345 vec[i] = vt;
20346 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20347 mask = expand_simple_binop (maskmode, AND, mask, vt,
20348 NULL_RTX, 0, OPTAB_DIRECT);
20349
20350 /* For non-QImode operations, convert the word permutation control
20351 into a byte permutation control. */
20352 if (mode != V16QImode)
20353 {
20354 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20355 GEN_INT (exact_log2 (e)),
20356 NULL_RTX, 0, OPTAB_DIRECT);
20357
20358 /* Convert mask to vector of chars. */
20359 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20360
20361 /* Replicate each of the input bytes into byte positions:
20362 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20363 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20364 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20365 for (i = 0; i < 16; ++i)
20366 vec[i] = GEN_INT (i/e * e);
20367 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20368 vt = force_const_mem (V16QImode, vt);
20369 if (TARGET_XOP)
20370 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20371 else
20372 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20373
20374       /* Convert it into the byte positions by doing
20375 	 mask = mask + {0,1,...,e-1, 0,1,...,e-1, ...}.  */
20376 for (i = 0; i < 16; ++i)
20377 vec[i] = GEN_INT (i % e);
20378 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20379 vt = force_const_mem (V16QImode, vt);
20380 emit_insn (gen_addv16qi3 (mask, mask, vt));
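      /* E.g. for V4SImode a mask element of 2 has now been expanded into
	 the byte indices { 8, 9, 10, 11 }.  */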
20381 }
20382
20383 /* The actual shuffle operations all operate on V16QImode. */
20384 op0 = gen_lowpart (V16QImode, op0);
20385 op1 = gen_lowpart (V16QImode, op1);
20386 target = gen_lowpart (V16QImode, target);
20387
20388 if (TARGET_XOP)
20389 {
20390 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20391 }
20392 else if (one_operand_shuffle)
20393 {
20394 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20395 }
20396 else
20397 {
20398 rtx xops[6];
20399 bool ok;
20400
20401 /* Shuffle the two input vectors independently. */
20402 t1 = gen_reg_rtx (V16QImode);
20403 t2 = gen_reg_rtx (V16QImode);
20404 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20405 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20406
20407 merge_two:
20408 /* Then merge them together. The key is whether any given control
20409 element contained a bit set that indicates the second word. */
20410 mask = operands[3];
20411 vt = GEN_INT (w);
20412 if (maskmode == V2DImode && !TARGET_SSE4_1)
20413 {
20414 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20415 	     more shuffle to convert the V2DI input mask into a V4SI
20416 	     input mask.  At that point the masking that expand_int_vcond
20417 	     performs will work as desired.  */
20418 rtx t3 = gen_reg_rtx (V4SImode);
20419 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20420 const0_rtx, const0_rtx,
20421 const2_rtx, const2_rtx));
20422 mask = t3;
20423 maskmode = V4SImode;
20424 e = w = 4;
20425 }
20426
20427 for (i = 0; i < w; i++)
20428 vec[i] = vt;
20429 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20430 vt = force_reg (maskmode, vt);
20431 mask = expand_simple_binop (maskmode, AND, mask, vt,
20432 NULL_RTX, 0, OPTAB_DIRECT);
20433
20434 xops[0] = gen_lowpart (mode, operands[0]);
20435 xops[1] = gen_lowpart (mode, t2);
20436 xops[2] = gen_lowpart (mode, t1);
20437 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20438 xops[4] = mask;
20439 xops[5] = vt;
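      /* The vcond below selects, per element, T2 (the shuffle of OP1) where
	 the control element had its "second operand" bit (W) set, and T1
	 (the shuffle of OP0) where that bit was clear.  */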
20440 ok = ix86_expand_int_vcond (xops);
20441 gcc_assert (ok);
20442 }
20443 }
20444
20445 /* Unpack SRC into DEST, widening to the next wider integer vector type.
20446    UNSIGNED_P is true if we should do zero extension, else sign extension.
20447    HIGH_P is true if we want the N/2 high elements, else the low elements.  */
20448
20449 void
20450 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20451 {
20452 enum machine_mode imode = GET_MODE (src);
20453 rtx tmp;
20454
20455 if (TARGET_SSE4_1)
20456 {
20457 rtx (*unpack)(rtx, rtx);
20458 rtx (*extract)(rtx, rtx) = NULL;
20459 enum machine_mode halfmode = BLKmode;
20460
20461 switch (imode)
20462 {
20463 case V32QImode:
20464 if (unsigned_p)
20465 unpack = gen_avx2_zero_extendv16qiv16hi2;
20466 else
20467 unpack = gen_avx2_sign_extendv16qiv16hi2;
20468 halfmode = V16QImode;
20469 extract
20470 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20471 break;
20472 case V16HImode:
20473 if (unsigned_p)
20474 unpack = gen_avx2_zero_extendv8hiv8si2;
20475 else
20476 unpack = gen_avx2_sign_extendv8hiv8si2;
20477 halfmode = V8HImode;
20478 extract
20479 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20480 break;
20481 case V8SImode:
20482 if (unsigned_p)
20483 unpack = gen_avx2_zero_extendv4siv4di2;
20484 else
20485 unpack = gen_avx2_sign_extendv4siv4di2;
20486 halfmode = V4SImode;
20487 extract
20488 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20489 break;
20490 case V16QImode:
20491 if (unsigned_p)
20492 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20493 else
20494 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20495 break;
20496 case V8HImode:
20497 if (unsigned_p)
20498 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20499 else
20500 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20501 break;
20502 case V4SImode:
20503 if (unsigned_p)
20504 unpack = gen_sse4_1_zero_extendv2siv2di2;
20505 else
20506 unpack = gen_sse4_1_sign_extendv2siv2di2;
20507 break;
20508 default:
20509 gcc_unreachable ();
20510 }
20511
20512 if (GET_MODE_SIZE (imode) == 32)
20513 {
20514 tmp = gen_reg_rtx (halfmode);
20515 emit_insn (extract (tmp, src));
20516 }
20517 else if (high_p)
20518 {
20519 /* Shift higher 8 bytes to lower 8 bytes. */
20520 tmp = gen_reg_rtx (imode);
20521 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20522 gen_lowpart (V1TImode, src),
20523 GEN_INT (64)));
20524 }
20525 else
20526 tmp = src;
20527
20528 emit_insn (unpack (dest, tmp));
20529 }
20530 else
20531 {
20532 rtx (*unpack)(rtx, rtx, rtx);
20533
20534 switch (imode)
20535 {
20536 case V16QImode:
20537 if (high_p)
20538 unpack = gen_vec_interleave_highv16qi;
20539 else
20540 unpack = gen_vec_interleave_lowv16qi;
20541 break;
20542 case V8HImode:
20543 if (high_p)
20544 unpack = gen_vec_interleave_highv8hi;
20545 else
20546 unpack = gen_vec_interleave_lowv8hi;
20547 break;
20548 case V4SImode:
20549 if (high_p)
20550 unpack = gen_vec_interleave_highv4si;
20551 else
20552 unpack = gen_vec_interleave_lowv4si;
20553 break;
20554 default:
20555 gcc_unreachable ();
20556 }
20557
20558 if (unsigned_p)
20559 tmp = force_reg (imode, CONST0_RTX (imode));
20560 else
20561 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20562 src, pc_rtx, pc_rtx);
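      /* For the signed case TMP is (0 > SRC) per element, i.e. all-ones for
	 negative elements, so interleaving SRC with TMP produces the
	 sign-extended values.  */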
20563
20564 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20565 }
20566 }
20567
20568 /* Expand conditional increment or decrement using adc/sbb instructions.
20569 The default case using setcc followed by the conditional move can be
20570 done by generic code. */
20571 bool
20572 ix86_expand_int_addcc (rtx operands[])
20573 {
20574 enum rtx_code code = GET_CODE (operands[1]);
20575 rtx flags;
20576 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20577 rtx compare_op;
20578 rtx val = const0_rtx;
20579 bool fpcmp = false;
20580 enum machine_mode mode;
20581 rtx op0 = XEXP (operands[1], 0);
20582 rtx op1 = XEXP (operands[1], 1);
20583
20584 if (operands[3] != const1_rtx
20585 && operands[3] != constm1_rtx)
20586 return false;
20587 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20588 return false;
20589 code = GET_CODE (compare_op);
20590
20591 flags = XEXP (compare_op, 0);
20592
20593 if (GET_MODE (flags) == CCFPmode
20594 || GET_MODE (flags) == CCFPUmode)
20595 {
20596 fpcmp = true;
20597 code = ix86_fp_compare_code_to_integer (code);
20598 }
20599
20600 if (code != LTU)
20601 {
20602 val = constm1_rtx;
20603 if (fpcmp)
20604 PUT_CODE (compare_op,
20605 reverse_condition_maybe_unordered
20606 (GET_CODE (compare_op)));
20607 else
20608 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20609 }
20610
20611 mode = GET_MODE (operands[0]);
20612
20613 /* Construct either adc or sbb insn. */
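  /* E.g. for x = y + (a < b) with an unsigned compare, the carry flag holds
     (a < b), so the add path below emits adc: x = y + 0 + CF.  */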
20614 if ((code == LTU) == (operands[3] == constm1_rtx))
20615 {
20616 switch (mode)
20617 {
20618 case QImode:
20619 insn = gen_subqi3_carry;
20620 break;
20621 case HImode:
20622 insn = gen_subhi3_carry;
20623 break;
20624 case SImode:
20625 insn = gen_subsi3_carry;
20626 break;
20627 case DImode:
20628 insn = gen_subdi3_carry;
20629 break;
20630 default:
20631 gcc_unreachable ();
20632 }
20633 }
20634 else
20635 {
20636 switch (mode)
20637 {
20638 case QImode:
20639 insn = gen_addqi3_carry;
20640 break;
20641 case HImode:
20642 insn = gen_addhi3_carry;
20643 break;
20644 case SImode:
20645 insn = gen_addsi3_carry;
20646 break;
20647 case DImode:
20648 insn = gen_adddi3_carry;
20649 break;
20650 default:
20651 gcc_unreachable ();
20652 }
20653 }
20654 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20655
20656 return true;
20657 }
20658
20659
20660 /* Split OPERAND into half-mode parts, stored in PARTS.  Similar to
20661    split_double_mode, but works for floating point parameters and
20662    non-offsettable memories.  For pushes, it returns just stack offsets;
20663    the values will be saved in the right order.  At most four parts are generated.  */
20664
20665 static int
20666 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20667 {
20668 int size;
20669
20670 if (!TARGET_64BIT)
20671 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20672 else
20673 size = (GET_MODE_SIZE (mode) + 4) / 8;
20674
20675 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20676 gcc_assert (size >= 2 && size <= 4);
20677
20678   /* Optimize constant pool reference to immediates.  This is used by fp
20679      moves, which force all constants to memory to allow combining.  */
20680 if (MEM_P (operand) && MEM_READONLY_P (operand))
20681 {
20682 rtx tmp = maybe_get_pool_constant (operand);
20683 if (tmp)
20684 operand = tmp;
20685 }
20686
20687 if (MEM_P (operand) && !offsettable_memref_p (operand))
20688 {
20689       /* The only non-offsettable memories we handle are pushes.  */
20690 int ok = push_operand (operand, VOIDmode);
20691
20692 gcc_assert (ok);
20693
20694 operand = copy_rtx (operand);
20695 PUT_MODE (operand, word_mode);
20696 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20697 return size;
20698 }
20699
20700 if (GET_CODE (operand) == CONST_VECTOR)
20701 {
20702 enum machine_mode imode = int_mode_for_mode (mode);
20703 /* Caution: if we looked through a constant pool memory above,
20704 the operand may actually have a different mode now. That's
20705 ok, since we want to pun this all the way back to an integer. */
20706 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20707 gcc_assert (operand != NULL);
20708 mode = imode;
20709 }
20710
20711 if (!TARGET_64BIT)
20712 {
20713 if (mode == DImode)
20714 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20715 else
20716 {
20717 int i;
20718
20719 if (REG_P (operand))
20720 {
20721 gcc_assert (reload_completed);
20722 for (i = 0; i < size; i++)
20723 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20724 }
20725 else if (offsettable_memref_p (operand))
20726 {
20727 operand = adjust_address (operand, SImode, 0);
20728 parts[0] = operand;
20729 for (i = 1; i < size; i++)
20730 parts[i] = adjust_address (operand, SImode, 4 * i);
20731 }
20732 else if (GET_CODE (operand) == CONST_DOUBLE)
20733 {
20734 REAL_VALUE_TYPE r;
20735 long l[4];
20736
20737 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20738 switch (mode)
20739 {
20740 case TFmode:
20741 real_to_target (l, &r, mode);
20742 parts[3] = gen_int_mode (l[3], SImode);
20743 parts[2] = gen_int_mode (l[2], SImode);
20744 break;
20745 case XFmode:
20746 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20747 parts[2] = gen_int_mode (l[2], SImode);
20748 break;
20749 case DFmode:
20750 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20751 break;
20752 default:
20753 gcc_unreachable ();
20754 }
20755 parts[1] = gen_int_mode (l[1], SImode);
20756 parts[0] = gen_int_mode (l[0], SImode);
20757 }
20758 else
20759 gcc_unreachable ();
20760 }
20761 }
20762 else
20763 {
20764 if (mode == TImode)
20765 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20766 if (mode == XFmode || mode == TFmode)
20767 {
20768 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20769 if (REG_P (operand))
20770 {
20771 gcc_assert (reload_completed);
20772 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20773 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20774 }
20775 else if (offsettable_memref_p (operand))
20776 {
20777 operand = adjust_address (operand, DImode, 0);
20778 parts[0] = operand;
20779 parts[1] = adjust_address (operand, upper_mode, 8);
20780 }
20781 else if (GET_CODE (operand) == CONST_DOUBLE)
20782 {
20783 REAL_VALUE_TYPE r;
20784 long l[4];
20785
20786 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20787 real_to_target (l, &r, mode);
20788
20789 /* Do not use shift by 32 to avoid warning on 32bit systems. */
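	      /* ((HOST_WIDE_INT) 2 << 31) - 1 is the low 32-bit mask
		 0xffffffff, and shifting by 31 and then by 1 is equivalent
		 to shifting by 32.  */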
20790 if (HOST_BITS_PER_WIDE_INT >= 64)
20791 parts[0]
20792 = gen_int_mode
20793 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20794 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20795 DImode);
20796 else
20797 parts[0] = immed_double_const (l[0], l[1], DImode);
20798
20799 if (upper_mode == SImode)
20800 parts[1] = gen_int_mode (l[2], SImode);
20801 else if (HOST_BITS_PER_WIDE_INT >= 64)
20802 parts[1]
20803 = gen_int_mode
20804 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20805 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20806 DImode);
20807 else
20808 parts[1] = immed_double_const (l[2], l[3], DImode);
20809 }
20810 else
20811 gcc_unreachable ();
20812 }
20813 }
20814
20815 return size;
20816 }
20817
20818 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
20819    all required insns are emitted directly.  Operands 2-5 receive the
20820    output (destination) parts in the correct order; operands 6-9 receive
20821    the corresponding input (source) parts.  */
20822
20823 void
20824 ix86_split_long_move (rtx operands[])
20825 {
20826 rtx part[2][4];
20827 int nparts, i, j;
20828 int push = 0;
20829 int collisions = 0;
20830 enum machine_mode mode = GET_MODE (operands[0]);
20831 bool collisionparts[4];
20832
20833   /* The DFmode expanders may ask us to move a double.
20834      For a 64-bit target this is a single move.  By hiding the fact
20835      here we simplify i386.md splitters.  */
20836 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20837 {
20838       /* Optimize constant pool reference to immediates.  This is used by
20839 	 fp moves, which force all constants to memory to allow combining.  */
20840
20841 if (MEM_P (operands[1])
20842 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20843 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20844 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20845 if (push_operand (operands[0], VOIDmode))
20846 {
20847 operands[0] = copy_rtx (operands[0]);
20848 PUT_MODE (operands[0], word_mode);
20849 }
20850 else
20851 operands[0] = gen_lowpart (DImode, operands[0]);
20852 operands[1] = gen_lowpart (DImode, operands[1]);
20853 emit_move_insn (operands[0], operands[1]);
20854 return;
20855 }
20856
20857   /* The only non-offsettable memory we handle is a push.  */
20858 if (push_operand (operands[0], VOIDmode))
20859 push = 1;
20860 else
20861 gcc_assert (!MEM_P (operands[0])
20862 || offsettable_memref_p (operands[0]));
20863
20864 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20865 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20866
20867 /* When emitting push, take care for source operands on the stack. */
20868 if (push && MEM_P (operands[1])
20869 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20870 {
20871 rtx src_base = XEXP (part[1][nparts - 1], 0);
20872
20873 /* Compensate for the stack decrement by 4. */
20874 if (!TARGET_64BIT && nparts == 3
20875 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20876 src_base = plus_constant (Pmode, src_base, 4);
20877
20878 /* src_base refers to the stack pointer and is
20879 automatically decreased by emitted push. */
20880 for (i = 0; i < nparts; i++)
20881 part[1][i] = change_address (part[1][i],
20882 GET_MODE (part[1][i]), src_base);
20883 }
20884
20885 /* We need to do copy in the right order in case an address register
20886 of the source overlaps the destination. */
20887 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20888 {
20889 rtx tmp;
20890
20891 for (i = 0; i < nparts; i++)
20892 {
20893 collisionparts[i]
20894 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20895 if (collisionparts[i])
20896 collisions++;
20897 }
20898
20899 /* Collision in the middle part can be handled by reordering. */
20900 if (collisions == 1 && nparts == 3 && collisionparts [1])
20901 {
20902 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20903 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20904 }
20905 else if (collisions == 1
20906 && nparts == 4
20907 && (collisionparts [1] || collisionparts [2]))
20908 {
20909 if (collisionparts [1])
20910 {
20911 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20912 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20913 }
20914 else
20915 {
20916 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20917 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20918 }
20919 }
20920
20921 /* If there are more collisions, we can't handle it by reordering.
20922 Do an lea to the last part and use only one colliding move. */
20923 else if (collisions > 1)
20924 {
20925 rtx base;
20926
20927 collisions = 1;
20928
20929 base = part[0][nparts - 1];
20930
20931 /* Handle the case when the last part isn't valid for lea.
20932 Happens in 64-bit mode storing the 12-byte XFmode. */
20933 if (GET_MODE (base) != Pmode)
20934 base = gen_rtx_REG (Pmode, REGNO (base));
20935
20936 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20937 part[1][0] = replace_equiv_address (part[1][0], base);
20938 for (i = 1; i < nparts; i++)
20939 {
20940 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20941 part[1][i] = replace_equiv_address (part[1][i], tmp);
20942 }
20943 }
20944 }
20945
20946 if (push)
20947 {
20948 if (!TARGET_64BIT)
20949 {
20950 if (nparts == 3)
20951 {
20952 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20953 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20954 stack_pointer_rtx, GEN_INT (-4)));
20955 emit_move_insn (part[0][2], part[1][2]);
20956 }
20957 else if (nparts == 4)
20958 {
20959 emit_move_insn (part[0][3], part[1][3]);
20960 emit_move_insn (part[0][2], part[1][2]);
20961 }
20962 }
20963 else
20964 {
20965 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
20966 	     a register, that is OK - we will just use the larger counterpart.
20967 	     We also retype memory - this comes from an attempt to avoid a REX
20968 	     prefix on moving the second half of a TFmode value.  */
20969 if (GET_MODE (part[1][1]) == SImode)
20970 {
20971 switch (GET_CODE (part[1][1]))
20972 {
20973 case MEM:
20974 part[1][1] = adjust_address (part[1][1], DImode, 0);
20975 break;
20976
20977 case REG:
20978 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20979 break;
20980
20981 default:
20982 gcc_unreachable ();
20983 }
20984
20985 if (GET_MODE (part[1][0]) == SImode)
20986 part[1][0] = part[1][1];
20987 }
20988 }
20989 emit_move_insn (part[0][1], part[1][1]);
20990 emit_move_insn (part[0][0], part[1][0]);
20991 return;
20992 }
20993
20994 /* Choose correct order to not overwrite the source before it is copied. */
20995 if ((REG_P (part[0][0])
20996 && REG_P (part[1][1])
20997 && (REGNO (part[0][0]) == REGNO (part[1][1])
20998 || (nparts == 3
20999 && REGNO (part[0][0]) == REGNO (part[1][2]))
21000 || (nparts == 4
21001 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21002 || (collisions > 0
21003 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21004 {
21005 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21006 {
21007 operands[2 + i] = part[0][j];
21008 operands[6 + i] = part[1][j];
21009 }
21010 }
21011 else
21012 {
21013 for (i = 0; i < nparts; i++)
21014 {
21015 operands[2 + i] = part[0][i];
21016 operands[6 + i] = part[1][i];
21017 }
21018 }
21019
21020 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21021 if (optimize_insn_for_size_p ())
21022 {
21023 for (j = 0; j < nparts - 1; j++)
21024 if (CONST_INT_P (operands[6 + j])
21025 && operands[6 + j] != const0_rtx
21026 && REG_P (operands[2 + j]))
21027 for (i = j; i < nparts - 1; i++)
21028 if (CONST_INT_P (operands[7 + i])
21029 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21030 operands[7 + i] = operands[2 + j];
21031 }
21032
21033 for (i = 0; i < nparts; i++)
21034 emit_move_insn (operands[2 + i], operands[6 + i]);
21035
21036 return;
21037 }
21038
21039 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21040 left shift by a constant, either using a single shift or
21041 a sequence of add instructions. */
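/* Adding a register to itself doubles it, so COUNT additions implement a
   left shift by COUNT when that is cheaper than a single shift insn (or
   when COUNT is 1).  */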
21042
21043 static void
21044 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21045 {
21046 rtx (*insn)(rtx, rtx, rtx);
21047
21048 if (count == 1
21049 || (count * ix86_cost->add <= ix86_cost->shift_const
21050 && !optimize_insn_for_size_p ()))
21051 {
21052 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21053 while (count-- > 0)
21054 emit_insn (insn (operand, operand, operand));
21055 }
21056 else
21057 {
21058 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21059 emit_insn (insn (operand, operand, GEN_INT (count)));
21060 }
21061 }
21062
21063 void
21064 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21065 {
21066 rtx (*gen_ashl3)(rtx, rtx, rtx);
21067 rtx (*gen_shld)(rtx, rtx, rtx);
21068 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21069
21070 rtx low[2], high[2];
21071 int count;
21072
21073 if (CONST_INT_P (operands[2]))
21074 {
21075 split_double_mode (mode, operands, 2, low, high);
21076 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21077
21078 if (count >= half_width)
21079 {
21080 emit_move_insn (high[0], low[1]);
21081 emit_move_insn (low[0], const0_rtx);
21082
21083 if (count > half_width)
21084 ix86_expand_ashl_const (high[0], count - half_width, mode);
21085 }
21086 else
21087 {
21088 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21089
21090 if (!rtx_equal_p (operands[0], operands[1]))
21091 emit_move_insn (operands[0], operands[1]);
21092
21093 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21094 ix86_expand_ashl_const (low[0], count, mode);
21095 }
21096 return;
21097 }
21098
21099 split_double_mode (mode, operands, 1, low, high);
21100
21101 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21102
21103 if (operands[1] == const1_rtx)
21104 {
21105       /* Assuming we've chosen QImode-capable registers, 1 << N
21106 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
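      /* I.e. the test below sets the low half to 1 when the half_width bit
	 of the shift count is clear and the high half to 1 when it is set;
	 both halves are then shifted left by the count at the end, where
	 the hardware shift uses only the count's low bits.  */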
21107 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21108 {
21109 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21110
21111 ix86_expand_clear (low[0]);
21112 ix86_expand_clear (high[0]);
21113 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21114
21115 d = gen_lowpart (QImode, low[0]);
21116 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21117 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21118 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21119
21120 d = gen_lowpart (QImode, high[0]);
21121 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21122 s = gen_rtx_NE (QImode, flags, const0_rtx);
21123 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21124 }
21125
21126 /* Otherwise, we can get the same results by manually performing
21127 a bit extract operation on bit 5/6, and then performing the two
21128 shifts. The two methods of getting 0/1 into low/high are exactly
21129 the same size. Avoiding the shift in the bit extract case helps
21130 pentium4 a bit; no one else seems to care much either way. */
21131 else
21132 {
21133 enum machine_mode half_mode;
21134 rtx (*gen_lshr3)(rtx, rtx, rtx);
21135 rtx (*gen_and3)(rtx, rtx, rtx);
21136 rtx (*gen_xor3)(rtx, rtx, rtx);
21137 HOST_WIDE_INT bits;
21138 rtx x;
21139
21140 if (mode == DImode)
21141 {
21142 half_mode = SImode;
21143 gen_lshr3 = gen_lshrsi3;
21144 gen_and3 = gen_andsi3;
21145 gen_xor3 = gen_xorsi3;
21146 bits = 5;
21147 }
21148 else
21149 {
21150 half_mode = DImode;
21151 gen_lshr3 = gen_lshrdi3;
21152 gen_and3 = gen_anddi3;
21153 gen_xor3 = gen_xordi3;
21154 bits = 6;
21155 }
21156
21157 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21158 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21159 else
21160 x = gen_lowpart (half_mode, operands[2]);
21161 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21162
21163 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21164 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21165 emit_move_insn (low[0], high[0]);
21166 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21167 }
21168
21169 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21170 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21171 return;
21172 }
21173
21174 if (operands[1] == constm1_rtx)
21175 {
21176 /* For -1 << N, we can avoid the shld instruction, because we
21177 know that we're shifting 0...31/63 ones into a -1. */
21178 emit_move_insn (low[0], constm1_rtx);
21179 if (optimize_insn_for_size_p ())
21180 emit_move_insn (high[0], low[0]);
21181 else
21182 emit_move_insn (high[0], constm1_rtx);
21183 }
21184 else
21185 {
21186 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21187
21188 if (!rtx_equal_p (operands[0], operands[1]))
21189 emit_move_insn (operands[0], operands[1]);
21190
21191 split_double_mode (mode, operands, 1, low, high);
21192 emit_insn (gen_shld (high[0], low[0], operands[2]));
21193 }
21194
21195 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21196
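  /* The shld/shift pair above uses only the count modulo half_width; the
     x86_shift*_adj sequences below fix up the two halves for counts of
     half_width or more, using a conditional move when one is available and
     a branch otherwise.  */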
21197 if (TARGET_CMOVE && scratch)
21198 {
21199 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21200 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21201
21202 ix86_expand_clear (scratch);
21203 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21204 }
21205 else
21206 {
21207 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21208 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21209
21210 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21211 }
21212 }
21213
21214 void
21215 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21216 {
21217 rtx (*gen_ashr3)(rtx, rtx, rtx)
21218 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21219 rtx (*gen_shrd)(rtx, rtx, rtx);
21220 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21221
21222 rtx low[2], high[2];
21223 int count;
21224
21225 if (CONST_INT_P (operands[2]))
21226 {
21227 split_double_mode (mode, operands, 2, low, high);
21228 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21229
21230 if (count == GET_MODE_BITSIZE (mode) - 1)
21231 {
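	  /* Shifting right arithmetically by all bits but one leaves only
	     the broadcast sign bit, so both halves of the result are either
	     0 or -1.  */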
21232 emit_move_insn (high[0], high[1]);
21233 emit_insn (gen_ashr3 (high[0], high[0],
21234 GEN_INT (half_width - 1)));
21235 emit_move_insn (low[0], high[0]);
21236
21237 }
21238 else if (count >= half_width)
21239 {
21240 emit_move_insn (low[0], high[1]);
21241 emit_move_insn (high[0], low[0]);
21242 emit_insn (gen_ashr3 (high[0], high[0],
21243 GEN_INT (half_width - 1)));
21244
21245 if (count > half_width)
21246 emit_insn (gen_ashr3 (low[0], low[0],
21247 GEN_INT (count - half_width)));
21248 }
21249 else
21250 {
21251 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21252
21253 if (!rtx_equal_p (operands[0], operands[1]))
21254 emit_move_insn (operands[0], operands[1]);
21255
21256 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21257 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21258 }
21259 }
21260 else
21261 {
21262 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21263
21264 if (!rtx_equal_p (operands[0], operands[1]))
21265 emit_move_insn (operands[0], operands[1]);
21266
21267 split_double_mode (mode, operands, 1, low, high);
21268
21269 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21270 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21271
21272 if (TARGET_CMOVE && scratch)
21273 {
21274 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21275 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21276
21277 emit_move_insn (scratch, high[0]);
21278 emit_insn (gen_ashr3 (scratch, scratch,
21279 GEN_INT (half_width - 1)));
21280 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21281 scratch));
21282 }
21283 else
21284 {
21285 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21286 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21287
21288 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21289 }
21290 }
21291 }
21292
21293 void
21294 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21295 {
21296 rtx (*gen_lshr3)(rtx, rtx, rtx)
21297 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21298 rtx (*gen_shrd)(rtx, rtx, rtx);
21299 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21300
21301 rtx low[2], high[2];
21302 int count;
21303
21304 if (CONST_INT_P (operands[2]))
21305 {
21306 split_double_mode (mode, operands, 2, low, high);
21307 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21308
21309 if (count >= half_width)
21310 {
21311 emit_move_insn (low[0], high[1]);
21312 ix86_expand_clear (high[0]);
21313
21314 if (count > half_width)
21315 emit_insn (gen_lshr3 (low[0], low[0],
21316 GEN_INT (count - half_width)));
21317 }
21318 else
21319 {
21320 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21321
21322 if (!rtx_equal_p (operands[0], operands[1]))
21323 emit_move_insn (operands[0], operands[1]);
21324
21325 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21326 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21327 }
21328 }
21329 else
21330 {
21331 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21332
21333 if (!rtx_equal_p (operands[0], operands[1]))
21334 emit_move_insn (operands[0], operands[1]);
21335
21336 split_double_mode (mode, operands, 1, low, high);
21337
21338 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21339 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21340
21341 if (TARGET_CMOVE && scratch)
21342 {
21343 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21344 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21345
21346 ix86_expand_clear (scratch);
21347 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21348 scratch));
21349 }
21350 else
21351 {
21352 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21353 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21354
21355 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21356 }
21357 }
21358 }
21359
21360 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21361 static void
21362 predict_jump (int prob)
21363 {
21364 rtx insn = get_last_insn ();
21365 gcc_assert (JUMP_P (insn));
21366 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21367 }
21368
21369 /* Helper function for the string operations below.  Test VARIABLE against
21370    the mask VALUE and emit a jump to the returned label, taken when
   (VARIABLE & VALUE) == 0.  */
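/* E.g. ix86_expand_aligntest (count, 4, true) emits a test of bit 2 of
   COUNT and a jump over the subsequently emitted code when that bit is
   clear; the caller emits the returned label after that code.  */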
21371 static rtx
21372 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21373 {
21374 rtx label = gen_label_rtx ();
21375 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21376 if (GET_MODE (variable) == DImode)
21377 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21378 else
21379 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21380 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21381 1, label);
21382 if (epilogue)
21383 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21384 else
21385 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21386 return label;
21387 }
21388
21389 /* Decrease COUNTREG by VALUE.  */
21390 static void
21391 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21392 {
21393 rtx (*gen_add)(rtx, rtx, rtx)
21394 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21395
21396 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21397 }
21398
21399 /* Zero extend possibly SImode EXP to Pmode register. */
21400 rtx
21401 ix86_zero_extend_to_Pmode (rtx exp)
21402 {
21403 if (GET_MODE (exp) != Pmode)
21404 exp = convert_to_mode (Pmode, exp, 1);
21405 return force_reg (Pmode, exp);
21406 }
21407
21408 /* Divide COUNTREG by SCALE. */
21409 static rtx
21410 scale_counter (rtx countreg, int scale)
21411 {
21412 rtx sc;
21413
21414 if (scale == 1)
21415 return countreg;
21416 if (CONST_INT_P (countreg))
21417 return GEN_INT (INTVAL (countreg) / scale);
21418 gcc_assert (REG_P (countreg));
21419
21420 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21421 GEN_INT (exact_log2 (scale)),
21422 NULL, 1, OPTAB_DIRECT);
21423 return sc;
21424 }
21425
21426 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21427 DImode for constant loop counts. */
21428
21429 static enum machine_mode
21430 counter_mode (rtx count_exp)
21431 {
21432 if (GET_MODE (count_exp) != VOIDmode)
21433 return GET_MODE (count_exp);
21434 if (!CONST_INT_P (count_exp))
21435 return Pmode;
21436 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21437 return DImode;
21438 return SImode;
21439 }
21440
21441 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21442    to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
21443    overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output
21444    an equivalent loop to set memory to VALUE (assumed to be in MODE).
21445
21446    The size is rounded down to a whole number of chunks moved at once.
21447    SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
21448
21449
21450 static void
21451 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21452 rtx destptr, rtx srcptr, rtx value,
21453 rtx count, enum machine_mode mode, int unroll,
21454 int expected_size)
21455 {
21456 rtx out_label, top_label, iter, tmp;
21457 enum machine_mode iter_mode = counter_mode (count);
21458 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21459 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21460 rtx size;
21461 rtx x_addr;
21462 rtx y_addr;
21463 int i;
21464
21465 top_label = gen_label_rtx ();
21466 out_label = gen_label_rtx ();
21467 iter = gen_reg_rtx (iter_mode);
21468
21469 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21470 NULL, 1, OPTAB_DIRECT);
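  /* E.g. for COUNT == 23 with SImode chunks unrolled 4 times, PIECE_SIZE is
     16 and SIZE becomes 16, so the loop below copies (or sets) 16 bytes and
     the remaining 7 bytes are not touched here.  */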
21471   /* The AND computing SIZE above and the compare against zero below
     should combine into a single insn.  */
21472 if (piece_size == const1_rtx)
21473 {
21474 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21475 true, out_label);
21476 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21477 }
21478 emit_move_insn (iter, const0_rtx);
21479
21480 emit_label (top_label);
21481
21482 tmp = convert_modes (Pmode, iter_mode, iter, true);
21483 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21484 destmem = change_address (destmem, mode, x_addr);
21485
21486 if (srcmem)
21487 {
21488 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21489 srcmem = change_address (srcmem, mode, y_addr);
21490
21491       /* When unrolling for chips that reorder memory reads and writes,
21492 	 we can save registers by using a single temporary.
21493 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
21494 if (!TARGET_64BIT && 0)
21495 {
21496 for (i = 0; i < unroll; i++)
21497 {
21498 if (i)
21499 {
21500 destmem =
21501 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21502 srcmem =
21503 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21504 }
21505 emit_move_insn (destmem, srcmem);
21506 }
21507 }
21508 else
21509 {
21510 rtx tmpreg[4];
21511 gcc_assert (unroll <= 4);
21512 for (i = 0; i < unroll; i++)
21513 {
21514 tmpreg[i] = gen_reg_rtx (mode);
21515 if (i)
21516 {
21517 srcmem =
21518 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21519 }
21520 emit_move_insn (tmpreg[i], srcmem);
21521 }
21522 for (i = 0; i < unroll; i++)
21523 {
21524 if (i)
21525 {
21526 destmem =
21527 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21528 }
21529 emit_move_insn (destmem, tmpreg[i]);
21530 }
21531 }
21532 }
21533 else
21534 for (i = 0; i < unroll; i++)
21535 {
21536 if (i)
21537 destmem =
21538 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21539 emit_move_insn (destmem, value);
21540 }
21541
21542 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21543 true, OPTAB_LIB_WIDEN);
21544 if (tmp != iter)
21545 emit_move_insn (iter, tmp);
21546
21547 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21548 true, top_label);
21549 if (expected_size != -1)
21550 {
21551 expected_size /= GET_MODE_SIZE (mode) * unroll;
21552 if (expected_size == 0)
21553 predict_jump (0);
21554 else if (expected_size > REG_BR_PROB_BASE)
21555 predict_jump (REG_BR_PROB_BASE - 1);
21556 else
21557 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21558 }
21559 else
21560 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21561 iter = ix86_zero_extend_to_Pmode (iter);
21562 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21563 true, OPTAB_LIB_WIDEN);
21564 if (tmp != destptr)
21565 emit_move_insn (destptr, tmp);
21566 if (srcptr)
21567 {
21568 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21569 true, OPTAB_LIB_WIDEN);
21570 if (tmp != srcptr)
21571 emit_move_insn (srcptr, tmp);
21572 }
21573 emit_label (out_label);
21574 }
21575
21576 /* Output "rep; mov" instruction.
21577    Arguments have the same meaning as for the previous function.  */
21578 static void
21579 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21580 rtx destptr, rtx srcptr,
21581 rtx count,
21582 enum machine_mode mode)
21583 {
21584 rtx destexp;
21585 rtx srcexp;
21586 rtx countreg;
21587 HOST_WIDE_INT rounded_count;
21588
21589   /* If the size is known and a multiple of 4, it is shorter to use SImode rep movs.  */
21590 if (mode == QImode && CONST_INT_P (count)
21591 && !(INTVAL (count) & 3))
21592 mode = SImode;
21593
21594 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21595 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21596 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21597 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21598 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
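  /* DESTEXP and SRCEXP computed below describe the final values of the
     destination and source pointers after the copy (pointer plus number of
     bytes moved); the rep_mov pattern uses them to express the pointer
     updates performed by rep movs.  */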
21599 if (mode != QImode)
21600 {
21601 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21602 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21603 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21604 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21605 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21606 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21607 }
21608 else
21609 {
21610 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21611 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21612 }
21613 if (CONST_INT_P (count))
21614 {
21615 rounded_count = (INTVAL (count)
21616 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21617 destmem = shallow_copy_rtx (destmem);
21618 srcmem = shallow_copy_rtx (srcmem);
21619 set_mem_size (destmem, rounded_count);
21620 set_mem_size (srcmem, rounded_count);
21621 }
21622 else
21623 {
21624 if (MEM_SIZE_KNOWN_P (destmem))
21625 clear_mem_size (destmem);
21626 if (MEM_SIZE_KNOWN_P (srcmem))
21627 clear_mem_size (srcmem);
21628 }
21629 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21630 destexp, srcexp));
21631 }
21632
21633 /* Output "rep; stos" instruction.
21634    Arguments have the same meaning as for the previous function.  */
21635 static void
21636 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21637 rtx count, enum machine_mode mode,
21638 rtx orig_value)
21639 {
21640 rtx destexp;
21641 rtx countreg;
21642 HOST_WIDE_INT rounded_count;
21643
21644 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21645 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21646 value = force_reg (mode, gen_lowpart (mode, value));
21647 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21648 if (mode != QImode)
21649 {
21650 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21651 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21652 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21653 }
21654 else
21655 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21656 if (orig_value == const0_rtx && CONST_INT_P (count))
21657 {
21658 rounded_count = (INTVAL (count)
21659 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21660 destmem = shallow_copy_rtx (destmem);
21661 set_mem_size (destmem, rounded_count);
21662 }
21663 else if (MEM_SIZE_KNOWN_P (destmem))
21664 clear_mem_size (destmem);
21665 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21666 }
21667
21668 static void
21669 emit_strmov (rtx destmem, rtx srcmem,
21670 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21671 {
21672 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21673 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21674 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21675 }
21676
21677 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21678 static void
21679 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21680 rtx destptr, rtx srcptr, rtx count, int max_size)
21681 {
21682 rtx src, dest;
21683 if (CONST_INT_P (count))
21684 {
21685 HOST_WIDE_INT countval = INTVAL (count);
21686 int offset = 0;
21687
21688 if ((countval & 0x10) && max_size > 16)
21689 {
21690 if (TARGET_64BIT)
21691 {
21692 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21693 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21694 }
21695 else
21696 gcc_unreachable ();
21697 offset += 16;
21698 }
21699 if ((countval & 0x08) && max_size > 8)
21700 {
21701 if (TARGET_64BIT)
21702 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21703 else
21704 {
21705 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21706 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21707 }
21708 offset += 8;
21709 }
21710 if ((countval & 0x04) && max_size > 4)
21711 {
21712 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21713 offset += 4;
21714 }
21715 if ((countval & 0x02) && max_size > 2)
21716 {
21717 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21718 offset += 2;
21719 }
21720 if ((countval & 0x01) && max_size > 1)
21721 {
21722 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21723 offset += 1;
21724 }
21725 return;
21726 }
21727 if (max_size > 8)
21728 {
21729 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21730 count, 1, OPTAB_DIRECT);
21731 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21732 count, QImode, 1, 4);
21733 return;
21734 }
21735
21736   /* When single-instruction stringops are available, we can cheaply advance
21737      the dest and src pointers.  Otherwise we save code size by maintaining an
21738      offset register (zero is readily available from the preceding rep
21739      operation) and using x86 addressing modes.  */
21740 if (TARGET_SINGLE_STRINGOP)
21741 {
21742 if (max_size > 4)
21743 {
21744 rtx label = ix86_expand_aligntest (count, 4, true);
21745 src = change_address (srcmem, SImode, srcptr);
21746 dest = change_address (destmem, SImode, destptr);
21747 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21748 emit_label (label);
21749 LABEL_NUSES (label) = 1;
21750 }
21751 if (max_size > 2)
21752 {
21753 rtx label = ix86_expand_aligntest (count, 2, true);
21754 src = change_address (srcmem, HImode, srcptr);
21755 dest = change_address (destmem, HImode, destptr);
21756 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21757 emit_label (label);
21758 LABEL_NUSES (label) = 1;
21759 }
21760 if (max_size > 1)
21761 {
21762 rtx label = ix86_expand_aligntest (count, 1, true);
21763 src = change_address (srcmem, QImode, srcptr);
21764 dest = change_address (destmem, QImode, destptr);
21765 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21766 emit_label (label);
21767 LABEL_NUSES (label) = 1;
21768 }
21769 }
21770 else
21771 {
21772 rtx offset = force_reg (Pmode, const0_rtx);
21773 rtx tmp;
21774
21775 if (max_size > 4)
21776 {
21777 rtx label = ix86_expand_aligntest (count, 4, true);
21778 src = change_address (srcmem, SImode, srcptr);
21779 dest = change_address (destmem, SImode, destptr);
21780 emit_move_insn (dest, src);
21781 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21782 true, OPTAB_LIB_WIDEN);
21783 if (tmp != offset)
21784 emit_move_insn (offset, tmp);
21785 emit_label (label);
21786 LABEL_NUSES (label) = 1;
21787 }
21788 if (max_size > 2)
21789 {
21790 rtx label = ix86_expand_aligntest (count, 2, true);
21791 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21792 src = change_address (srcmem, HImode, tmp);
21793 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21794 dest = change_address (destmem, HImode, tmp);
21795 emit_move_insn (dest, src);
21796 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21797 true, OPTAB_LIB_WIDEN);
21798 if (tmp != offset)
21799 emit_move_insn (offset, tmp);
21800 emit_label (label);
21801 LABEL_NUSES (label) = 1;
21802 }
21803 if (max_size > 1)
21804 {
21805 rtx label = ix86_expand_aligntest (count, 1, true);
21806 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21807 src = change_address (srcmem, QImode, tmp);
21808 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21809 dest = change_address (destmem, QImode, tmp);
21810 emit_move_insn (dest, src);
21811 emit_label (label);
21812 LABEL_NUSES (label) = 1;
21813 }
21814 }
21815 }
21816
21817 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21818 static void
21819 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21820 rtx count, int max_size)
21821 {
21822 count =
21823 expand_simple_binop (counter_mode (count), AND, count,
21824 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21825 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21826 gen_lowpart (QImode, value), count, QImode,
21827 1, max_size / 2);
21828 }
21829
21830 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21831 static void
21832 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21833 {
21834 rtx dest;
21835
21836 if (CONST_INT_P (count))
21837 {
21838 HOST_WIDE_INT countval = INTVAL (count);
21839 int offset = 0;
21840
21841 if ((countval & 0x10) && max_size > 16)
21842 {
21843 if (TARGET_64BIT)
21844 {
21845 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21846 emit_insn (gen_strset (destptr, dest, value));
21847 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21848 emit_insn (gen_strset (destptr, dest, value));
21849 }
21850 else
21851 gcc_unreachable ();
21852 offset += 16;
21853 }
21854 if ((countval & 0x08) && max_size > 8)
21855 {
21856 if (TARGET_64BIT)
21857 {
21858 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21859 emit_insn (gen_strset (destptr, dest, value));
21860 }
21861 else
21862 {
21863 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21864 emit_insn (gen_strset (destptr, dest, value));
21865 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21866 emit_insn (gen_strset (destptr, dest, value));
21867 }
21868 offset += 8;
21869 }
21870 if ((countval & 0x04) && max_size > 4)
21871 {
21872 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21873 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21874 offset += 4;
21875 }
21876 if ((countval & 0x02) && max_size > 2)
21877 {
21878 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21879 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21880 offset += 2;
21881 }
21882 if ((countval & 0x01) && max_size > 1)
21883 {
21884 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21885 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21886 offset += 1;
21887 }
21888 return;
21889 }
21890 if (max_size > 32)
21891 {
21892 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21893 return;
21894 }
21895 if (max_size > 16)
21896 {
21897 rtx label = ix86_expand_aligntest (count, 16, true);
21898 if (TARGET_64BIT)
21899 {
21900 dest = change_address (destmem, DImode, destptr);
21901 emit_insn (gen_strset (destptr, dest, value));
21902 emit_insn (gen_strset (destptr, dest, value));
21903 }
21904 else
21905 {
21906 dest = change_address (destmem, SImode, destptr);
21907 emit_insn (gen_strset (destptr, dest, value));
21908 emit_insn (gen_strset (destptr, dest, value));
21909 emit_insn (gen_strset (destptr, dest, value));
21910 emit_insn (gen_strset (destptr, dest, value));
21911 }
21912 emit_label (label);
21913 LABEL_NUSES (label) = 1;
21914 }
21915 if (max_size > 8)
21916 {
21917 rtx label = ix86_expand_aligntest (count, 8, true);
21918 if (TARGET_64BIT)
21919 {
21920 dest = change_address (destmem, DImode, destptr);
21921 emit_insn (gen_strset (destptr, dest, value));
21922 }
21923 else
21924 {
21925 dest = change_address (destmem, SImode, destptr);
21926 emit_insn (gen_strset (destptr, dest, value));
21927 emit_insn (gen_strset (destptr, dest, value));
21928 }
21929 emit_label (label);
21930 LABEL_NUSES (label) = 1;
21931 }
21932 if (max_size > 4)
21933 {
21934 rtx label = ix86_expand_aligntest (count, 4, true);
21935 dest = change_address (destmem, SImode, destptr);
21936 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21937 emit_label (label);
21938 LABEL_NUSES (label) = 1;
21939 }
21940 if (max_size > 2)
21941 {
21942 rtx label = ix86_expand_aligntest (count, 2, true);
21943 dest = change_address (destmem, HImode, destptr);
21944 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21945 emit_label (label);
21946 LABEL_NUSES (label) = 1;
21947 }
21948 if (max_size > 1)
21949 {
21950 rtx label = ix86_expand_aligntest (count, 1, true);
21951 dest = change_address (destmem, QImode, destptr);
21952 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21953 emit_label (label);
21954 LABEL_NUSES (label) = 1;
21955 }
21956 }
21957
21958 /* Copy enough bytes from SRC to DEST to align DEST, which is known to be
21959 aligned by ALIGN, to DESIRED_ALIGNMENT. */
21960 static void
21961 expand_movmem_prologue (rtx destmem, rtx srcmem,
21962 rtx destptr, rtx srcptr, rtx count,
21963 int align, int desired_alignment)
21964 {
21965 if (align <= 1 && desired_alignment > 1)
21966 {
21967 rtx label = ix86_expand_aligntest (destptr, 1, false);
21968 srcmem = change_address (srcmem, QImode, srcptr);
21969 destmem = change_address (destmem, QImode, destptr);
21970 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21971 ix86_adjust_counter (count, 1);
21972 emit_label (label);
21973 LABEL_NUSES (label) = 1;
21974 }
21975 if (align <= 2 && desired_alignment > 2)
21976 {
21977 rtx label = ix86_expand_aligntest (destptr, 2, false);
21978 srcmem = change_address (srcmem, HImode, srcptr);
21979 destmem = change_address (destmem, HImode, destptr);
21980 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21981 ix86_adjust_counter (count, 2);
21982 emit_label (label);
21983 LABEL_NUSES (label) = 1;
21984 }
21985 if (align <= 4 && desired_alignment > 4)
21986 {
21987 rtx label = ix86_expand_aligntest (destptr, 4, false);
21988 srcmem = change_address (srcmem, SImode, srcptr);
21989 destmem = change_address (destmem, SImode, destptr);
21990 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21991 ix86_adjust_counter (count, 4);
21992 emit_label (label);
21993 LABEL_NUSES (label) = 1;
21994 }
21995 gcc_assert (desired_alignment <= 8);
21996 }
21997
21998 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21999 ALIGN_BYTES is how many bytes need to be copied. */
22000 static rtx
22001 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22002 int desired_align, int align_bytes)
22003 {
22004 rtx src = *srcp;
22005 rtx orig_dst = dst;
22006 rtx orig_src = src;
22007 int off = 0;
22008 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22009 if (src_align_bytes >= 0)
22010 src_align_bytes = desired_align - src_align_bytes;
22011 if (align_bytes & 1)
22012 {
22013 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22014 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22015 off = 1;
22016 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22017 }
22018 if (align_bytes & 2)
22019 {
22020 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22021 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22022 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22023 set_mem_align (dst, 2 * BITS_PER_UNIT);
22024 if (src_align_bytes >= 0
22025 && (src_align_bytes & 1) == (align_bytes & 1)
22026 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22027 set_mem_align (src, 2 * BITS_PER_UNIT);
22028 off = 2;
22029 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22030 }
22031 if (align_bytes & 4)
22032 {
22033 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22034 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22035 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22036 set_mem_align (dst, 4 * BITS_PER_UNIT);
22037 if (src_align_bytes >= 0)
22038 {
22039 unsigned int src_align = 0;
22040 if ((src_align_bytes & 3) == (align_bytes & 3))
22041 src_align = 4;
22042 else if ((src_align_bytes & 1) == (align_bytes & 1))
22043 src_align = 2;
22044 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22045 set_mem_align (src, src_align * BITS_PER_UNIT);
22046 }
22047 off = 4;
22048 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22049 }
22050 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22051 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22052 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22053 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22054 if (src_align_bytes >= 0)
22055 {
22056 unsigned int src_align = 0;
22057 if ((src_align_bytes & 7) == (align_bytes & 7))
22058 src_align = 8;
22059 else if ((src_align_bytes & 3) == (align_bytes & 3))
22060 src_align = 4;
22061 else if ((src_align_bytes & 1) == (align_bytes & 1))
22062 src_align = 2;
22063 if (src_align > (unsigned int) desired_align)
22064 src_align = desired_align;
22065 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22066 set_mem_align (src, src_align * BITS_PER_UNIT);
22067 }
22068 if (MEM_SIZE_KNOWN_P (orig_dst))
22069 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22070 if (MEM_SIZE_KNOWN_P (orig_src))
22071 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22072 *srcp = src;
22073 return dst;
22074 }
22075
22076 /* Store enough bytes at DEST to align DEST, which is known to be aligned
22077 by ALIGN, to DESIRED_ALIGNMENT. */
22078 static void
22079 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22080 int align, int desired_alignment)
22081 {
22082 if (align <= 1 && desired_alignment > 1)
22083 {
22084 rtx label = ix86_expand_aligntest (destptr, 1, false);
22085 destmem = change_address (destmem, QImode, destptr);
22086 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22087 ix86_adjust_counter (count, 1);
22088 emit_label (label);
22089 LABEL_NUSES (label) = 1;
22090 }
22091 if (align <= 2 && desired_alignment > 2)
22092 {
22093 rtx label = ix86_expand_aligntest (destptr, 2, false);
22094 destmem = change_address (destmem, HImode, destptr);
22095 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22096 ix86_adjust_counter (count, 2);
22097 emit_label (label);
22098 LABEL_NUSES (label) = 1;
22099 }
22100 if (align <= 4 && desired_alignment > 4)
22101 {
22102 rtx label = ix86_expand_aligntest (destptr, 4, false);
22103 destmem = change_address (destmem, SImode, destptr);
22104 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22105 ix86_adjust_counter (count, 4);
22106 emit_label (label);
22107 LABEL_NUSES (label) = 1;
22108 }
22109 gcc_assert (desired_alignment <= 8);
22110 }
22111
22112 /* Store enough bytes at DST to align DST to DESIRED_ALIGN.
22113 ALIGN_BYTES is how many bytes need to be stored. */
22114 static rtx
22115 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22116 int desired_align, int align_bytes)
22117 {
22118 int off = 0;
22119 rtx orig_dst = dst;
22120 if (align_bytes & 1)
22121 {
22122 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22123 off = 1;
22124 emit_insn (gen_strset (destreg, dst,
22125 gen_lowpart (QImode, value)));
22126 }
22127 if (align_bytes & 2)
22128 {
22129 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22130 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22131 set_mem_align (dst, 2 * BITS_PER_UNIT);
22132 off = 2;
22133 emit_insn (gen_strset (destreg, dst,
22134 gen_lowpart (HImode, value)));
22135 }
22136 if (align_bytes & 4)
22137 {
22138 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22139 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22140 set_mem_align (dst, 4 * BITS_PER_UNIT);
22141 off = 4;
22142 emit_insn (gen_strset (destreg, dst,
22143 gen_lowpart (SImode, value)));
22144 }
22145 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22146 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22147 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22148 if (MEM_SIZE_KNOWN_P (orig_dst))
22149 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22150 return dst;
22151 }
22152
22153 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22154 static enum stringop_alg
22155 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22156 int *dynamic_check)
22157 {
22158 const struct stringop_algs * algs;
22159 bool optimize_for_speed;
22160 /* Algorithms using the rep prefix want at least edi and ecx;
22161 additionally, memset wants eax and memcpy wants esi. Don't
22162 consider such algorithms if the user has appropriated those
22163 registers for their own purposes. */
22164 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22165 || (memset
22166 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22167
22168 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22169 || (alg != rep_prefix_1_byte \
22170 && alg != rep_prefix_4_byte \
22171 && alg != rep_prefix_8_byte))
22172 const struct processor_costs *cost;
22173
22174 /* Even if the string operation call is cold, we still might spend a lot
22175 of time processing large blocks. */
22176 if (optimize_function_for_size_p (cfun)
22177 || (optimize_insn_for_size_p ()
22178 && expected_size != -1 && expected_size < 256))
22179 optimize_for_speed = false;
22180 else
22181 optimize_for_speed = true;
22182
22183 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22184
22185 *dynamic_check = -1;
22186 if (memset)
22187 algs = &cost->memset[TARGET_64BIT != 0];
22188 else
22189 algs = &cost->memcpy[TARGET_64BIT != 0];
22190 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22191 return ix86_stringop_alg;
22192 /* rep; movq or rep; movl is the smallest variant. */
22193 else if (!optimize_for_speed)
22194 {
22195 if (!count || (count & 3))
22196 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22197 else
22198 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22199 }
22200 /* Very tiny blocks are best handled via the loop; REP is expensive to
22201 set up. */
22202 else if (expected_size != -1 && expected_size < 4)
22203 return loop_1_byte;
22204 else if (expected_size != -1)
22205 {
22206 unsigned int i;
22207 enum stringop_alg alg = libcall;
22208 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22209 {
22210 /* We get here if the algorithms that were not libcall-based
22211 were rep-prefix based and we are unable to use rep prefixes
22212 based on global register usage. Break out of the loop and
22213 use the heuristic below. */
22214 if (algs->size[i].max == 0)
22215 break;
22216 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22217 {
22218 enum stringop_alg candidate = algs->size[i].alg;
22219
22220 if (candidate != libcall && ALG_USABLE_P (candidate))
22221 alg = candidate;
22222 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22223 last non-libcall inline algorithm. */
22224 if (TARGET_INLINE_ALL_STRINGOPS)
22225 {
22226 /* When the current size is best copied by a libcall,
22227 but we are still forced to inline, run the heuristic below
22228 that will pick code for medium-sized blocks. */
22229 if (alg != libcall)
22230 return alg;
22231 break;
22232 }
22233 else if (ALG_USABLE_P (candidate))
22234 return candidate;
22235 }
22236 }
22237 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22238 }
22239 /* When asked to inline the call anyway, try to pick a meaningful choice.
22240 We look for the maximal size of block that is faster to copy by hand and
22241 take blocks of at most that size, guessing that the average size will
22242 be roughly half of the block.
22243
22244 If this turns out to be bad, we might simply specify the preferred
22245 choice in ix86_costs. */
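/* As an illustration (assuming the default cap used below): if the largest
   block size the cost table still prefers to handle inline is 4096 bytes,
   we recurse with an expected size of 2048 to pick the algorithm, and with
   -minline-stringops-dynamically we additionally emit a run-time size check
   so that blocks of 4096 bytes or more fall back to the library call. */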
22246 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22247 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22248 {
22249 int max = -1;
22250 enum stringop_alg alg;
22251 int i;
22252 bool any_alg_usable_p = true;
22253
22254 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22255 {
22256 enum stringop_alg candidate = algs->size[i].alg;
22257 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22258
22259 if (candidate != libcall && candidate
22260 && ALG_USABLE_P (candidate))
22261 max = algs->size[i].max;
22262 }
22263 /* If there aren't any usable algorithms, then recursing on
22264 smaller sizes isn't going to find anything. Just return the
22265 simple byte-at-a-time copy loop. */
22266 if (!any_alg_usable_p)
22267 {
22268 /* Pick something reasonable. */
22269 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22270 *dynamic_check = 128;
22271 return loop_1_byte;
22272 }
22273 if (max == -1)
22274 max = 4096;
22275 alg = decide_alg (count, max / 2, memset, dynamic_check);
22276 gcc_assert (*dynamic_check == -1);
22277 gcc_assert (alg != libcall);
22278 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22279 *dynamic_check = max;
22280 return alg;
22281 }
22282 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22283 #undef ALG_USABLE_P
22284 }
22285
22286 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22287 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22288 static int
22289 decide_alignment (int align,
22290 enum stringop_alg alg,
22291 int expected_size)
22292 {
22293 int desired_align = 0;
22294 switch (alg)
22295 {
22296 case no_stringop:
22297 gcc_unreachable ();
22298 case loop:
22299 case unrolled_loop:
22300 desired_align = GET_MODE_SIZE (Pmode);
22301 break;
22302 case rep_prefix_8_byte:
22303 desired_align = 8;
22304 break;
22305 case rep_prefix_4_byte:
22306 /* PentiumPro has special logic that triggers for 8-byte aligned blocks,
22307 copying a whole cache line at once. */
22308 if (TARGET_PENTIUMPRO)
22309 desired_align = 8;
22310 else
22311 desired_align = 4;
22312 break;
22313 case rep_prefix_1_byte:
22314 /* PentiumPro has special logic that triggers for 8-byte aligned blocks,
22315 copying a whole cache line at once. */
22316 if (TARGET_PENTIUMPRO)
22317 desired_align = 8;
22318 else
22319 desired_align = 1;
22320 break;
22321 case loop_1_byte:
22322 desired_align = 1;
22323 break;
22324 case libcall:
22325 return 0;
22326 }
22327
22328 if (optimize_size)
22329 desired_align = 1;
22330 if (desired_align < align)
22331 desired_align = align;
22332 if (expected_size != -1 && expected_size < 4)
22333 desired_align = align;
22334 return desired_align;
22335 }
22336
22337 /* Return the smallest power of 2 greater than VAL. */
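/* For instance, VAL = 3 yields 4 and VAL = 4 yields 8; the result is always
   strictly greater than VAL. */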
22338 static int
22339 smallest_pow2_greater_than (int val)
22340 {
22341 int ret = 1;
22342 while (ret <= val)
22343 ret <<= 1;
22344 return ret;
22345 }
22346
22347 /* Expand string move (memcpy) operation. Use i386 string operations
22348 when profitable. expand_setmem contains similar code. The code
22349 depends upon architecture, block size and alignment, but always has
22350 the same overall structure:
22351
22352 1) Prologue guard: a conditional that jumps to the epilogue for small
22353 blocks that can be handled by the epilogue alone. This is faster
22354 but also needed for correctness, since the prologue assumes the block
22355 is larger than the desired alignment.
22356
22357 Optional dynamic check for size and libcall for large
22358 blocks is emitted here too, with -minline-stringops-dynamically.
22359
22360 2) Prologue: copy the first few bytes in order to get the destination
22361 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22362 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22363 copied. We emit either a jump tree over power-of-two sized
22364 blocks, or a byte loop.
22365
22366 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22367 with the specified algorithm.
22368
22369 4) Epilogue: code copying the tail of the block that is too small to be
22370 handled by the main body (or up to the size guarded by the prologue guard). */
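/* A rough sketch of the emitted structure for the common case (not the
   exact RTL):

       if (count < epilogue_size_needed) goto epilogue;     step 1
       copy a few bytes until DEST reaches DESIRED_ALIGN     step 2
       main copying loop or rep-prefixed move                step 3
     epilogue:
       copy count & (epilogue_size_needed - 1) bytes         step 4  */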
22371
22372 bool
22373 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22374 rtx expected_align_exp, rtx expected_size_exp)
22375 {
22376 rtx destreg;
22377 rtx srcreg;
22378 rtx label = NULL;
22379 rtx tmp;
22380 rtx jump_around_label = NULL;
22381 HOST_WIDE_INT align = 1;
22382 unsigned HOST_WIDE_INT count = 0;
22383 HOST_WIDE_INT expected_size = -1;
22384 int size_needed = 0, epilogue_size_needed;
22385 int desired_align = 0, align_bytes = 0;
22386 enum stringop_alg alg;
22387 int dynamic_check;
22388 bool need_zero_guard = false;
22389
22390 if (CONST_INT_P (align_exp))
22391 align = INTVAL (align_exp);
22392 /* i386 can do misaligned access at a reasonably increased cost. */
22393 if (CONST_INT_P (expected_align_exp)
22394 && INTVAL (expected_align_exp) > align)
22395 align = INTVAL (expected_align_exp);
22396 /* ALIGN is the minimum of destination and source alignment, but we care here
22397 just about destination alignment. */
22398 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22399 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22400
22401 if (CONST_INT_P (count_exp))
22402 count = expected_size = INTVAL (count_exp);
22403 if (CONST_INT_P (expected_size_exp) && count == 0)
22404 expected_size = INTVAL (expected_size_exp);
22405
22406 /* Make sure we don't need to care about overflow later on. */
22407 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22408 return false;
22409
22410 /* Step 0: Decide on preferred algorithm, desired alignment and
22411 size of chunks to be copied by main loop. */
22412
22413 alg = decide_alg (count, expected_size, false, &dynamic_check);
22414 desired_align = decide_alignment (align, alg, expected_size);
22415
22416 if (!TARGET_ALIGN_STRINGOPS)
22417 align = desired_align;
22418
22419 if (alg == libcall)
22420 return false;
22421 gcc_assert (alg != no_stringop);
22422 if (!count)
22423 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22424 destreg = copy_addr_to_reg (XEXP (dst, 0));
22425 srcreg = copy_addr_to_reg (XEXP (src, 0));
22426 switch (alg)
22427 {
22428 case libcall:
22429 case no_stringop:
22430 gcc_unreachable ();
22431 case loop:
22432 need_zero_guard = true;
22433 size_needed = GET_MODE_SIZE (word_mode);
22434 break;
22435 case unrolled_loop:
22436 need_zero_guard = true;
22437 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22438 break;
22439 case rep_prefix_8_byte:
22440 size_needed = 8;
22441 break;
22442 case rep_prefix_4_byte:
22443 size_needed = 4;
22444 break;
22445 case rep_prefix_1_byte:
22446 size_needed = 1;
22447 break;
22448 case loop_1_byte:
22449 need_zero_guard = true;
22450 size_needed = 1;
22451 break;
22452 }
22453
22454 epilogue_size_needed = size_needed;
22455
22456 /* Step 1: Prologue guard. */
22457
22458 /* Alignment code needs count to be in a register. */
22459 if (CONST_INT_P (count_exp) && desired_align > align)
22460 {
22461 if (INTVAL (count_exp) > desired_align
22462 && INTVAL (count_exp) > size_needed)
22463 {
22464 align_bytes
22465 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22466 if (align_bytes <= 0)
22467 align_bytes = 0;
22468 else
22469 align_bytes = desired_align - align_bytes;
22470 }
22471 if (align_bytes == 0)
22472 count_exp = force_reg (counter_mode (count_exp), count_exp);
22473 }
22474 gcc_assert (desired_align >= 1 && align >= 1);
22475
22476 /* Ensure that alignment prologue won't copy past end of block. */
22477 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22478 {
22479 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22480 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22481 Make sure it is a power of 2. */
22482 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22483
22484 if (count)
22485 {
22486 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22487 {
22488 /* If main algorithm works on QImode, no epilogue is needed.
22489 For small sizes just don't align anything. */
22490 if (size_needed == 1)
22491 desired_align = align;
22492 else
22493 goto epilogue;
22494 }
22495 }
22496 else
22497 {
22498 label = gen_label_rtx ();
22499 emit_cmp_and_jump_insns (count_exp,
22500 GEN_INT (epilogue_size_needed),
22501 LTU, 0, counter_mode (count_exp), 1, label);
22502 if (expected_size == -1 || expected_size < epilogue_size_needed)
22503 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22504 else
22505 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22506 }
22507 }
22508
22509 /* Emit code to decide on runtime whether library call or inline should be
22510 used. */
22511 if (dynamic_check != -1)
22512 {
22513 if (CONST_INT_P (count_exp))
22514 {
22515 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22516 {
22517 emit_block_move_via_libcall (dst, src, count_exp, false);
22518 count_exp = const0_rtx;
22519 goto epilogue;
22520 }
22521 }
22522 else
22523 {
22524 rtx hot_label = gen_label_rtx ();
22525 jump_around_label = gen_label_rtx ();
22526 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22527 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22528 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22529 emit_block_move_via_libcall (dst, src, count_exp, false);
22530 emit_jump (jump_around_label);
22531 emit_label (hot_label);
22532 }
22533 }
22534
22535 /* Step 2: Alignment prologue. */
22536
22537 if (desired_align > align)
22538 {
22539 if (align_bytes == 0)
22540 {
22541 /* Except for the first move in the epilogue, we no longer know
22542 the constant offset in aliasing info. It does not seem worth
22543 the pain to maintain it for the first move, so throw away
22544 the info early. */
22545 src = change_address (src, BLKmode, srcreg);
22546 dst = change_address (dst, BLKmode, destreg);
22547 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22548 desired_align);
22549 }
22550 else
22551 {
22552 /* If we know how many bytes need to be stored before dst is
22553 sufficiently aligned, maintain aliasing info accurately. */
22554 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22555 desired_align, align_bytes);
22556 count_exp = plus_constant (counter_mode (count_exp),
22557 count_exp, -align_bytes);
22558 count -= align_bytes;
22559 }
22560 if (need_zero_guard
22561 && (count < (unsigned HOST_WIDE_INT) size_needed
22562 || (align_bytes == 0
22563 && count < ((unsigned HOST_WIDE_INT) size_needed
22564 + desired_align - align))))
22565 {
22566 /* It is possible that we copied enough so the main loop will not
22567 execute. */
22568 gcc_assert (size_needed > 1);
22569 if (label == NULL_RTX)
22570 label = gen_label_rtx ();
22571 emit_cmp_and_jump_insns (count_exp,
22572 GEN_INT (size_needed),
22573 LTU, 0, counter_mode (count_exp), 1, label);
22574 if (expected_size == -1
22575 || expected_size < (desired_align - align) / 2 + size_needed)
22576 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22577 else
22578 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22579 }
22580 }
22581 if (label && size_needed == 1)
22582 {
22583 emit_label (label);
22584 LABEL_NUSES (label) = 1;
22585 label = NULL;
22586 epilogue_size_needed = 1;
22587 }
22588 else if (label == NULL_RTX)
22589 epilogue_size_needed = size_needed;
22590
22591 /* Step 3: Main loop. */
22592
22593 switch (alg)
22594 {
22595 case libcall:
22596 case no_stringop:
22597 gcc_unreachable ();
22598 case loop_1_byte:
22599 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22600 count_exp, QImode, 1, expected_size);
22601 break;
22602 case loop:
22603 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22604 count_exp, word_mode, 1, expected_size);
22605 break;
22606 case unrolled_loop:
22607 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22608 registers for 4 temporaries anyway. */
22609 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22610 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22611 expected_size);
22612 break;
22613 case rep_prefix_8_byte:
22614 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22615 DImode);
22616 break;
22617 case rep_prefix_4_byte:
22618 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22619 SImode);
22620 break;
22621 case rep_prefix_1_byte:
22622 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22623 QImode);
22624 break;
22625 }
22626 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22627 if (CONST_INT_P (count_exp))
22628 {
22629 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22630 (count / size_needed) * size_needed);
22631 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22632 (count / size_needed) * size_needed);
22633 }
22634 else
22635 {
22636 src = change_address (src, BLKmode, srcreg);
22637 dst = change_address (dst, BLKmode, destreg);
22638 }
22639
22640 /* Step 4: Epilogue to copy the remaining bytes. */
22641 epilogue:
22642 if (label)
22643 {
22644 /* When the main loop is done, COUNT_EXP might hold the original count,
22645 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22646 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22647 bytes. Compensate if needed. */
22648
22649 if (size_needed < epilogue_size_needed)
22650 {
22651 tmp =
22652 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22653 GEN_INT (size_needed - 1), count_exp, 1,
22654 OPTAB_DIRECT);
22655 if (tmp != count_exp)
22656 emit_move_insn (count_exp, tmp);
22657 }
22658 emit_label (label);
22659 LABEL_NUSES (label) = 1;
22660 }
22661
22662 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22663 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22664 epilogue_size_needed);
22665 if (jump_around_label)
22666 emit_label (jump_around_label);
22667 return true;
22668 }
22669
22670 /* Helper function for memset. For the QImode value 0xXY produce
22671 0xXYXYXYXY of the width specified by MODE. This is essentially
22672 VAL * 0x01010101, but we can do slightly better than
22673 synth_mult by unwinding the sequence by hand on CPUs with
22674 slow multiply. */
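/* For example, promoting VAL == 0xab to SImode yields 0xabababab, built
   either by one multiply by 0x01010101 or by the unwound sequence
   0xab -> 0xabab -> 0xabababab below, whichever the cost model prefers. */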
22675 static rtx
22676 promote_duplicated_reg (enum machine_mode mode, rtx val)
22677 {
22678 enum machine_mode valmode = GET_MODE (val);
22679 rtx tmp;
22680 int nops = mode == DImode ? 3 : 2;
22681
22682 gcc_assert (mode == SImode || mode == DImode);
22683 if (val == const0_rtx)
22684 return copy_to_mode_reg (mode, const0_rtx);
22685 if (CONST_INT_P (val))
22686 {
22687 HOST_WIDE_INT v = INTVAL (val) & 255;
22688
22689 v |= v << 8;
22690 v |= v << 16;
22691 if (mode == DImode)
22692 v |= (v << 16) << 16;
22693 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22694 }
22695
22696 if (valmode == VOIDmode)
22697 valmode = QImode;
22698 if (valmode != QImode)
22699 val = gen_lowpart (QImode, val);
22700 if (mode == QImode)
22701 return val;
22702 if (!TARGET_PARTIAL_REG_STALL)
22703 nops--;
22704 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22705 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22706 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22707 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22708 {
22709 rtx reg = convert_modes (mode, QImode, val, true);
22710 tmp = promote_duplicated_reg (mode, const1_rtx);
22711 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22712 OPTAB_DIRECT);
22713 }
22714 else
22715 {
22716 rtx reg = convert_modes (mode, QImode, val, true);
22717
22718 if (!TARGET_PARTIAL_REG_STALL)
22719 if (mode == SImode)
22720 emit_insn (gen_movsi_insv_1 (reg, reg));
22721 else
22722 emit_insn (gen_movdi_insv_1 (reg, reg));
22723 else
22724 {
22725 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22726 NULL, 1, OPTAB_DIRECT);
22727 reg =
22728 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22729 }
22730 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22731 NULL, 1, OPTAB_DIRECT);
22732 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22733 if (mode == SImode)
22734 return reg;
22735 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22736 NULL, 1, OPTAB_DIRECT);
22737 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22738 return reg;
22739 }
22740 }
22741
22742 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
22743 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
22744 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
22745 static rtx
22746 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22747 {
22748 rtx promoted_val;
22749
22750 if (TARGET_64BIT
22751 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22752 promoted_val = promote_duplicated_reg (DImode, val);
22753 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22754 promoted_val = promote_duplicated_reg (SImode, val);
22755 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22756 promoted_val = promote_duplicated_reg (HImode, val);
22757 else
22758 promoted_val = val;
22759
22760 return promoted_val;
22761 }
22762
22763 /* Expand string set operation (memset). Use i386 string operations when
22764 profitable. See the expand_movmem comment for an explanation of the
22765 individual steps performed. */
22766 bool
22767 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22768 rtx expected_align_exp, rtx expected_size_exp)
22769 {
22770 rtx destreg;
22771 rtx label = NULL;
22772 rtx tmp;
22773 rtx jump_around_label = NULL;
22774 HOST_WIDE_INT align = 1;
22775 unsigned HOST_WIDE_INT count = 0;
22776 HOST_WIDE_INT expected_size = -1;
22777 int size_needed = 0, epilogue_size_needed;
22778 int desired_align = 0, align_bytes = 0;
22779 enum stringop_alg alg;
22780 rtx promoted_val = NULL;
22781 bool force_loopy_epilogue = false;
22782 int dynamic_check;
22783 bool need_zero_guard = false;
22784
22785 if (CONST_INT_P (align_exp))
22786 align = INTVAL (align_exp);
22787 /* i386 can do misaligned access at a reasonably increased cost. */
22788 if (CONST_INT_P (expected_align_exp)
22789 && INTVAL (expected_align_exp) > align)
22790 align = INTVAL (expected_align_exp);
22791 if (CONST_INT_P (count_exp))
22792 count = expected_size = INTVAL (count_exp);
22793 if (CONST_INT_P (expected_size_exp) && count == 0)
22794 expected_size = INTVAL (expected_size_exp);
22795
22796 /* Make sure we don't need to care about overflow later on. */
22797 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22798 return false;
22799
22800 /* Step 0: Decide on preferred algorithm, desired alignment and
22801 size of chunks to be copied by main loop. */
22802
22803 alg = decide_alg (count, expected_size, true, &dynamic_check);
22804 desired_align = decide_alignment (align, alg, expected_size);
22805
22806 if (!TARGET_ALIGN_STRINGOPS)
22807 align = desired_align;
22808
22809 if (alg == libcall)
22810 return false;
22811 gcc_assert (alg != no_stringop);
22812 if (!count)
22813 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22814 destreg = copy_addr_to_reg (XEXP (dst, 0));
22815 switch (alg)
22816 {
22817 case libcall:
22818 case no_stringop:
22819 gcc_unreachable ();
22820 case loop:
22821 need_zero_guard = true;
22822 size_needed = GET_MODE_SIZE (word_mode);
22823 break;
22824 case unrolled_loop:
22825 need_zero_guard = true;
22826 size_needed = GET_MODE_SIZE (word_mode) * 4;
22827 break;
22828 case rep_prefix_8_byte:
22829 size_needed = 8;
22830 break;
22831 case rep_prefix_4_byte:
22832 size_needed = 4;
22833 break;
22834 case rep_prefix_1_byte:
22835 size_needed = 1;
22836 break;
22837 case loop_1_byte:
22838 need_zero_guard = true;
22839 size_needed = 1;
22840 break;
22841 }
22842 epilogue_size_needed = size_needed;
22843
22844 /* Step 1: Prologue guard. */
22845
22846 /* Alignment code needs count to be in a register. */
22847 if (CONST_INT_P (count_exp) && desired_align > align)
22848 {
22849 if (INTVAL (count_exp) > desired_align
22850 && INTVAL (count_exp) > size_needed)
22851 {
22852 align_bytes
22853 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22854 if (align_bytes <= 0)
22855 align_bytes = 0;
22856 else
22857 align_bytes = desired_align - align_bytes;
22858 }
22859 if (align_bytes == 0)
22860 {
22861 enum machine_mode mode = SImode;
22862 if (TARGET_64BIT && (count & ~0xffffffff))
22863 mode = DImode;
22864 count_exp = force_reg (mode, count_exp);
22865 }
22866 }
22867 /* Do the cheap promotion to allow better CSE across the
22868 main loop and epilogue (i.e. one load of the big constant in
22869 front of all the code). */
22870 if (CONST_INT_P (val_exp))
22871 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22872 desired_align, align);
22873 /* Ensure that alignment prologue won't copy past end of block. */
22874 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22875 {
22876 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22877 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22878 Make sure it is power of 2. */
22879 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22880
22881 /* To improve performance of small blocks, we jump around the code that
22882 promotes VAL. This means that if the promoted VAL is not a constant,
22883 we might not use it in the epilogue and have to use the byte
22884 loop variant. */
22885 if (epilogue_size_needed > 2 && !promoted_val)
22886 force_loopy_epilogue = true;
22887 if (count)
22888 {
22889 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22890 {
22891 /* If main algorithm works on QImode, no epilogue is needed.
22892 For small sizes just don't align anything. */
22893 if (size_needed == 1)
22894 desired_align = align;
22895 else
22896 goto epilogue;
22897 }
22898 }
22899 else
22900 {
22901 label = gen_label_rtx ();
22902 emit_cmp_and_jump_insns (count_exp,
22903 GEN_INT (epilogue_size_needed),
22904 LTU, 0, counter_mode (count_exp), 1, label);
22905 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22906 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22907 else
22908 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22909 }
22910 }
22911 if (dynamic_check != -1)
22912 {
22913 rtx hot_label = gen_label_rtx ();
22914 jump_around_label = gen_label_rtx ();
22915 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22916 LEU, 0, counter_mode (count_exp), 1, hot_label);
22917 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22918 set_storage_via_libcall (dst, count_exp, val_exp, false);
22919 emit_jump (jump_around_label);
22920 emit_label (hot_label);
22921 }
22922
22923 /* Step 2: Alignment prologue. */
22924
22925 /* Do the expensive promotion once we have branched off the small blocks. */
22926 if (!promoted_val)
22927 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22928 desired_align, align);
22929 gcc_assert (desired_align >= 1 && align >= 1);
22930
22931 if (desired_align > align)
22932 {
22933 if (align_bytes == 0)
22934 {
22935 /* Except for the first move in the epilogue, we no longer know
22936 the constant offset in aliasing info. It does not seem worth
22937 the pain to maintain it for the first move, so throw away
22938 the info early. */
22939 dst = change_address (dst, BLKmode, destreg);
22940 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22941 desired_align);
22942 }
22943 else
22944 {
22945 /* If we know how many bytes need to be stored before dst is
22946 sufficiently aligned, maintain aliasing info accurately. */
22947 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22948 desired_align, align_bytes);
22949 count_exp = plus_constant (counter_mode (count_exp),
22950 count_exp, -align_bytes);
22951 count -= align_bytes;
22952 }
22953 if (need_zero_guard
22954 && (count < (unsigned HOST_WIDE_INT) size_needed
22955 || (align_bytes == 0
22956 && count < ((unsigned HOST_WIDE_INT) size_needed
22957 + desired_align - align))))
22958 {
22959 /* It is possible that we copied enough so the main loop will not
22960 execute. */
22961 gcc_assert (size_needed > 1);
22962 if (label == NULL_RTX)
22963 label = gen_label_rtx ();
22964 emit_cmp_and_jump_insns (count_exp,
22965 GEN_INT (size_needed),
22966 LTU, 0, counter_mode (count_exp), 1, label);
22967 if (expected_size == -1
22968 || expected_size < (desired_align - align) / 2 + size_needed)
22969 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22970 else
22971 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22972 }
22973 }
22974 if (label && size_needed == 1)
22975 {
22976 emit_label (label);
22977 LABEL_NUSES (label) = 1;
22978 label = NULL;
22979 promoted_val = val_exp;
22980 epilogue_size_needed = 1;
22981 }
22982 else if (label == NULL_RTX)
22983 epilogue_size_needed = size_needed;
22984
22985 /* Step 3: Main loop. */
22986
22987 switch (alg)
22988 {
22989 case libcall:
22990 case no_stringop:
22991 gcc_unreachable ();
22992 case loop_1_byte:
22993 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22994 count_exp, QImode, 1, expected_size);
22995 break;
22996 case loop:
22997 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22998 count_exp, word_mode, 1, expected_size);
22999 break;
23000 case unrolled_loop:
23001 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23002 count_exp, word_mode, 4, expected_size);
23003 break;
23004 case rep_prefix_8_byte:
23005 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23006 DImode, val_exp);
23007 break;
23008 case rep_prefix_4_byte:
23009 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23010 SImode, val_exp);
23011 break;
23012 case rep_prefix_1_byte:
23013 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23014 QImode, val_exp);
23015 break;
23016 }
23017 /* Properly adjust the offset of the dest memory for aliasing. */
23018 if (CONST_INT_P (count_exp))
23019 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23020 (count / size_needed) * size_needed);
23021 else
23022 dst = change_address (dst, BLKmode, destreg);
23023
23024 /* Step 4: Epilogue to copy the remaining bytes. */
23025
23026 if (label)
23027 {
23028 /* When the main loop is done, COUNT_EXP might hold the original count,
23029 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23030 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23031 bytes. Compensate if needed. */
23032
23033 if (size_needed < epilogue_size_needed)
23034 {
23035 tmp =
23036 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23037 GEN_INT (size_needed - 1), count_exp, 1,
23038 OPTAB_DIRECT);
23039 if (tmp != count_exp)
23040 emit_move_insn (count_exp, tmp);
23041 }
23042 emit_label (label);
23043 LABEL_NUSES (label) = 1;
23044 }
23045 epilogue:
23046 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23047 {
23048 if (force_loopy_epilogue)
23049 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23050 epilogue_size_needed);
23051 else
23052 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23053 epilogue_size_needed);
23054 }
23055 if (jump_around_label)
23056 emit_label (jump_around_label);
23057 return true;
23058 }
23059
23060 /* Expand the appropriate insns for doing strlen if not just doing
23061 repnz; scasb
23062
23063 out = result, initialized with the start address
23064 align_rtx = alignment of the address.
23065 scratch = scratch register, initialized with the start address when
23066 not aligned, otherwise undefined
23067
23068 This is just the body. It needs the initializations mentioned above and
23069 some address computing at the end. These things are done in i386.md. */
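/* Roughly, the code below first checks up to three unaligned bytes one at a
   time until OUT is 4-byte aligned, then loops loading one word at a time
   and using a bit trick to test all four bytes for zero at once, and
   finally adjusts OUT to point at the terminating zero byte. */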
23070
23071 static void
23072 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23073 {
23074 int align;
23075 rtx tmp;
23076 rtx align_2_label = NULL_RTX;
23077 rtx align_3_label = NULL_RTX;
23078 rtx align_4_label = gen_label_rtx ();
23079 rtx end_0_label = gen_label_rtx ();
23080 rtx mem;
23081 rtx tmpreg = gen_reg_rtx (SImode);
23082 rtx scratch = gen_reg_rtx (SImode);
23083 rtx cmp;
23084
23085 align = 0;
23086 if (CONST_INT_P (align_rtx))
23087 align = INTVAL (align_rtx);
23088
23089 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23090
23091 /* Is there a known alignment and is it less than 4? */
23092 if (align < 4)
23093 {
23094 rtx scratch1 = gen_reg_rtx (Pmode);
23095 emit_move_insn (scratch1, out);
23096 /* Is there a known alignment and is it not 2? */
23097 if (align != 2)
23098 {
23099 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23100 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23101
23102 /* Leave just the 3 lower bits. */
23103 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23104 NULL_RTX, 0, OPTAB_WIDEN);
23105
23106 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23107 Pmode, 1, align_4_label);
23108 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23109 Pmode, 1, align_2_label);
23110 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23111 Pmode, 1, align_3_label);
23112 }
23113 else
23114 {
23115 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23116 check if it is aligned to a 4-byte boundary. */
23117
23118 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23119 NULL_RTX, 0, OPTAB_WIDEN);
23120
23121 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23122 Pmode, 1, align_4_label);
23123 }
23124
23125 mem = change_address (src, QImode, out);
23126
23127 /* Now compare the bytes. */
23128
23129 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23130 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23131 QImode, 1, end_0_label);
23132
23133 /* Increment the address. */
23134 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23135
23136 /* Not needed with an alignment of 2. */
23137 if (align != 2)
23138 {
23139 emit_label (align_2_label);
23140
23141 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23142 end_0_label);
23143
23144 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23145
23146 emit_label (align_3_label);
23147 }
23148
23149 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23150 end_0_label);
23151
23152 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23153 }
23154
23155 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23156 align this loop; it only makes the program bigger and does not help
23157 to speed it up. */
23158 emit_label (align_4_label);
23159
23160 mem = change_address (src, SImode, out);
23161 emit_move_insn (scratch, mem);
23162 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23163
23164 /* This formula yields a nonzero result iff one of the bytes is zero.
23165 This saves three branches inside the loop and many cycles. */
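/* Concretely, the value computed below is
   (scratch - 0x01010101) & ~scratch & 0x80808080.
   Subtracting 0x01010101 borrows into bit 7 of the lowest zero byte, and the
   ~scratch factor masks bytes whose own high bit was already set, so the
   lowest set marker bit identifies the first zero byte. For example,
   scratch = 0x41420043 yields 0x00008000, flagging the zero byte, while
   scratch = 0x41424344 yields 0. */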
23166
23167 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23168 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23169 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23170 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23171 gen_int_mode (0x80808080, SImode)));
23172 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23173 align_4_label);
23174
23175 if (TARGET_CMOVE)
23176 {
23177 rtx reg = gen_reg_rtx (SImode);
23178 rtx reg2 = gen_reg_rtx (Pmode);
23179 emit_move_insn (reg, tmpreg);
23180 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23181
23182 /* If zero is not in the first two bytes, move two bytes forward. */
23183 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23184 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23185 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23186 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23187 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23188 reg,
23189 tmpreg)));
23190 /* Emit lea manually to avoid clobbering of flags. */
23191 emit_insn (gen_rtx_SET (SImode, reg2,
23192 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23193
23194 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23195 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23196 emit_insn (gen_rtx_SET (VOIDmode, out,
23197 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23198 reg2,
23199 out)));
23200 }
23201 else
23202 {
23203 rtx end_2_label = gen_label_rtx ();
23204 /* Is zero in the first two bytes? */
23205
23206 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23207 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23208 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23209 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23210 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23211 pc_rtx);
23212 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23213 JUMP_LABEL (tmp) = end_2_label;
23214
23215 /* Not in the first two. Move two bytes forward. */
23216 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23217 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23218
23219 emit_label (end_2_label);
23220
23221 }
23222
23223 /* Avoid a branch in fixing up the final byte position. */
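/* At this point OUT is either 4 or 6 bytes past the start of the word that
   contained the zero byte, and bit 7 of the low byte of TMPREG is set
   exactly when the zero is the lower of the two remaining candidate bytes.
   Doubling TMPREG moves that bit into the carry flag, so the
   subtract-with-borrow below backs OUT up by 4 or 3 bytes, leaving it
   pointing at the zero byte without needing a branch. */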
23224 tmpreg = gen_lowpart (QImode, tmpreg);
23225 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23226 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23227 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23228 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23229
23230 emit_label (end_0_label);
23231 }
23232
23233 /* Expand strlen. */
23234
23235 bool
23236 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23237 {
23238 rtx addr, scratch1, scratch2, scratch3, scratch4;
23239
23240 /* The generic case of the strlen expander is long. Avoid expanding
23241 it unless TARGET_INLINE_ALL_STRINGOPS. */
23242
23243 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23244 && !TARGET_INLINE_ALL_STRINGOPS
23245 && !optimize_insn_for_size_p ()
23246 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23247 return false;
23248
23249 addr = force_reg (Pmode, XEXP (src, 0));
23250 scratch1 = gen_reg_rtx (Pmode);
23251
23252 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23253 && !optimize_insn_for_size_p ())
23254 {
23255 /* Well, it seems that some optimizer does not combine a call like
23256 foo(strlen(bar), strlen(bar));
23257 when the move and the subtraction are done here. It does calculate
23258 the length just once when these instructions are done inside of
23259 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23260 often used and I use one fewer register for the lifetime of
23261 output_strlen_unroll() this is better. */
23262
23263 emit_move_insn (out, addr);
23264
23265 ix86_expand_strlensi_unroll_1 (out, src, align);
23266
23267 /* strlensi_unroll_1 returns the address of the zero at the end of
23268 the string, like memchr(), so compute the length by subtracting
23269 the start address. */
23270 emit_insn (ix86_gen_sub3 (out, out, addr));
23271 }
23272 else
23273 {
23274 rtx unspec;
23275
23276 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23277 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23278 return false;
23279
23280 scratch2 = gen_reg_rtx (Pmode);
23281 scratch3 = gen_reg_rtx (Pmode);
23282 scratch4 = force_reg (Pmode, constm1_rtx);
23283
23284 emit_move_insn (scratch3, addr);
23285 eoschar = force_reg (QImode, eoschar);
23286
23287 src = replace_equiv_address_nv (src, scratch3);
23288
23289 /* If .md starts supporting :P, this can be done in .md. */
23290 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23291 scratch4), UNSPEC_SCAS);
23292 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23293 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23294 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23295 }
23296 return true;
23297 }
23298
23299 /* For a given symbol (function), construct code to compute the address of its
23300 PLT entry in the large x86-64 PIC model. */
23301 static rtx
23302 construct_plt_address (rtx symbol)
23303 {
23304 rtx tmp, unspec;
23305
23306 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23307 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23308 gcc_assert (Pmode == DImode);
23309
23310 tmp = gen_reg_rtx (Pmode);
23311 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23312
23313 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23314 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23315 return tmp;
23316 }
23317
23318 rtx
23319 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23320 rtx callarg2,
23321 rtx pop, bool sibcall)
23322 {
23323 /* We need to represent that the SI, DI and XMM6-XMM15 registers are
23324 clobbered by SYSV calls. */
23325 static int clobbered_registers[] = {
23326 XMM6_REG, XMM7_REG, XMM8_REG,
23327 XMM9_REG, XMM10_REG, XMM11_REG,
23328 XMM12_REG, XMM13_REG, XMM14_REG,
23329 XMM15_REG, SI_REG, DI_REG
23330 };
23331 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23332 rtx use = NULL, call;
23333 unsigned int vec_len;
23334
23335 if (pop == const0_rtx)
23336 pop = NULL;
23337 gcc_assert (!TARGET_64BIT || !pop);
23338
23339 if (TARGET_MACHO && !TARGET_64BIT)
23340 {
23341 #if TARGET_MACHO
23342 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23343 fnaddr = machopic_indirect_call_target (fnaddr);
23344 #endif
23345 }
23346 else
23347 {
23348 /* Static functions and indirect calls don't need the pic register. */
23349 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23350 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23351 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23352 use_reg (&use, pic_offset_table_rtx);
23353 }
23354
23355 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23356 {
23357 rtx al = gen_rtx_REG (QImode, AX_REG);
23358 emit_move_insn (al, callarg2);
23359 use_reg (&use, al);
23360 }
23361
23362 if (ix86_cmodel == CM_LARGE_PIC
23363 && MEM_P (fnaddr)
23364 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23365 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23366 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23367 else if (sibcall
23368 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23369 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23370 {
23371 fnaddr = XEXP (fnaddr, 0);
23372 if (GET_MODE (fnaddr) != word_mode)
23373 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23374 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23375 }
23376
23377 vec_len = 0;
23378 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23379 if (retval)
23380 call = gen_rtx_SET (VOIDmode, retval, call);
23381 vec[vec_len++] = call;
23382
23383 if (pop)
23384 {
23385 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23386 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23387 vec[vec_len++] = pop;
23388 }
23389
23390 if (TARGET_64BIT_MS_ABI
23391 && (!callarg2 || INTVAL (callarg2) != -2))
23392 {
23393 unsigned i;
23394
23395 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23396 UNSPEC_MS_TO_SYSV_CALL);
23397
23398 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23399 vec[vec_len++]
23400 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23401 ? TImode : DImode,
23402 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23403 ? TImode : DImode,
23404 clobbered_registers[i]));
23405 }
23406
23407 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23408 if (TARGET_VZEROUPPER)
23409 {
23410 int avx256;
23411 if (cfun->machine->callee_pass_avx256_p)
23412 {
23413 if (cfun->machine->callee_return_avx256_p)
23414 avx256 = callee_return_pass_avx256;
23415 else
23416 avx256 = callee_pass_avx256;
23417 }
23418 else if (cfun->machine->callee_return_avx256_p)
23419 avx256 = callee_return_avx256;
23420 else
23421 avx256 = call_no_avx256;
23422
23423 if (reload_completed)
23424 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23425 else
23426 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23427 gen_rtvec (1, GEN_INT (avx256)),
23428 UNSPEC_CALL_NEEDS_VZEROUPPER);
23429 }
23430
23431 if (vec_len > 1)
23432 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23433 call = emit_call_insn (call);
23434 if (use)
23435 CALL_INSN_FUNCTION_USAGE (call) = use;
23436
23437 return call;
23438 }
23439
23440 void
23441 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23442 {
23443 rtx pat = PATTERN (insn);
23444 rtvec vec = XVEC (pat, 0);
23445 int len = GET_NUM_ELEM (vec) - 1;
23446
23447 /* Strip off the last entry of the parallel. */
23448 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23449 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23450 if (len == 1)
23451 pat = RTVEC_ELT (vec, 0);
23452 else
23453 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23454
23455 emit_insn (gen_avx_vzeroupper (vzeroupper));
23456 emit_call_insn (pat);
23457 }
23458
23459 /* Output the assembly for a call instruction. */
23460
23461 const char *
23462 ix86_output_call_insn (rtx insn, rtx call_op)
23463 {
23464 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23465 bool seh_nop_p = false;
23466 const char *xasm;
23467
23468 if (SIBLING_CALL_P (insn))
23469 {
23470 if (direct_p)
23471 xasm = "jmp\t%P0";
23472 /* SEH epilogue detection requires the indirect branch case
23473 to include REX.W. */
23474 else if (TARGET_SEH)
23475 xasm = "rex.W jmp %A0";
23476 else
23477 xasm = "jmp\t%A0";
23478
23479 output_asm_insn (xasm, &call_op);
23480 return "";
23481 }
23482
23483 /* SEH unwinding can require an extra nop to be emitted in several
23484 circumstances. Determine if we have one of those. */
23485 if (TARGET_SEH)
23486 {
23487 rtx i;
23488
23489 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23490 {
23491 /* If we get to another real insn, we don't need the nop. */
23492 if (INSN_P (i))
23493 break;
23494
23495 /* If we get to the epilogue note, prevent a catch region from
23496 being adjacent to the standard epilogue sequence. With
23497 -fnon-call-exceptions, we'll have done this during epilogue emission. */
23498 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23499 && !flag_non_call_exceptions
23500 && !can_throw_internal (insn))
23501 {
23502 seh_nop_p = true;
23503 break;
23504 }
23505 }
23506
23507 /* If we didn't find a real insn following the call, prevent the
23508 unwinder from looking into the next function. */
23509 if (i == NULL)
23510 seh_nop_p = true;
23511 }
23512
23513 if (direct_p)
23514 xasm = "call\t%P0";
23515 else
23516 xasm = "call\t%A0";
23517
23518 output_asm_insn (xasm, &call_op);
23519
23520 if (seh_nop_p)
23521 return "nop";
23522
23523 return "";
23524 }
23525 \f
23526 /* Clear stack slot assignments remembered from previous functions.
23527 This is called from INIT_EXPANDERS once before RTL is emitted for each
23528 function. */
23529
23530 static struct machine_function *
23531 ix86_init_machine_status (void)
23532 {
23533 struct machine_function *f;
23534
23535 f = ggc_alloc_cleared_machine_function ();
23536 f->use_fast_prologue_epilogue_nregs = -1;
23537 f->tls_descriptor_call_expanded_p = 0;
23538 f->call_abi = ix86_abi;
23539
23540 return f;
23541 }
23542
23543 /* Return a MEM corresponding to a stack slot with mode MODE.
23544 Allocate a new slot if necessary.
23545
23546 The RTL for a function can have several slots available: N is
23547 which slot to use. */
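/* Note, for illustration, that slots are remembered in ix86_stack_locals
   and reused: a second request for the same (MODE, N) pair returns a copy
   of the same MEM rather than allocating a new stack slot.  */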
23548
23549 rtx
23550 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23551 {
23552 struct stack_local_entry *s;
23553
23554 gcc_assert (n < MAX_386_STACK_LOCALS);
23555
23556 /* Virtual slot is valid only before vregs are instantiated. */
23557 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23558
23559 for (s = ix86_stack_locals; s; s = s->next)
23560 if (s->mode == mode && s->n == n)
23561 return validize_mem (copy_rtx (s->rtl));
23562
23563 s = ggc_alloc_stack_local_entry ();
23564 s->n = n;
23565 s->mode = mode;
23566 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23567
23568 s->next = ix86_stack_locals;
23569 ix86_stack_locals = s;
23570 return validize_mem (s->rtl);
23571 }
23572 \f
23573 /* Calculate the length of the memory address in the instruction encoding.
23574 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23575 or other prefixes. */
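/* For illustration only (the modrm byte itself is not counted):
   (%eax)          -> 0  plain register indirect
   (%esp)          -> 1  esp as base needs a SIB byte
   8(%ebp)         -> 1  ebp as base needs a displacement (disp8 here)
   8(%eax,%ebx,4)  -> 2  SIB byte plus disp8
   foo(%eax)       -> 4  symbolic disp32
   These lengths follow from the rules implemented below.  */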
23576
23577 int
23578 memory_address_length (rtx addr)
23579 {
23580 struct ix86_address parts;
23581 rtx base, index, disp;
23582 int len;
23583 int ok;
23584
23585 if (GET_CODE (addr) == PRE_DEC
23586 || GET_CODE (addr) == POST_INC
23587 || GET_CODE (addr) == PRE_MODIFY
23588 || GET_CODE (addr) == POST_MODIFY)
23589 return 0;
23590
23591 ok = ix86_decompose_address (addr, &parts);
23592 gcc_assert (ok);
23593
23594 if (parts.base && GET_CODE (parts.base) == SUBREG)
23595 parts.base = SUBREG_REG (parts.base);
23596 if (parts.index && GET_CODE (parts.index) == SUBREG)
23597 parts.index = SUBREG_REG (parts.index);
23598
23599 base = parts.base;
23600 index = parts.index;
23601 disp = parts.disp;
23602
23603 /* Add length of addr32 prefix. */
23604 len = (GET_CODE (addr) == ZERO_EXTEND
23605 || GET_CODE (addr) == AND);
23606
23607 /* Rule of thumb:
23608 - esp as the base always wants an index,
23609 - ebp as the base always wants a displacement,
23610 - r12 as the base always wants an index,
23611 - r13 as the base always wants a displacement. */
23612
23613 /* Register Indirect. */
23614 if (base && !index && !disp)
23615 {
23616 /* esp (for its index) and ebp (for its displacement) need
23617 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23618 code. */
23619 if (REG_P (addr)
23620 && (addr == arg_pointer_rtx
23621 || addr == frame_pointer_rtx
23622 || REGNO (addr) == SP_REG
23623 || REGNO (addr) == BP_REG
23624 || REGNO (addr) == R12_REG
23625 || REGNO (addr) == R13_REG))
23626 len = 1;
23627 }
23628
23629 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23630 is not disp32 but disp32(%rip), so for plain disp32 a
23631 SIB byte is needed, unless print_operand_address
23632 optimizes it into disp32(%rip) or (%rip) is implied
23633 by an UNSPEC. */
23634 else if (disp && !base && !index)
23635 {
23636 len = 4;
23637 if (TARGET_64BIT)
23638 {
23639 rtx symbol = disp;
23640
23641 if (GET_CODE (disp) == CONST)
23642 symbol = XEXP (disp, 0);
23643 if (GET_CODE (symbol) == PLUS
23644 && CONST_INT_P (XEXP (symbol, 1)))
23645 symbol = XEXP (symbol, 0);
23646
23647 if (GET_CODE (symbol) != LABEL_REF
23648 && (GET_CODE (symbol) != SYMBOL_REF
23649 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23650 && (GET_CODE (symbol) != UNSPEC
23651 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23652 && XINT (symbol, 1) != UNSPEC_PCREL
23653 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23654 len += 1;
23655 }
23656 }
23657
23658 else
23659 {
23660 /* Find the length of the displacement constant. */
23661 if (disp)
23662 {
23663 if (base && satisfies_constraint_K (disp))
23664 len = 1;
23665 else
23666 len = 4;
23667 }
23668 /* ebp always wants a displacement. Similarly r13. */
23669 else if (base && REG_P (base)
23670 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23671 len = 1;
23672
23673 /* An index requires the two-byte modrm form.... */
23674 if (index
23675 /* ...like esp (or r12), which always wants an index. */
23676 || base == arg_pointer_rtx
23677 || base == frame_pointer_rtx
23678 || (base && REG_P (base)
23679 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23680 len += 1;
23681 }
23682
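 /* An %fs or %gs segment override adds one prefix byte.  */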
23683 switch (parts.seg)
23684 {
23685 case SEG_FS:
23686 case SEG_GS:
23687 len += 1;
23688 break;
23689 default:
23690 break;
23691 }
23692
23693 return len;
23694 }
23695
23696 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23697 is set, expect that the insn has an 8bit immediate alternative. */
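/* For illustration: with SHORTFORM, "addl $100, %eax" can use the
   sign-extended 8-bit immediate form, so the immediate contributes 1 byte,
   while "addl $1000, %eax" needs a full 32-bit immediate (4 bytes).  */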
23698 int
23699 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23700 {
23701 int len = 0;
23702 int i;
23703 extract_insn_cached (insn);
23704 for (i = recog_data.n_operands - 1; i >= 0; --i)
23705 if (CONSTANT_P (recog_data.operand[i]))
23706 {
23707 enum attr_mode mode = get_attr_mode (insn);
23708
23709 gcc_assert (!len);
23710 if (shortform && CONST_INT_P (recog_data.operand[i]))
23711 {
23712 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23713 switch (mode)
23714 {
23715 case MODE_QI:
23716 len = 1;
23717 continue;
23718 case MODE_HI:
23719 ival = trunc_int_for_mode (ival, HImode);
23720 break;
23721 case MODE_SI:
23722 ival = trunc_int_for_mode (ival, SImode);
23723 break;
23724 default:
23725 break;
23726 }
23727 if (IN_RANGE (ival, -128, 127))
23728 {
23729 len = 1;
23730 continue;
23731 }
23732 }
23733 switch (mode)
23734 {
23735 case MODE_QI:
23736 len = 1;
23737 break;
23738 case MODE_HI:
23739 len = 2;
23740 break;
23741 case MODE_SI:
23742 len = 4;
23743 break;
23744 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23745 case MODE_DI:
23746 len = 4;
23747 break;
23748 default:
23749 fatal_insn ("unknown insn mode", insn);
23750 }
23751 }
23752 return len;
23753 }
23754 /* Compute default value for "length_address" attribute. */
23755 int
23756 ix86_attr_length_address_default (rtx insn)
23757 {
23758 int i;
23759
23760 if (get_attr_type (insn) == TYPE_LEA)
23761 {
23762 rtx set = PATTERN (insn), addr;
23763
23764 if (GET_CODE (set) == PARALLEL)
23765 set = XVECEXP (set, 0, 0);
23766
23767 gcc_assert (GET_CODE (set) == SET);
23768
23769 addr = SET_SRC (set);
23770 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23771 {
23772 if (GET_CODE (addr) == ZERO_EXTEND)
23773 addr = XEXP (addr, 0);
23774 if (GET_CODE (addr) == SUBREG)
23775 addr = SUBREG_REG (addr);
23776 }
23777
23778 return memory_address_length (addr);
23779 }
23780
23781 extract_insn_cached (insn);
23782 for (i = recog_data.n_operands - 1; i >= 0; --i)
23783 if (MEM_P (recog_data.operand[i]))
23784 {
23785 constrain_operands_cached (reload_completed);
23786 if (which_alternative != -1)
23787 {
23788 const char *constraints = recog_data.constraints[i];
23789 int alt = which_alternative;
23790
23791 while (*constraints == '=' || *constraints == '+')
23792 constraints++;
23793 while (alt-- > 0)
23794 while (*constraints++ != ',')
23795 ;
23796 /* Skip ignored operands. */
23797 if (*constraints == 'X')
23798 continue;
23799 }
23800 return memory_address_length (XEXP (recog_data.operand[i], 0));
23801 }
23802 return 0;
23803 }
23804
23805 /* Compute default value for "length_vex" attribute. It includes
23806 2 or 3 byte VEX prefix and 1 opcode byte. */
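/* For background (illustrative): the 2-byte VEX prefix (0xC5) can only
   encode the implied 0F opcode map, VEX.W = 0 and the inverted REX.R bit;
   anything needing the 0F38/0F3A maps, VEX.W = 1, or the REX.X/REX.B bits
   (extended index or base registers) must use the 3-byte VEX prefix (0xC4).
   That is what the checks below approximate.  */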
23807
23808 int
23809 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23810 {
23811 int i;
23812
23813 /* Only the 0f opcode map can use the 2 byte VEX prefix; the VEX.W
23814 bit requires the 3 byte VEX prefix. */
23815 if (!has_0f_opcode || has_vex_w)
23816 return 3 + 1;
23817
23818 /* We can always use 2 byte VEX prefix in 32bit. */
23819 if (!TARGET_64BIT)
23820 return 2 + 1;
23821
23822 extract_insn_cached (insn);
23823
23824 for (i = recog_data.n_operands - 1; i >= 0; --i)
23825 if (REG_P (recog_data.operand[i]))
23826 {
23827 /* REX.W bit uses 3 byte VEX prefix. */
23828 if (GET_MODE (recog_data.operand[i]) == DImode
23829 && GENERAL_REG_P (recog_data.operand[i]))
23830 return 3 + 1;
23831 }
23832 else
23833 {
23834 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23835 if (MEM_P (recog_data.operand[i])
23836 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23837 return 3 + 1;
23838 }
23839
23840 return 2 + 1;
23841 }
23842 \f
23843 /* Return the maximum number of instructions a cpu can issue. */
23844
23845 static int
23846 ix86_issue_rate (void)
23847 {
23848 switch (ix86_tune)
23849 {
23850 case PROCESSOR_PENTIUM:
23851 case PROCESSOR_ATOM:
23852 case PROCESSOR_K6:
23853 case PROCESSOR_BTVER2:
23854 return 2;
23855
23856 case PROCESSOR_PENTIUMPRO:
23857 case PROCESSOR_PENTIUM4:
23858 case PROCESSOR_CORE2_32:
23859 case PROCESSOR_CORE2_64:
23860 case PROCESSOR_COREI7_32:
23861 case PROCESSOR_COREI7_64:
23862 case PROCESSOR_ATHLON:
23863 case PROCESSOR_K8:
23864 case PROCESSOR_AMDFAM10:
23865 case PROCESSOR_NOCONA:
23866 case PROCESSOR_GENERIC32:
23867 case PROCESSOR_GENERIC64:
23868 case PROCESSOR_BDVER1:
23869 case PROCESSOR_BDVER2:
23870 case PROCESSOR_BTVER1:
23871 return 3;
23872
23873 default:
23874 return 1;
23875 }
23876 }
23877
23878 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23879 set by DEP_INSN and nothing else set by DEP_INSN. */
23880
23881 static bool
23882 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23883 {
23884 rtx set, set2;
23885
23886 /* Simplify the test for uninteresting insns. */
23887 if (insn_type != TYPE_SETCC
23888 && insn_type != TYPE_ICMOV
23889 && insn_type != TYPE_FCMOV
23890 && insn_type != TYPE_IBR)
23891 return false;
23892
23893 if ((set = single_set (dep_insn)) != 0)
23894 {
23895 set = SET_DEST (set);
23896 set2 = NULL_RTX;
23897 }
23898 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23899 && XVECLEN (PATTERN (dep_insn), 0) == 2
23900 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23901 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23902 {
23903 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23904 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23905 }
23906 else
23907 return false;
23908
23909 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23910 return false;
23911
23912 /* This test is true if the dependent insn reads the flags but
23913 not any other potentially set register. */
23914 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23915 return false;
23916
23917 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23918 return false;
23919
23920 return true;
23921 }
23922
23923 /* Return true iff USE_INSN has a memory address with operands set by
23924 SET_INSN. */
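/* For example (as used by the Pentium tuning in ix86_adjust_cost below),
   in the sequence
       movl $buf, %eax
       movl (%eax), %edx
   the load's address depends on a register written by the previous insn,
   which causes an address generation interlock on Pentium.  */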
23925
23926 bool
23927 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23928 {
23929 int i;
23930 extract_insn_cached (use_insn);
23931 for (i = recog_data.n_operands - 1; i >= 0; --i)
23932 if (MEM_P (recog_data.operand[i]))
23933 {
23934 rtx addr = XEXP (recog_data.operand[i], 0);
23935 return modified_in_p (addr, set_insn) != 0;
23936 }
23937 return false;
23938 }
23939
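/* Used as the TARGET_SCHED_ADJUST_COST hook: adjust the scheduler's
   estimate COST of the dependence LINK from DEP_INSN to INSN according
   to the current tuning (ix86_tune).  */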
23940 static int
23941 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23942 {
23943 enum attr_type insn_type, dep_insn_type;
23944 enum attr_memory memory;
23945 rtx set, set2;
23946 int dep_insn_code_number;
23947
23948 /* Anti and output dependencies have zero cost on all CPUs. */
23949 if (REG_NOTE_KIND (link) != 0)
23950 return 0;
23951
23952 dep_insn_code_number = recog_memoized (dep_insn);
23953
23954 /* If we can't recognize the insns, we can't really do anything. */
23955 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23956 return cost;
23957
23958 insn_type = get_attr_type (insn);
23959 dep_insn_type = get_attr_type (dep_insn);
23960
23961 switch (ix86_tune)
23962 {
23963 case PROCESSOR_PENTIUM:
23964 /* Address Generation Interlock adds a cycle of latency. */
23965 if (insn_type == TYPE_LEA)
23966 {
23967 rtx addr = PATTERN (insn);
23968
23969 if (GET_CODE (addr) == PARALLEL)
23970 addr = XVECEXP (addr, 0, 0);
23971
23972 gcc_assert (GET_CODE (addr) == SET);
23973
23974 addr = SET_SRC (addr);
23975 if (modified_in_p (addr, dep_insn))
23976 cost += 1;
23977 }
23978 else if (ix86_agi_dependent (dep_insn, insn))
23979 cost += 1;
23980
23981 /* ??? Compares pair with jump/setcc. */
23982 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23983 cost = 0;
23984
23985 /* Floating point stores require value to be ready one cycle earlier. */
23986 if (insn_type == TYPE_FMOV
23987 && get_attr_memory (insn) == MEMORY_STORE
23988 && !ix86_agi_dependent (dep_insn, insn))
23989 cost += 1;
23990 break;
23991
23992 case PROCESSOR_PENTIUMPRO:
23993 memory = get_attr_memory (insn);
23994
23995 /* INT->FP conversion is expensive. */
23996 if (get_attr_fp_int_src (dep_insn))
23997 cost += 5;
23998
23999 /* There is one cycle extra latency between an FP op and a store. */
24000 if (insn_type == TYPE_FMOV
24001 && (set = single_set (dep_insn)) != NULL_RTX
24002 && (set2 = single_set (insn)) != NULL_RTX
24003 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24004 && MEM_P (SET_DEST (set2)))
24005 cost += 1;
24006
24007 /* Account for the ability of the reorder buffer to hide the latency of a
24008 load by executing it in parallel with the previous instruction, when
24009 the previous instruction is not needed to compute the address. */
24010 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24011 && !ix86_agi_dependent (dep_insn, insn))
24012 {
24013 /* Claim moves take one cycle, as the core can issue one load
24014 at a time and the next load can start a cycle later. */
24015 if (dep_insn_type == TYPE_IMOV
24016 || dep_insn_type == TYPE_FMOV)
24017 cost = 1;
24018 else if (cost > 1)
24019 cost--;
24020 }
24021 break;
24022
24023 case PROCESSOR_K6:
24024 memory = get_attr_memory (insn);
24025
24026 /* The esp dependency is resolved before the instruction is really
24027 finished. */
24028 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24029 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24030 return 1;
24031
24032 /* INT->FP conversion is expensive. */
24033 if (get_attr_fp_int_src (dep_insn))
24034 cost += 5;
24035
24036 /* Account for the ability of the reorder buffer to hide the latency of a
24037 load by executing it in parallel with the previous instruction, when
24038 the previous instruction is not needed to compute the address. */
24039 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24040 && !ix86_agi_dependent (dep_insn, insn))
24041 {
24042 /* Claim moves take one cycle, as the core can issue one load
24043 at a time and the next load can start a cycle later. */
24044 if (dep_insn_type == TYPE_IMOV
24045 || dep_insn_type == TYPE_FMOV)
24046 cost = 1;
24047 else if (cost > 2)
24048 cost -= 2;
24049 else
24050 cost = 1;
24051 }
24052 break;
24053
24054 case PROCESSOR_ATHLON:
24055 case PROCESSOR_K8:
24056 case PROCESSOR_AMDFAM10:
24057 case PROCESSOR_BDVER1:
24058 case PROCESSOR_BDVER2:
24059 case PROCESSOR_BTVER1:
24060 case PROCESSOR_BTVER2:
24061 case PROCESSOR_ATOM:
24062 case PROCESSOR_GENERIC32:
24063 case PROCESSOR_GENERIC64:
24064 memory = get_attr_memory (insn);
24065
24066 /* Account for the ability of the reorder buffer to hide the latency of a
24067 load by executing it in parallel with the previous instruction, when
24068 the previous instruction is not needed to compute the address. */
24069 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24070 && !ix86_agi_dependent (dep_insn, insn))
24071 {
24072 enum attr_unit unit = get_attr_unit (insn);
24073 int loadcost = 3;
24074
24075 /* Because of the difference between the length of integer and
24076 floating unit pipeline preparation stages, the memory operands
24077 for floating point are cheaper.
24078
24079 ??? For Athlon the difference is most probably 2. */
24080 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24081 loadcost = 3;
24082 else
24083 loadcost = TARGET_ATHLON ? 2 : 0;
24084
24085 if (cost >= loadcost)
24086 cost -= loadcost;
24087 else
24088 cost = 0;
24089 }
24090
24091 default:
24092 break;
24093 }
24094
24095 return cost;
24096 }
24097
24098 /* How many alternative schedules to try. This should be as wide as the
24099 scheduling freedom in the DFA, but no wider. Making this value too
24100 large results in extra work for the scheduler. */
24101
24102 static int
24103 ia32_multipass_dfa_lookahead (void)
24104 {
24105 switch (ix86_tune)
24106 {
24107 case PROCESSOR_PENTIUM:
24108 return 2;
24109
24110 case PROCESSOR_PENTIUMPRO:
24111 case PROCESSOR_K6:
24112 return 1;
24113
24114 case PROCESSOR_CORE2_32:
24115 case PROCESSOR_CORE2_64:
24116 case PROCESSOR_COREI7_32:
24117 case PROCESSOR_COREI7_64:
24118 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24119 as the number of instructions that can be executed in one cycle,
24120 i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */
24121 return ix86_issue_rate ();
24122
24123 default:
24124 return 0;
24125 }
24126 }
24127
24128 /* Try to reorder the ready list to take advantage of Atom's pipelined
24129 IMUL execution. The reordering is applied if
24130 (1) an IMUL instruction is at the top of the ready list, and
24131 (2) the ready list contains exactly one producer of an independent
24132 IMUL instruction;
24133 in that case that producer is moved to the top of the ready list.
24134 Returns the issue rate. */
24135
24136 static int
24137 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24138 int clock_var ATTRIBUTE_UNUSED)
24139 {
24140 static int issue_rate = -1;
24141 int n_ready = *pn_ready;
24142 rtx insn, insn1, insn2;
24143 int i;
24144 sd_iterator_def sd_it;
24145 dep_t dep;
24146 int index = -1;
24147
24148 /* Set up issue rate. */
24149 issue_rate = ix86_issue_rate ();
24150
24151 /* Do reordering for Atom only. */
24152 if (ix86_tune != PROCESSOR_ATOM)
24153 return issue_rate;
24154 /* Nothing to do if ready list contains only 1 instruction. */
24155 if (n_ready <= 1)
24156 return issue_rate;
24157
24158 /* Check that IMUL instruction is on the top of ready list. */
24159 insn = ready[n_ready - 1];
24160 if (!NONDEBUG_INSN_P (insn))
24161 return issue_rate;
24162 insn = PATTERN (insn);
24163 if (GET_CODE (insn) == PARALLEL)
24164 insn = XVECEXP (insn, 0, 0);
24165 if (GET_CODE (insn) != SET)
24166 return issue_rate;
24167 if (!(GET_CODE (SET_SRC (insn)) == MULT
24168 && GET_MODE (SET_SRC (insn)) == SImode))
24169 return issue_rate;
24170
24171 /* Search for producer of independent IMUL instruction. */
24172 for (i = n_ready - 2; i >= 0; i--)
24173 {
24174 insn = ready[i];
24175 if (!NONDEBUG_INSN_P (insn))
24176 continue;
24177 /* Skip IMUL instruction. */
24178 insn2 = PATTERN (insn);
24179 if (GET_CODE (insn2) == PARALLEL)
24180 insn2 = XVECEXP (insn2, 0, 0);
24181 if (GET_CODE (insn2) == SET
24182 && GET_CODE (SET_SRC (insn2)) == MULT
24183 && GET_MODE (SET_SRC (insn2)) == SImode)
24184 continue;
24185
24186 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24187 {
24188 rtx con;
24189 con = DEP_CON (dep);
24190 if (!NONDEBUG_INSN_P (con))
24191 continue;
24192 insn1 = PATTERN (con);
24193 if (GET_CODE (insn1) == PARALLEL)
24194 insn1 = XVECEXP (insn1, 0, 0);
24195
24196 if (GET_CODE (insn1) == SET
24197 && GET_CODE (SET_SRC (insn1)) == MULT
24198 && GET_MODE (SET_SRC (insn1)) == SImode)
24199 {
24200 sd_iterator_def sd_it1;
24201 dep_t dep1;
24202 /* Check that INSN is the only producer this IMUL depends on. */
24203 index = i;
24204 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24205 {
24206 rtx pro;
24207 pro = DEP_PRO (dep1);
24208 if (!NONDEBUG_INSN_P (pro))
24209 continue;
24210 if (pro != insn)
24211 index = -1;
24212 }
24213 if (index >= 0)
24214 break;
24215 }
24216 }
24217 if (index >= 0)
24218 break;
24219 }
24220 if (index < 0)
24221 return issue_rate; /* Didn't find IMUL producer. */
24222
24223 if (sched_verbose > 1)
24224 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24225 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24226
24227 /* Put IMUL producer (ready[index]) at the top of ready list. */
24228 insn1 = ready[index];
24229 for (i = index; i < n_ready - 1; i++)
24230 ready[i] = ready[i + 1];
24231 ready[n_ready - 1] = insn1;
24232
24233 return issue_rate;
24234 }
24235
24236 \f
24237
24238 /* Model the decoder of Core 2/i7.
24239 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24240 track the instruction fetch block boundaries and make sure that long
24241 (9+ byte) instructions are assigned to decoder D0. */
24242
24243 /* Maximum length of an insn that can be handled by
24244 a secondary decoder unit. '8' for Core 2/i7. */
24245 static int core2i7_secondary_decoder_max_insn_size;
24246
24247 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24248 '16' for Core 2/i7. */
24249 static int core2i7_ifetch_block_size;
24250
24251 /* Maximum number of instructions decoder can handle per cycle.
24252 '6' for Core 2/i7. */
24253 static int core2i7_ifetch_block_max_insns;
24254
24255 typedef struct ix86_first_cycle_multipass_data_ *
24256 ix86_first_cycle_multipass_data_t;
24257 typedef const struct ix86_first_cycle_multipass_data_ *
24258 const_ix86_first_cycle_multipass_data_t;
24259
24260 /* A variable to store target state across calls to max_issue within
24261 one cycle. */
24262 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24263 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24264
24265 /* Initialize DATA. */
24266 static void
24267 core2i7_first_cycle_multipass_init (void *_data)
24268 {
24269 ix86_first_cycle_multipass_data_t data
24270 = (ix86_first_cycle_multipass_data_t) _data;
24271
24272 data->ifetch_block_len = 0;
24273 data->ifetch_block_n_insns = 0;
24274 data->ready_try_change = NULL;
24275 data->ready_try_change_size = 0;
24276 }
24277
24278 /* Advancing the cycle; reset ifetch block counts. */
24279 static void
24280 core2i7_dfa_post_advance_cycle (void)
24281 {
24282 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24283
24284 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24285
24286 data->ifetch_block_len = 0;
24287 data->ifetch_block_n_insns = 0;
24288 }
24289
24290 static int min_insn_size (rtx);
24291
24292 /* Filter out insns from ready_try that the core will not be able to issue
24293 on current cycle due to decoder. */
24294 static void
24295 core2i7_first_cycle_multipass_filter_ready_try
24296 (const_ix86_first_cycle_multipass_data_t data,
24297 char *ready_try, int n_ready, bool first_cycle_insn_p)
24298 {
24299 while (n_ready--)
24300 {
24301 rtx insn;
24302 int insn_size;
24303
24304 if (ready_try[n_ready])
24305 continue;
24306
24307 insn = get_ready_element (n_ready);
24308 insn_size = min_insn_size (insn);
24309
24310 if (/* If this insn is too long for a secondary decoder ... */
24311 (!first_cycle_insn_p
24312 && insn_size > core2i7_secondary_decoder_max_insn_size)
24313 /* ... or it would not fit into the ifetch block ... */
24314 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24315 /* ... or the decoder is full already ... */
24316 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24317 /* ... mask the insn out. */
24318 {
24319 ready_try[n_ready] = 1;
24320
24321 if (data->ready_try_change)
24322 SET_BIT (data->ready_try_change, n_ready);
24323 }
24324 }
24325 }
24326
24327 /* Prepare for a new round of multipass lookahead scheduling. */
24328 static void
24329 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24330 bool first_cycle_insn_p)
24331 {
24332 ix86_first_cycle_multipass_data_t data
24333 = (ix86_first_cycle_multipass_data_t) _data;
24334 const_ix86_first_cycle_multipass_data_t prev_data
24335 = ix86_first_cycle_multipass_data;
24336
24337 /* Restore the state from the end of the previous round. */
24338 data->ifetch_block_len = prev_data->ifetch_block_len;
24339 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24340
24341 /* Filter instructions that cannot be issued on current cycle due to
24342 decoder restrictions. */
24343 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24344 first_cycle_insn_p);
24345 }
24346
24347 /* INSN is being issued in current solution. Account for its impact on
24348 the decoder model. */
24349 static void
24350 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24351 rtx insn, const void *_prev_data)
24352 {
24353 ix86_first_cycle_multipass_data_t data
24354 = (ix86_first_cycle_multipass_data_t) _data;
24355 const_ix86_first_cycle_multipass_data_t prev_data
24356 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24357
24358 int insn_size = min_insn_size (insn);
24359
24360 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24361 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24362 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24363 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24364
24365 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24366 if (!data->ready_try_change)
24367 {
24368 data->ready_try_change = sbitmap_alloc (n_ready);
24369 data->ready_try_change_size = n_ready;
24370 }
24371 else if (data->ready_try_change_size < n_ready)
24372 {
24373 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24374 n_ready, 0);
24375 data->ready_try_change_size = n_ready;
24376 }
24377 sbitmap_zero (data->ready_try_change);
24378
24379 /* Filter out insns from ready_try that the core will not be able to issue
24380 on current cycle due to decoder. */
24381 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24382 false);
24383 }
24384
24385 /* Revert the effect on ready_try. */
24386 static void
24387 core2i7_first_cycle_multipass_backtrack (const void *_data,
24388 char *ready_try,
24389 int n_ready ATTRIBUTE_UNUSED)
24390 {
24391 const_ix86_first_cycle_multipass_data_t data
24392 = (const_ix86_first_cycle_multipass_data_t) _data;
24393 unsigned int i = 0;
24394 sbitmap_iterator sbi;
24395
24396 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24397 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24398 {
24399 ready_try[i] = 0;
24400 }
24401 }
24402
24403 /* Save the result of multipass lookahead scheduling for the next round. */
24404 static void
24405 core2i7_first_cycle_multipass_end (const void *_data)
24406 {
24407 const_ix86_first_cycle_multipass_data_t data
24408 = (const_ix86_first_cycle_multipass_data_t) _data;
24409 ix86_first_cycle_multipass_data_t next_data
24410 = ix86_first_cycle_multipass_data;
24411
24412 if (data != NULL)
24413 {
24414 next_data->ifetch_block_len = data->ifetch_block_len;
24415 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24416 }
24417 }
24418
24419 /* Deallocate target data. */
24420 static void
24421 core2i7_first_cycle_multipass_fini (void *_data)
24422 {
24423 ix86_first_cycle_multipass_data_t data
24424 = (ix86_first_cycle_multipass_data_t) _data;
24425
24426 if (data->ready_try_change)
24427 {
24428 sbitmap_free (data->ready_try_change);
24429 data->ready_try_change = NULL;
24430 data->ready_try_change_size = 0;
24431 }
24432 }
24433
24434 /* Prepare for scheduling pass. */
24435 static void
24436 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24437 int verbose ATTRIBUTE_UNUSED,
24438 int max_uid ATTRIBUTE_UNUSED)
24439 {
24440 /* Install scheduling hooks for current CPU. Some of these hooks are used
24441 in time-critical parts of the scheduler, so we only set them up when
24442 they are actually used. */
24443 switch (ix86_tune)
24444 {
24445 case PROCESSOR_CORE2_32:
24446 case PROCESSOR_CORE2_64:
24447 case PROCESSOR_COREI7_32:
24448 case PROCESSOR_COREI7_64:
24449 targetm.sched.dfa_post_advance_cycle
24450 = core2i7_dfa_post_advance_cycle;
24451 targetm.sched.first_cycle_multipass_init
24452 = core2i7_first_cycle_multipass_init;
24453 targetm.sched.first_cycle_multipass_begin
24454 = core2i7_first_cycle_multipass_begin;
24455 targetm.sched.first_cycle_multipass_issue
24456 = core2i7_first_cycle_multipass_issue;
24457 targetm.sched.first_cycle_multipass_backtrack
24458 = core2i7_first_cycle_multipass_backtrack;
24459 targetm.sched.first_cycle_multipass_end
24460 = core2i7_first_cycle_multipass_end;
24461 targetm.sched.first_cycle_multipass_fini
24462 = core2i7_first_cycle_multipass_fini;
24463
24464 /* Set decoder parameters. */
24465 core2i7_secondary_decoder_max_insn_size = 8;
24466 core2i7_ifetch_block_size = 16;
24467 core2i7_ifetch_block_max_insns = 6;
24468 break;
24469
24470 default:
24471 targetm.sched.dfa_post_advance_cycle = NULL;
24472 targetm.sched.first_cycle_multipass_init = NULL;
24473 targetm.sched.first_cycle_multipass_begin = NULL;
24474 targetm.sched.first_cycle_multipass_issue = NULL;
24475 targetm.sched.first_cycle_multipass_backtrack = NULL;
24476 targetm.sched.first_cycle_multipass_end = NULL;
24477 targetm.sched.first_cycle_multipass_fini = NULL;
24478 break;
24479 }
24480 }
24481
24482 \f
24483 /* Compute the alignment given to a constant that is being placed in memory.
24484 EXP is the constant and ALIGN is the alignment that the object would
24485 ordinarily have.
24486 The value of this function is used instead of that alignment to align
24487 the object. */
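/* For illustration: a DFmode (double) constant is given at least 64-bit
   alignment, constants whose mode satisfies ALIGN_MODE_128 (e.g. 128-bit
   vector constants) at least 128-bit alignment, and, when not optimizing
   for size, string constants of 31 or more bytes at least BITS_PER_WORD
   alignment.  */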
24488
24489 int
24490 ix86_constant_alignment (tree exp, int align)
24491 {
24492 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24493 || TREE_CODE (exp) == INTEGER_CST)
24494 {
24495 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24496 return 64;
24497 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24498 return 128;
24499 }
24500 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24501 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24502 return BITS_PER_WORD;
24503
24504 return align;
24505 }
24506
24507 /* Compute the alignment for a static variable.
24508 TYPE is the data type, and ALIGN is the alignment that
24509 the object would ordinarily have. The value of this function is used
24510 instead of that alignment to align the object. */
24511
24512 int
24513 ix86_data_alignment (tree type, int align)
24514 {
24515 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24516
24517 if (AGGREGATE_TYPE_P (type)
24518 && TYPE_SIZE (type)
24519 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24520 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24521 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24522 && align < max_align)
24523 align = max_align;
24524
24525 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24526 to a 16-byte boundary. */
24527 if (TARGET_64BIT)
24528 {
24529 if (AGGREGATE_TYPE_P (type)
24530 && TYPE_SIZE (type)
24531 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24532 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24533 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24534 return 128;
24535 }
24536
24537 if (TREE_CODE (type) == ARRAY_TYPE)
24538 {
24539 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24540 return 64;
24541 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24542 return 128;
24543 }
24544 else if (TREE_CODE (type) == COMPLEX_TYPE)
24545 {
24546
24547 if (TYPE_MODE (type) == DCmode && align < 64)
24548 return 64;
24549 if ((TYPE_MODE (type) == XCmode
24550 || TYPE_MODE (type) == TCmode) && align < 128)
24551 return 128;
24552 }
24553 else if ((TREE_CODE (type) == RECORD_TYPE
24554 || TREE_CODE (type) == UNION_TYPE
24555 || TREE_CODE (type) == QUAL_UNION_TYPE)
24556 && TYPE_FIELDS (type))
24557 {
24558 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24559 return 64;
24560 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24561 return 128;
24562 }
24563 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24564 || TREE_CODE (type) == INTEGER_TYPE)
24565 {
24566 if (TYPE_MODE (type) == DFmode && align < 64)
24567 return 64;
24568 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24569 return 128;
24570 }
24571
24572 return align;
24573 }
24574
24575 /* Compute the alignment for a local variable or a stack slot. EXP is
24576 the data type or decl itself, MODE is the widest mode available and
24577 ALIGN is the alignment that the object would ordinarily have. The
24578 value of this macro is used instead of that alignment to align the
24579 object. */
24580
24581 unsigned int
24582 ix86_local_alignment (tree exp, enum machine_mode mode,
24583 unsigned int align)
24584 {
24585 tree type, decl;
24586
24587 if (exp && DECL_P (exp))
24588 {
24589 type = TREE_TYPE (exp);
24590 decl = exp;
24591 }
24592 else
24593 {
24594 type = exp;
24595 decl = NULL;
24596 }
24597
24598 /* Don't do dynamic stack realignment for long long objects with
24599 -mpreferred-stack-boundary=2. */
24600 if (!TARGET_64BIT
24601 && align == 64
24602 && ix86_preferred_stack_boundary < 64
24603 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24604 && (!type || !TYPE_USER_ALIGN (type))
24605 && (!decl || !DECL_USER_ALIGN (decl)))
24606 align = 32;
24607
24608 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24609 register in MODE. We will return the largest alignment of XF
24610 and DF. */
24611 if (!type)
24612 {
24613 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24614 align = GET_MODE_ALIGNMENT (DFmode);
24615 return align;
24616 }
24617
24618 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24619 to a 16-byte boundary. The exact wording is:
24620 
24621 An array uses the same alignment as its elements, except that a local or
24622 global array variable of length at least 16 bytes or
24623 a C99 variable-length array variable always has alignment of at least 16 bytes.
24624 
24625 This was added to allow use of aligned SSE instructions on arrays. The
24626 rule is meant for static storage (where the compiler cannot do the
24627 analysis by itself). We follow it for automatic variables only when
24628 convenient: we fully control everything in the compiled function, and
24629 functions from other units cannot rely on the alignment.
24630 
24631 Exclude the va_list type. It is the common case of a local array where
24632 we cannot benefit from the alignment. */
24633 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24634 && TARGET_SSE)
24635 {
24636 if (AGGREGATE_TYPE_P (type)
24637 && (va_list_type_node == NULL_TREE
24638 || (TYPE_MAIN_VARIANT (type)
24639 != TYPE_MAIN_VARIANT (va_list_type_node)))
24640 && TYPE_SIZE (type)
24641 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24642 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24643 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24644 return 128;
24645 }
24646 if (TREE_CODE (type) == ARRAY_TYPE)
24647 {
24648 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24649 return 64;
24650 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24651 return 128;
24652 }
24653 else if (TREE_CODE (type) == COMPLEX_TYPE)
24654 {
24655 if (TYPE_MODE (type) == DCmode && align < 64)
24656 return 64;
24657 if ((TYPE_MODE (type) == XCmode
24658 || TYPE_MODE (type) == TCmode) && align < 128)
24659 return 128;
24660 }
24661 else if ((TREE_CODE (type) == RECORD_TYPE
24662 || TREE_CODE (type) == UNION_TYPE
24663 || TREE_CODE (type) == QUAL_UNION_TYPE)
24664 && TYPE_FIELDS (type))
24665 {
24666 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24667 return 64;
24668 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24669 return 128;
24670 }
24671 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24672 || TREE_CODE (type) == INTEGER_TYPE)
24673 {
24674
24675 if (TYPE_MODE (type) == DFmode && align < 64)
24676 return 64;
24677 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24678 return 128;
24679 }
24680 return align;
24681 }
24682
24683 /* Compute the minimum required alignment for dynamic stack realignment
24684 purposes for a local variable, parameter or a stack slot. EXP is
24685 the data type or decl itself, MODE is its mode and ALIGN is the
24686 alignment that the object would ordinarily have. */
24687
24688 unsigned int
24689 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24690 unsigned int align)
24691 {
24692 tree type, decl;
24693
24694 if (exp && DECL_P (exp))
24695 {
24696 type = TREE_TYPE (exp);
24697 decl = exp;
24698 }
24699 else
24700 {
24701 type = exp;
24702 decl = NULL;
24703 }
24704
24705 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24706 return align;
24707
24708 /* Don't do dynamic stack realignment for long long objects with
24709 -mpreferred-stack-boundary=2. */
24710 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24711 && (!type || !TYPE_USER_ALIGN (type))
24712 && (!decl || !DECL_USER_ALIGN (decl)))
24713 return 32;
24714
24715 return align;
24716 }
24717 \f
24718 /* Find a location for the static chain incoming to a nested function.
24719 This is a register, unless all free registers are used by arguments. */
24720
24721 static rtx
24722 ix86_static_chain (const_tree fndecl, bool incoming_p)
24723 {
24724 unsigned regno;
24725
24726 if (!DECL_STATIC_CHAIN (fndecl))
24727 return NULL;
24728
24729 if (TARGET_64BIT)
24730 {
24731 /* We always use R10 in 64-bit mode. */
24732 regno = R10_REG;
24733 }
24734 else
24735 {
24736 tree fntype;
24737 unsigned int ccvt;
24738
24739 /* By default in 32-bit mode we use ECX to pass the static chain. */
24740 regno = CX_REG;
24741
24742 fntype = TREE_TYPE (fndecl);
24743 ccvt = ix86_get_callcvt (fntype);
24744 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24745 {
24746 /* Fastcall functions use ecx/edx for arguments, which leaves
24747 us with EAX for the static chain.
24748 Thiscall functions use ecx for arguments, which also
24749 leaves us with EAX for the static chain. */
24750 regno = AX_REG;
24751 }
24752 else if (ix86_function_regparm (fntype, fndecl) == 3)
24753 {
24754 /* For regparm 3, we have no free call-clobbered registers in
24755 which to store the static chain. In order to implement this,
24756 we have the trampoline push the static chain to the stack.
24757 However, we can't push a value below the return address when
24758 we call the nested function directly, so we have to use an
24759 alternate entry point. For this we use ESI, and have the
24760 alternate entry point push ESI, so that things appear the
24761 same once we're executing the nested function. */
24762 if (incoming_p)
24763 {
24764 if (fndecl == current_function_decl)
24765 ix86_static_chain_on_stack = true;
24766 return gen_frame_mem (SImode,
24767 plus_constant (Pmode,
24768 arg_pointer_rtx, -8));
24769 }
24770 regno = SI_REG;
24771 }
24772 }
24773
24774 return gen_rtx_REG (Pmode, regno);
24775 }
24776
24777 /* Emit RTL insns to initialize the variable parts of a trampoline.
24778 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24779 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24780 to be passed to the target function. */
24781
24782 static void
24783 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24784 {
24785 rtx mem, fnaddr;
24786 int opcode;
24787 int offset = 0;
24788
24789 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24790
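 /* For illustration, with ptr_mode == DImode the 64-bit trampoline
    written below is 24 bytes:
        49 bb <imm64>   movabs $fnaddr, %r11
        49 ba <imm64>   movabs $chain,  %r10
        49 ff e3        jmp *%r11
        90              nop (pads the final write to a full 32-bit store)
    When ptr_mode == SImode, or FNADDR fits in 32 bits, the shorter movl
    forms (41 bb / 41 ba <imm32>) are used instead.  */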
24791 if (TARGET_64BIT)
24792 {
24793 int size;
24794
24795 /* Load the function address into r11. Try to load the address using
24796 the shorter movl instead of movabs. We may want to support
24797 movq for kernel mode, but the kernel does not use trampolines at
24798 the moment. FNADDR is a 32bit address and may not be in
24799 DImode when ptr_mode == SImode. Always use movl in this
24800 case. */
24801 if (ptr_mode == SImode
24802 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24803 {
24804 fnaddr = copy_addr_to_reg (fnaddr);
24805
24806 mem = adjust_address (m_tramp, HImode, offset);
24807 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24808
24809 mem = adjust_address (m_tramp, SImode, offset + 2);
24810 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24811 offset += 6;
24812 }
24813 else
24814 {
24815 mem = adjust_address (m_tramp, HImode, offset);
24816 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24817
24818 mem = adjust_address (m_tramp, DImode, offset + 2);
24819 emit_move_insn (mem, fnaddr);
24820 offset += 10;
24821 }
24822
24823 /* Load static chain using movabs to r10. Use the shorter movl
24824 instead of movabs when ptr_mode == SImode. */
24825 if (ptr_mode == SImode)
24826 {
24827 opcode = 0xba41;
24828 size = 6;
24829 }
24830 else
24831 {
24832 opcode = 0xba49;
24833 size = 10;
24834 }
24835
24836 mem = adjust_address (m_tramp, HImode, offset);
24837 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24838
24839 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24840 emit_move_insn (mem, chain_value);
24841 offset += size;
24842
24843 /* Jump to r11; the last (unused) byte is a nop, only there to
24844 pad the write out to a single 32-bit store. */
24845 mem = adjust_address (m_tramp, SImode, offset);
24846 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24847 offset += 4;
24848 }
24849 else
24850 {
24851 rtx disp, chain;
24852
24853 /* Depending on the static chain location, either load a register
24854 with a constant, or push the constant to the stack. All of the
24855 instructions are the same size. */
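 /* For illustration, the 32-bit trampoline written below is 10 bytes:
        b9 <imm32>   movl $chain, %ecx   (b8 for %eax, or 68 to push it)
        e9 <rel32>   jmp fnaddr
    with rel32 computed relative to the end of the jmp, adjusted by one
    byte to skip the callee's initial 1-byte push when the static chain
    is passed on the stack.  */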
24856 chain = ix86_static_chain (fndecl, true);
24857 if (REG_P (chain))
24858 {
24859 switch (REGNO (chain))
24860 {
24861 case AX_REG:
24862 opcode = 0xb8; break;
24863 case CX_REG:
24864 opcode = 0xb9; break;
24865 default:
24866 gcc_unreachable ();
24867 }
24868 }
24869 else
24870 opcode = 0x68;
24871
24872 mem = adjust_address (m_tramp, QImode, offset);
24873 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24874
24875 mem = adjust_address (m_tramp, SImode, offset + 1);
24876 emit_move_insn (mem, chain_value);
24877 offset += 5;
24878
24879 mem = adjust_address (m_tramp, QImode, offset);
24880 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24881
24882 mem = adjust_address (m_tramp, SImode, offset + 1);
24883
24884 /* Compute offset from the end of the jmp to the target function.
24885 In the case in which the trampoline stores the static chain on
24886 the stack, we need to skip the first insn which pushes the
24887 (call-saved) register static chain; this push is 1 byte. */
24888 offset += 5;
24889 disp = expand_binop (SImode, sub_optab, fnaddr,
24890 plus_constant (Pmode, XEXP (m_tramp, 0),
24891 offset - (MEM_P (chain) ? 1 : 0)),
24892 NULL_RTX, 1, OPTAB_DIRECT);
24893 emit_move_insn (mem, disp);
24894 }
24895
24896 gcc_assert (offset <= TRAMPOLINE_SIZE);
24897
24898 #ifdef HAVE_ENABLE_EXECUTE_STACK
24899 #ifdef CHECK_EXECUTE_STACK_ENABLED
24900 if (CHECK_EXECUTE_STACK_ENABLED)
24901 #endif
24902 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24903 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24904 #endif
24905 }
24906 \f
24907 /* The following file contains several enumerations and data structures
24908 built from the definitions in i386-builtin-types.def. */
24909
24910 #include "i386-builtin-types.inc"
24911
24912 /* Table for the ix86 builtin non-function types. */
24913 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24914
24915 /* Retrieve an element from the above table, building some of
24916 the types lazily. */
24917
24918 static tree
24919 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24920 {
24921 unsigned int index;
24922 tree type, itype;
24923
24924 gcc_assert ((unsigned) tcode < ARRAY_SIZE (ix86_builtin_type_tab));
24925
24926 type = ix86_builtin_type_tab[(int) tcode];
24927 if (type != NULL)
24928 return type;
24929
24930 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24931 if (tcode <= IX86_BT_LAST_VECT)
24932 {
24933 enum machine_mode mode;
24934
24935 index = tcode - IX86_BT_LAST_PRIM - 1;
24936 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24937 mode = ix86_builtin_type_vect_mode[index];
24938
24939 type = build_vector_type_for_mode (itype, mode);
24940 }
24941 else
24942 {
24943 int quals;
24944
24945 index = tcode - IX86_BT_LAST_VECT - 1;
24946 if (tcode <= IX86_BT_LAST_PTR)
24947 quals = TYPE_UNQUALIFIED;
24948 else
24949 quals = TYPE_QUAL_CONST;
24950
24951 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24952 if (quals != TYPE_UNQUALIFIED)
24953 itype = build_qualified_type (itype, quals);
24954
24955 type = build_pointer_type (itype);
24956 }
24957
24958 ix86_builtin_type_tab[(int) tcode] = type;
24959 return type;
24960 }
24961
24962 /* Table for the ix86 builtin function types. */
24963 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24964
24965 /* Retrieve an element from the above table, building some of
24966 the types lazily. */
24967
24968 static tree
24969 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24970 {
24971 tree type;
24972
24973 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24974
24975 type = ix86_builtin_func_type_tab[(int) tcode];
24976 if (type != NULL)
24977 return type;
24978
24979 if (tcode <= IX86_BT_LAST_FUNC)
24980 {
24981 unsigned start = ix86_builtin_func_start[(int) tcode];
24982 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24983 tree rtype, atype, args = void_list_node;
24984 unsigned i;
24985
24986 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24987 for (i = after - 1; i > start; --i)
24988 {
24989 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24990 args = tree_cons (NULL, atype, args);
24991 }
24992
24993 type = build_function_type (rtype, args);
24994 }
24995 else
24996 {
24997 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24998 enum ix86_builtin_func_type icode;
24999
25000 icode = ix86_builtin_func_alias_base[index];
25001 type = ix86_get_builtin_func_type (icode);
25002 }
25003
25004 ix86_builtin_func_type_tab[(int) tcode] = type;
25005 return type;
25006 }
25007
25008
25009 /* Codes for all the SSE/MMX builtins. */
25010 enum ix86_builtins
25011 {
25012 IX86_BUILTIN_ADDPS,
25013 IX86_BUILTIN_ADDSS,
25014 IX86_BUILTIN_DIVPS,
25015 IX86_BUILTIN_DIVSS,
25016 IX86_BUILTIN_MULPS,
25017 IX86_BUILTIN_MULSS,
25018 IX86_BUILTIN_SUBPS,
25019 IX86_BUILTIN_SUBSS,
25020
25021 IX86_BUILTIN_CMPEQPS,
25022 IX86_BUILTIN_CMPLTPS,
25023 IX86_BUILTIN_CMPLEPS,
25024 IX86_BUILTIN_CMPGTPS,
25025 IX86_BUILTIN_CMPGEPS,
25026 IX86_BUILTIN_CMPNEQPS,
25027 IX86_BUILTIN_CMPNLTPS,
25028 IX86_BUILTIN_CMPNLEPS,
25029 IX86_BUILTIN_CMPNGTPS,
25030 IX86_BUILTIN_CMPNGEPS,
25031 IX86_BUILTIN_CMPORDPS,
25032 IX86_BUILTIN_CMPUNORDPS,
25033 IX86_BUILTIN_CMPEQSS,
25034 IX86_BUILTIN_CMPLTSS,
25035 IX86_BUILTIN_CMPLESS,
25036 IX86_BUILTIN_CMPNEQSS,
25037 IX86_BUILTIN_CMPNLTSS,
25038 IX86_BUILTIN_CMPNLESS,
25039 IX86_BUILTIN_CMPNGTSS,
25040 IX86_BUILTIN_CMPNGESS,
25041 IX86_BUILTIN_CMPORDSS,
25042 IX86_BUILTIN_CMPUNORDSS,
25043
25044 IX86_BUILTIN_COMIEQSS,
25045 IX86_BUILTIN_COMILTSS,
25046 IX86_BUILTIN_COMILESS,
25047 IX86_BUILTIN_COMIGTSS,
25048 IX86_BUILTIN_COMIGESS,
25049 IX86_BUILTIN_COMINEQSS,
25050 IX86_BUILTIN_UCOMIEQSS,
25051 IX86_BUILTIN_UCOMILTSS,
25052 IX86_BUILTIN_UCOMILESS,
25053 IX86_BUILTIN_UCOMIGTSS,
25054 IX86_BUILTIN_UCOMIGESS,
25055 IX86_BUILTIN_UCOMINEQSS,
25056
25057 IX86_BUILTIN_CVTPI2PS,
25058 IX86_BUILTIN_CVTPS2PI,
25059 IX86_BUILTIN_CVTSI2SS,
25060 IX86_BUILTIN_CVTSI642SS,
25061 IX86_BUILTIN_CVTSS2SI,
25062 IX86_BUILTIN_CVTSS2SI64,
25063 IX86_BUILTIN_CVTTPS2PI,
25064 IX86_BUILTIN_CVTTSS2SI,
25065 IX86_BUILTIN_CVTTSS2SI64,
25066
25067 IX86_BUILTIN_MAXPS,
25068 IX86_BUILTIN_MAXSS,
25069 IX86_BUILTIN_MINPS,
25070 IX86_BUILTIN_MINSS,
25071
25072 IX86_BUILTIN_LOADUPS,
25073 IX86_BUILTIN_STOREUPS,
25074 IX86_BUILTIN_MOVSS,
25075
25076 IX86_BUILTIN_MOVHLPS,
25077 IX86_BUILTIN_MOVLHPS,
25078 IX86_BUILTIN_LOADHPS,
25079 IX86_BUILTIN_LOADLPS,
25080 IX86_BUILTIN_STOREHPS,
25081 IX86_BUILTIN_STORELPS,
25082
25083 IX86_BUILTIN_MASKMOVQ,
25084 IX86_BUILTIN_MOVMSKPS,
25085 IX86_BUILTIN_PMOVMSKB,
25086
25087 IX86_BUILTIN_MOVNTPS,
25088 IX86_BUILTIN_MOVNTQ,
25089
25090 IX86_BUILTIN_LOADDQU,
25091 IX86_BUILTIN_STOREDQU,
25092
25093 IX86_BUILTIN_PACKSSWB,
25094 IX86_BUILTIN_PACKSSDW,
25095 IX86_BUILTIN_PACKUSWB,
25096
25097 IX86_BUILTIN_PADDB,
25098 IX86_BUILTIN_PADDW,
25099 IX86_BUILTIN_PADDD,
25100 IX86_BUILTIN_PADDQ,
25101 IX86_BUILTIN_PADDSB,
25102 IX86_BUILTIN_PADDSW,
25103 IX86_BUILTIN_PADDUSB,
25104 IX86_BUILTIN_PADDUSW,
25105 IX86_BUILTIN_PSUBB,
25106 IX86_BUILTIN_PSUBW,
25107 IX86_BUILTIN_PSUBD,
25108 IX86_BUILTIN_PSUBQ,
25109 IX86_BUILTIN_PSUBSB,
25110 IX86_BUILTIN_PSUBSW,
25111 IX86_BUILTIN_PSUBUSB,
25112 IX86_BUILTIN_PSUBUSW,
25113
25114 IX86_BUILTIN_PAND,
25115 IX86_BUILTIN_PANDN,
25116 IX86_BUILTIN_POR,
25117 IX86_BUILTIN_PXOR,
25118
25119 IX86_BUILTIN_PAVGB,
25120 IX86_BUILTIN_PAVGW,
25121
25122 IX86_BUILTIN_PCMPEQB,
25123 IX86_BUILTIN_PCMPEQW,
25124 IX86_BUILTIN_PCMPEQD,
25125 IX86_BUILTIN_PCMPGTB,
25126 IX86_BUILTIN_PCMPGTW,
25127 IX86_BUILTIN_PCMPGTD,
25128
25129 IX86_BUILTIN_PMADDWD,
25130
25131 IX86_BUILTIN_PMAXSW,
25132 IX86_BUILTIN_PMAXUB,
25133 IX86_BUILTIN_PMINSW,
25134 IX86_BUILTIN_PMINUB,
25135
25136 IX86_BUILTIN_PMULHUW,
25137 IX86_BUILTIN_PMULHW,
25138 IX86_BUILTIN_PMULLW,
25139
25140 IX86_BUILTIN_PSADBW,
25141 IX86_BUILTIN_PSHUFW,
25142
25143 IX86_BUILTIN_PSLLW,
25144 IX86_BUILTIN_PSLLD,
25145 IX86_BUILTIN_PSLLQ,
25146 IX86_BUILTIN_PSRAW,
25147 IX86_BUILTIN_PSRAD,
25148 IX86_BUILTIN_PSRLW,
25149 IX86_BUILTIN_PSRLD,
25150 IX86_BUILTIN_PSRLQ,
25151 IX86_BUILTIN_PSLLWI,
25152 IX86_BUILTIN_PSLLDI,
25153 IX86_BUILTIN_PSLLQI,
25154 IX86_BUILTIN_PSRAWI,
25155 IX86_BUILTIN_PSRADI,
25156 IX86_BUILTIN_PSRLWI,
25157 IX86_BUILTIN_PSRLDI,
25158 IX86_BUILTIN_PSRLQI,
25159
25160 IX86_BUILTIN_PUNPCKHBW,
25161 IX86_BUILTIN_PUNPCKHWD,
25162 IX86_BUILTIN_PUNPCKHDQ,
25163 IX86_BUILTIN_PUNPCKLBW,
25164 IX86_BUILTIN_PUNPCKLWD,
25165 IX86_BUILTIN_PUNPCKLDQ,
25166
25167 IX86_BUILTIN_SHUFPS,
25168
25169 IX86_BUILTIN_RCPPS,
25170 IX86_BUILTIN_RCPSS,
25171 IX86_BUILTIN_RSQRTPS,
25172 IX86_BUILTIN_RSQRTPS_NR,
25173 IX86_BUILTIN_RSQRTSS,
25174 IX86_BUILTIN_RSQRTF,
25175 IX86_BUILTIN_SQRTPS,
25176 IX86_BUILTIN_SQRTPS_NR,
25177 IX86_BUILTIN_SQRTSS,
25178
25179 IX86_BUILTIN_UNPCKHPS,
25180 IX86_BUILTIN_UNPCKLPS,
25181
25182 IX86_BUILTIN_ANDPS,
25183 IX86_BUILTIN_ANDNPS,
25184 IX86_BUILTIN_ORPS,
25185 IX86_BUILTIN_XORPS,
25186
25187 IX86_BUILTIN_EMMS,
25188 IX86_BUILTIN_LDMXCSR,
25189 IX86_BUILTIN_STMXCSR,
25190 IX86_BUILTIN_SFENCE,
25191
25192 /* 3DNow! Original */
25193 IX86_BUILTIN_FEMMS,
25194 IX86_BUILTIN_PAVGUSB,
25195 IX86_BUILTIN_PF2ID,
25196 IX86_BUILTIN_PFACC,
25197 IX86_BUILTIN_PFADD,
25198 IX86_BUILTIN_PFCMPEQ,
25199 IX86_BUILTIN_PFCMPGE,
25200 IX86_BUILTIN_PFCMPGT,
25201 IX86_BUILTIN_PFMAX,
25202 IX86_BUILTIN_PFMIN,
25203 IX86_BUILTIN_PFMUL,
25204 IX86_BUILTIN_PFRCP,
25205 IX86_BUILTIN_PFRCPIT1,
25206 IX86_BUILTIN_PFRCPIT2,
25207 IX86_BUILTIN_PFRSQIT1,
25208 IX86_BUILTIN_PFRSQRT,
25209 IX86_BUILTIN_PFSUB,
25210 IX86_BUILTIN_PFSUBR,
25211 IX86_BUILTIN_PI2FD,
25212 IX86_BUILTIN_PMULHRW,
25213
25214 /* 3DNow! Athlon Extensions */
25215 IX86_BUILTIN_PF2IW,
25216 IX86_BUILTIN_PFNACC,
25217 IX86_BUILTIN_PFPNACC,
25218 IX86_BUILTIN_PI2FW,
25219 IX86_BUILTIN_PSWAPDSI,
25220 IX86_BUILTIN_PSWAPDSF,
25221
25222 /* SSE2 */
25223 IX86_BUILTIN_ADDPD,
25224 IX86_BUILTIN_ADDSD,
25225 IX86_BUILTIN_DIVPD,
25226 IX86_BUILTIN_DIVSD,
25227 IX86_BUILTIN_MULPD,
25228 IX86_BUILTIN_MULSD,
25229 IX86_BUILTIN_SUBPD,
25230 IX86_BUILTIN_SUBSD,
25231
25232 IX86_BUILTIN_CMPEQPD,
25233 IX86_BUILTIN_CMPLTPD,
25234 IX86_BUILTIN_CMPLEPD,
25235 IX86_BUILTIN_CMPGTPD,
25236 IX86_BUILTIN_CMPGEPD,
25237 IX86_BUILTIN_CMPNEQPD,
25238 IX86_BUILTIN_CMPNLTPD,
25239 IX86_BUILTIN_CMPNLEPD,
25240 IX86_BUILTIN_CMPNGTPD,
25241 IX86_BUILTIN_CMPNGEPD,
25242 IX86_BUILTIN_CMPORDPD,
25243 IX86_BUILTIN_CMPUNORDPD,
25244 IX86_BUILTIN_CMPEQSD,
25245 IX86_BUILTIN_CMPLTSD,
25246 IX86_BUILTIN_CMPLESD,
25247 IX86_BUILTIN_CMPNEQSD,
25248 IX86_BUILTIN_CMPNLTSD,
25249 IX86_BUILTIN_CMPNLESD,
25250 IX86_BUILTIN_CMPORDSD,
25251 IX86_BUILTIN_CMPUNORDSD,
25252
25253 IX86_BUILTIN_COMIEQSD,
25254 IX86_BUILTIN_COMILTSD,
25255 IX86_BUILTIN_COMILESD,
25256 IX86_BUILTIN_COMIGTSD,
25257 IX86_BUILTIN_COMIGESD,
25258 IX86_BUILTIN_COMINEQSD,
25259 IX86_BUILTIN_UCOMIEQSD,
25260 IX86_BUILTIN_UCOMILTSD,
25261 IX86_BUILTIN_UCOMILESD,
25262 IX86_BUILTIN_UCOMIGTSD,
25263 IX86_BUILTIN_UCOMIGESD,
25264 IX86_BUILTIN_UCOMINEQSD,
25265
25266 IX86_BUILTIN_MAXPD,
25267 IX86_BUILTIN_MAXSD,
25268 IX86_BUILTIN_MINPD,
25269 IX86_BUILTIN_MINSD,
25270
25271 IX86_BUILTIN_ANDPD,
25272 IX86_BUILTIN_ANDNPD,
25273 IX86_BUILTIN_ORPD,
25274 IX86_BUILTIN_XORPD,
25275
25276 IX86_BUILTIN_SQRTPD,
25277 IX86_BUILTIN_SQRTSD,
25278
25279 IX86_BUILTIN_UNPCKHPD,
25280 IX86_BUILTIN_UNPCKLPD,
25281
25282 IX86_BUILTIN_SHUFPD,
25283
25284 IX86_BUILTIN_LOADUPD,
25285 IX86_BUILTIN_STOREUPD,
25286 IX86_BUILTIN_MOVSD,
25287
25288 IX86_BUILTIN_LOADHPD,
25289 IX86_BUILTIN_LOADLPD,
25290
25291 IX86_BUILTIN_CVTDQ2PD,
25292 IX86_BUILTIN_CVTDQ2PS,
25293
25294 IX86_BUILTIN_CVTPD2DQ,
25295 IX86_BUILTIN_CVTPD2PI,
25296 IX86_BUILTIN_CVTPD2PS,
25297 IX86_BUILTIN_CVTTPD2DQ,
25298 IX86_BUILTIN_CVTTPD2PI,
25299
25300 IX86_BUILTIN_CVTPI2PD,
25301 IX86_BUILTIN_CVTSI2SD,
25302 IX86_BUILTIN_CVTSI642SD,
25303
25304 IX86_BUILTIN_CVTSD2SI,
25305 IX86_BUILTIN_CVTSD2SI64,
25306 IX86_BUILTIN_CVTSD2SS,
25307 IX86_BUILTIN_CVTSS2SD,
25308 IX86_BUILTIN_CVTTSD2SI,
25309 IX86_BUILTIN_CVTTSD2SI64,
25310
25311 IX86_BUILTIN_CVTPS2DQ,
25312 IX86_BUILTIN_CVTPS2PD,
25313 IX86_BUILTIN_CVTTPS2DQ,
25314
25315 IX86_BUILTIN_MOVNTI,
25316 IX86_BUILTIN_MOVNTI64,
25317 IX86_BUILTIN_MOVNTPD,
25318 IX86_BUILTIN_MOVNTDQ,
25319
25320 IX86_BUILTIN_MOVQ128,
25321
25322 /* SSE2 MMX */
25323 IX86_BUILTIN_MASKMOVDQU,
25324 IX86_BUILTIN_MOVMSKPD,
25325 IX86_BUILTIN_PMOVMSKB128,
25326
25327 IX86_BUILTIN_PACKSSWB128,
25328 IX86_BUILTIN_PACKSSDW128,
25329 IX86_BUILTIN_PACKUSWB128,
25330
25331 IX86_BUILTIN_PADDB128,
25332 IX86_BUILTIN_PADDW128,
25333 IX86_BUILTIN_PADDD128,
25334 IX86_BUILTIN_PADDQ128,
25335 IX86_BUILTIN_PADDSB128,
25336 IX86_BUILTIN_PADDSW128,
25337 IX86_BUILTIN_PADDUSB128,
25338 IX86_BUILTIN_PADDUSW128,
25339 IX86_BUILTIN_PSUBB128,
25340 IX86_BUILTIN_PSUBW128,
25341 IX86_BUILTIN_PSUBD128,
25342 IX86_BUILTIN_PSUBQ128,
25343 IX86_BUILTIN_PSUBSB128,
25344 IX86_BUILTIN_PSUBSW128,
25345 IX86_BUILTIN_PSUBUSB128,
25346 IX86_BUILTIN_PSUBUSW128,
25347
25348 IX86_BUILTIN_PAND128,
25349 IX86_BUILTIN_PANDN128,
25350 IX86_BUILTIN_POR128,
25351 IX86_BUILTIN_PXOR128,
25352
25353 IX86_BUILTIN_PAVGB128,
25354 IX86_BUILTIN_PAVGW128,
25355
25356 IX86_BUILTIN_PCMPEQB128,
25357 IX86_BUILTIN_PCMPEQW128,
25358 IX86_BUILTIN_PCMPEQD128,
25359 IX86_BUILTIN_PCMPGTB128,
25360 IX86_BUILTIN_PCMPGTW128,
25361 IX86_BUILTIN_PCMPGTD128,
25362
25363 IX86_BUILTIN_PMADDWD128,
25364
25365 IX86_BUILTIN_PMAXSW128,
25366 IX86_BUILTIN_PMAXUB128,
25367 IX86_BUILTIN_PMINSW128,
25368 IX86_BUILTIN_PMINUB128,
25369
25370 IX86_BUILTIN_PMULUDQ,
25371 IX86_BUILTIN_PMULUDQ128,
25372 IX86_BUILTIN_PMULHUW128,
25373 IX86_BUILTIN_PMULHW128,
25374 IX86_BUILTIN_PMULLW128,
25375
25376 IX86_BUILTIN_PSADBW128,
25377 IX86_BUILTIN_PSHUFHW,
25378 IX86_BUILTIN_PSHUFLW,
25379 IX86_BUILTIN_PSHUFD,
25380
25381 IX86_BUILTIN_PSLLDQI128,
25382 IX86_BUILTIN_PSLLWI128,
25383 IX86_BUILTIN_PSLLDI128,
25384 IX86_BUILTIN_PSLLQI128,
25385 IX86_BUILTIN_PSRAWI128,
25386 IX86_BUILTIN_PSRADI128,
25387 IX86_BUILTIN_PSRLDQI128,
25388 IX86_BUILTIN_PSRLWI128,
25389 IX86_BUILTIN_PSRLDI128,
25390 IX86_BUILTIN_PSRLQI128,
25391
25392 IX86_BUILTIN_PSLLDQ128,
25393 IX86_BUILTIN_PSLLW128,
25394 IX86_BUILTIN_PSLLD128,
25395 IX86_BUILTIN_PSLLQ128,
25396 IX86_BUILTIN_PSRAW128,
25397 IX86_BUILTIN_PSRAD128,
25398 IX86_BUILTIN_PSRLW128,
25399 IX86_BUILTIN_PSRLD128,
25400 IX86_BUILTIN_PSRLQ128,
25401
25402 IX86_BUILTIN_PUNPCKHBW128,
25403 IX86_BUILTIN_PUNPCKHWD128,
25404 IX86_BUILTIN_PUNPCKHDQ128,
25405 IX86_BUILTIN_PUNPCKHQDQ128,
25406 IX86_BUILTIN_PUNPCKLBW128,
25407 IX86_BUILTIN_PUNPCKLWD128,
25408 IX86_BUILTIN_PUNPCKLDQ128,
25409 IX86_BUILTIN_PUNPCKLQDQ128,
25410
25411 IX86_BUILTIN_CLFLUSH,
25412 IX86_BUILTIN_MFENCE,
25413 IX86_BUILTIN_LFENCE,
25414 IX86_BUILTIN_PAUSE,
25415
25416 IX86_BUILTIN_BSRSI,
25417 IX86_BUILTIN_BSRDI,
25418 IX86_BUILTIN_RDPMC,
25419 IX86_BUILTIN_RDTSC,
25420 IX86_BUILTIN_RDTSCP,
25421 IX86_BUILTIN_ROLQI,
25422 IX86_BUILTIN_ROLHI,
25423 IX86_BUILTIN_RORQI,
25424 IX86_BUILTIN_RORHI,
25425
25426 /* SSE3. */
25427 IX86_BUILTIN_ADDSUBPS,
25428 IX86_BUILTIN_HADDPS,
25429 IX86_BUILTIN_HSUBPS,
25430 IX86_BUILTIN_MOVSHDUP,
25431 IX86_BUILTIN_MOVSLDUP,
25432 IX86_BUILTIN_ADDSUBPD,
25433 IX86_BUILTIN_HADDPD,
25434 IX86_BUILTIN_HSUBPD,
25435 IX86_BUILTIN_LDDQU,
25436
25437 IX86_BUILTIN_MONITOR,
25438 IX86_BUILTIN_MWAIT,
25439
25440 /* SSSE3. */
25441 IX86_BUILTIN_PHADDW,
25442 IX86_BUILTIN_PHADDD,
25443 IX86_BUILTIN_PHADDSW,
25444 IX86_BUILTIN_PHSUBW,
25445 IX86_BUILTIN_PHSUBD,
25446 IX86_BUILTIN_PHSUBSW,
25447 IX86_BUILTIN_PMADDUBSW,
25448 IX86_BUILTIN_PMULHRSW,
25449 IX86_BUILTIN_PSHUFB,
25450 IX86_BUILTIN_PSIGNB,
25451 IX86_BUILTIN_PSIGNW,
25452 IX86_BUILTIN_PSIGND,
25453 IX86_BUILTIN_PALIGNR,
25454 IX86_BUILTIN_PABSB,
25455 IX86_BUILTIN_PABSW,
25456 IX86_BUILTIN_PABSD,
25457
25458 IX86_BUILTIN_PHADDW128,
25459 IX86_BUILTIN_PHADDD128,
25460 IX86_BUILTIN_PHADDSW128,
25461 IX86_BUILTIN_PHSUBW128,
25462 IX86_BUILTIN_PHSUBD128,
25463 IX86_BUILTIN_PHSUBSW128,
25464 IX86_BUILTIN_PMADDUBSW128,
25465 IX86_BUILTIN_PMULHRSW128,
25466 IX86_BUILTIN_PSHUFB128,
25467 IX86_BUILTIN_PSIGNB128,
25468 IX86_BUILTIN_PSIGNW128,
25469 IX86_BUILTIN_PSIGND128,
25470 IX86_BUILTIN_PALIGNR128,
25471 IX86_BUILTIN_PABSB128,
25472 IX86_BUILTIN_PABSW128,
25473 IX86_BUILTIN_PABSD128,
25474
25475 /* AMDFAM10 - SSE4A New Instructions. */
25476 IX86_BUILTIN_MOVNTSD,
25477 IX86_BUILTIN_MOVNTSS,
25478 IX86_BUILTIN_EXTRQI,
25479 IX86_BUILTIN_EXTRQ,
25480 IX86_BUILTIN_INSERTQI,
25481 IX86_BUILTIN_INSERTQ,
25482
25483 /* SSE4.1. */
25484 IX86_BUILTIN_BLENDPD,
25485 IX86_BUILTIN_BLENDPS,
25486 IX86_BUILTIN_BLENDVPD,
25487 IX86_BUILTIN_BLENDVPS,
25488 IX86_BUILTIN_PBLENDVB128,
25489 IX86_BUILTIN_PBLENDW128,
25490
25491 IX86_BUILTIN_DPPD,
25492 IX86_BUILTIN_DPPS,
25493
25494 IX86_BUILTIN_INSERTPS128,
25495
25496 IX86_BUILTIN_MOVNTDQA,
25497 IX86_BUILTIN_MPSADBW128,
25498 IX86_BUILTIN_PACKUSDW128,
25499 IX86_BUILTIN_PCMPEQQ,
25500 IX86_BUILTIN_PHMINPOSUW128,
25501
25502 IX86_BUILTIN_PMAXSB128,
25503 IX86_BUILTIN_PMAXSD128,
25504 IX86_BUILTIN_PMAXUD128,
25505 IX86_BUILTIN_PMAXUW128,
25506
25507 IX86_BUILTIN_PMINSB128,
25508 IX86_BUILTIN_PMINSD128,
25509 IX86_BUILTIN_PMINUD128,
25510 IX86_BUILTIN_PMINUW128,
25511
25512 IX86_BUILTIN_PMOVSXBW128,
25513 IX86_BUILTIN_PMOVSXBD128,
25514 IX86_BUILTIN_PMOVSXBQ128,
25515 IX86_BUILTIN_PMOVSXWD128,
25516 IX86_BUILTIN_PMOVSXWQ128,
25517 IX86_BUILTIN_PMOVSXDQ128,
25518
25519 IX86_BUILTIN_PMOVZXBW128,
25520 IX86_BUILTIN_PMOVZXBD128,
25521 IX86_BUILTIN_PMOVZXBQ128,
25522 IX86_BUILTIN_PMOVZXWD128,
25523 IX86_BUILTIN_PMOVZXWQ128,
25524 IX86_BUILTIN_PMOVZXDQ128,
25525
25526 IX86_BUILTIN_PMULDQ128,
25527 IX86_BUILTIN_PMULLD128,
25528
25529 IX86_BUILTIN_ROUNDSD,
25530 IX86_BUILTIN_ROUNDSS,
25531
25532 IX86_BUILTIN_ROUNDPD,
25533 IX86_BUILTIN_ROUNDPS,
25534
25535 IX86_BUILTIN_FLOORPD,
25536 IX86_BUILTIN_CEILPD,
25537 IX86_BUILTIN_TRUNCPD,
25538 IX86_BUILTIN_RINTPD,
25539 IX86_BUILTIN_ROUNDPD_AZ,
25540
25541 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25542 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25543 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25544
25545 IX86_BUILTIN_FLOORPS,
25546 IX86_BUILTIN_CEILPS,
25547 IX86_BUILTIN_TRUNCPS,
25548 IX86_BUILTIN_RINTPS,
25549 IX86_BUILTIN_ROUNDPS_AZ,
25550
25551 IX86_BUILTIN_FLOORPS_SFIX,
25552 IX86_BUILTIN_CEILPS_SFIX,
25553 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25554
25555 IX86_BUILTIN_PTESTZ,
25556 IX86_BUILTIN_PTESTC,
25557 IX86_BUILTIN_PTESTNZC,
25558
25559 IX86_BUILTIN_VEC_INIT_V2SI,
25560 IX86_BUILTIN_VEC_INIT_V4HI,
25561 IX86_BUILTIN_VEC_INIT_V8QI,
25562 IX86_BUILTIN_VEC_EXT_V2DF,
25563 IX86_BUILTIN_VEC_EXT_V2DI,
25564 IX86_BUILTIN_VEC_EXT_V4SF,
25565 IX86_BUILTIN_VEC_EXT_V4SI,
25566 IX86_BUILTIN_VEC_EXT_V8HI,
25567 IX86_BUILTIN_VEC_EXT_V2SI,
25568 IX86_BUILTIN_VEC_EXT_V4HI,
25569 IX86_BUILTIN_VEC_EXT_V16QI,
25570 IX86_BUILTIN_VEC_SET_V2DI,
25571 IX86_BUILTIN_VEC_SET_V4SF,
25572 IX86_BUILTIN_VEC_SET_V4SI,
25573 IX86_BUILTIN_VEC_SET_V8HI,
25574 IX86_BUILTIN_VEC_SET_V4HI,
25575 IX86_BUILTIN_VEC_SET_V16QI,
25576
25577 IX86_BUILTIN_VEC_PACK_SFIX,
25578 IX86_BUILTIN_VEC_PACK_SFIX256,
25579
25580 /* SSE4.2. */
25581 IX86_BUILTIN_CRC32QI,
25582 IX86_BUILTIN_CRC32HI,
25583 IX86_BUILTIN_CRC32SI,
25584 IX86_BUILTIN_CRC32DI,
25585
25586 IX86_BUILTIN_PCMPESTRI128,
25587 IX86_BUILTIN_PCMPESTRM128,
25588 IX86_BUILTIN_PCMPESTRA128,
25589 IX86_BUILTIN_PCMPESTRC128,
25590 IX86_BUILTIN_PCMPESTRO128,
25591 IX86_BUILTIN_PCMPESTRS128,
25592 IX86_BUILTIN_PCMPESTRZ128,
25593 IX86_BUILTIN_PCMPISTRI128,
25594 IX86_BUILTIN_PCMPISTRM128,
25595 IX86_BUILTIN_PCMPISTRA128,
25596 IX86_BUILTIN_PCMPISTRC128,
25597 IX86_BUILTIN_PCMPISTRO128,
25598 IX86_BUILTIN_PCMPISTRS128,
25599 IX86_BUILTIN_PCMPISTRZ128,
25600
25601 IX86_BUILTIN_PCMPGTQ,
25602
25603 /* AES instructions */
25604 IX86_BUILTIN_AESENC128,
25605 IX86_BUILTIN_AESENCLAST128,
25606 IX86_BUILTIN_AESDEC128,
25607 IX86_BUILTIN_AESDECLAST128,
25608 IX86_BUILTIN_AESIMC128,
25609 IX86_BUILTIN_AESKEYGENASSIST128,
25610
25611 /* PCLMUL instruction */
25612 IX86_BUILTIN_PCLMULQDQ128,
25613
25614 /* AVX */
25615 IX86_BUILTIN_ADDPD256,
25616 IX86_BUILTIN_ADDPS256,
25617 IX86_BUILTIN_ADDSUBPD256,
25618 IX86_BUILTIN_ADDSUBPS256,
25619 IX86_BUILTIN_ANDPD256,
25620 IX86_BUILTIN_ANDPS256,
25621 IX86_BUILTIN_ANDNPD256,
25622 IX86_BUILTIN_ANDNPS256,
25623 IX86_BUILTIN_BLENDPD256,
25624 IX86_BUILTIN_BLENDPS256,
25625 IX86_BUILTIN_BLENDVPD256,
25626 IX86_BUILTIN_BLENDVPS256,
25627 IX86_BUILTIN_DIVPD256,
25628 IX86_BUILTIN_DIVPS256,
25629 IX86_BUILTIN_DPPS256,
25630 IX86_BUILTIN_HADDPD256,
25631 IX86_BUILTIN_HADDPS256,
25632 IX86_BUILTIN_HSUBPD256,
25633 IX86_BUILTIN_HSUBPS256,
25634 IX86_BUILTIN_MAXPD256,
25635 IX86_BUILTIN_MAXPS256,
25636 IX86_BUILTIN_MINPD256,
25637 IX86_BUILTIN_MINPS256,
25638 IX86_BUILTIN_MULPD256,
25639 IX86_BUILTIN_MULPS256,
25640 IX86_BUILTIN_ORPD256,
25641 IX86_BUILTIN_ORPS256,
25642 IX86_BUILTIN_SHUFPD256,
25643 IX86_BUILTIN_SHUFPS256,
25644 IX86_BUILTIN_SUBPD256,
25645 IX86_BUILTIN_SUBPS256,
25646 IX86_BUILTIN_XORPD256,
25647 IX86_BUILTIN_XORPS256,
25648 IX86_BUILTIN_CMPSD,
25649 IX86_BUILTIN_CMPSS,
25650 IX86_BUILTIN_CMPPD,
25651 IX86_BUILTIN_CMPPS,
25652 IX86_BUILTIN_CMPPD256,
25653 IX86_BUILTIN_CMPPS256,
25654 IX86_BUILTIN_CVTDQ2PD256,
25655 IX86_BUILTIN_CVTDQ2PS256,
25656 IX86_BUILTIN_CVTPD2PS256,
25657 IX86_BUILTIN_CVTPS2DQ256,
25658 IX86_BUILTIN_CVTPS2PD256,
25659 IX86_BUILTIN_CVTTPD2DQ256,
25660 IX86_BUILTIN_CVTPD2DQ256,
25661 IX86_BUILTIN_CVTTPS2DQ256,
25662 IX86_BUILTIN_EXTRACTF128PD256,
25663 IX86_BUILTIN_EXTRACTF128PS256,
25664 IX86_BUILTIN_EXTRACTF128SI256,
25665 IX86_BUILTIN_VZEROALL,
25666 IX86_BUILTIN_VZEROUPPER,
25667 IX86_BUILTIN_VPERMILVARPD,
25668 IX86_BUILTIN_VPERMILVARPS,
25669 IX86_BUILTIN_VPERMILVARPD256,
25670 IX86_BUILTIN_VPERMILVARPS256,
25671 IX86_BUILTIN_VPERMILPD,
25672 IX86_BUILTIN_VPERMILPS,
25673 IX86_BUILTIN_VPERMILPD256,
25674 IX86_BUILTIN_VPERMILPS256,
25675 IX86_BUILTIN_VPERMIL2PD,
25676 IX86_BUILTIN_VPERMIL2PS,
25677 IX86_BUILTIN_VPERMIL2PD256,
25678 IX86_BUILTIN_VPERMIL2PS256,
25679 IX86_BUILTIN_VPERM2F128PD256,
25680 IX86_BUILTIN_VPERM2F128PS256,
25681 IX86_BUILTIN_VPERM2F128SI256,
25682 IX86_BUILTIN_VBROADCASTSS,
25683 IX86_BUILTIN_VBROADCASTSD256,
25684 IX86_BUILTIN_VBROADCASTSS256,
25685 IX86_BUILTIN_VBROADCASTPD256,
25686 IX86_BUILTIN_VBROADCASTPS256,
25687 IX86_BUILTIN_VINSERTF128PD256,
25688 IX86_BUILTIN_VINSERTF128PS256,
25689 IX86_BUILTIN_VINSERTF128SI256,
25690 IX86_BUILTIN_LOADUPD256,
25691 IX86_BUILTIN_LOADUPS256,
25692 IX86_BUILTIN_STOREUPD256,
25693 IX86_BUILTIN_STOREUPS256,
25694 IX86_BUILTIN_LDDQU256,
25695 IX86_BUILTIN_MOVNTDQ256,
25696 IX86_BUILTIN_MOVNTPD256,
25697 IX86_BUILTIN_MOVNTPS256,
25698 IX86_BUILTIN_LOADDQU256,
25699 IX86_BUILTIN_STOREDQU256,
25700 IX86_BUILTIN_MASKLOADPD,
25701 IX86_BUILTIN_MASKLOADPS,
25702 IX86_BUILTIN_MASKSTOREPD,
25703 IX86_BUILTIN_MASKSTOREPS,
25704 IX86_BUILTIN_MASKLOADPD256,
25705 IX86_BUILTIN_MASKLOADPS256,
25706 IX86_BUILTIN_MASKSTOREPD256,
25707 IX86_BUILTIN_MASKSTOREPS256,
25708 IX86_BUILTIN_MOVSHDUP256,
25709 IX86_BUILTIN_MOVSLDUP256,
25710 IX86_BUILTIN_MOVDDUP256,
25711
25712 IX86_BUILTIN_SQRTPD256,
25713 IX86_BUILTIN_SQRTPS256,
25714 IX86_BUILTIN_SQRTPS_NR256,
25715 IX86_BUILTIN_RSQRTPS256,
25716 IX86_BUILTIN_RSQRTPS_NR256,
25717
25718 IX86_BUILTIN_RCPPS256,
25719
25720 IX86_BUILTIN_ROUNDPD256,
25721 IX86_BUILTIN_ROUNDPS256,
25722
25723 IX86_BUILTIN_FLOORPD256,
25724 IX86_BUILTIN_CEILPD256,
25725 IX86_BUILTIN_TRUNCPD256,
25726 IX86_BUILTIN_RINTPD256,
25727 IX86_BUILTIN_ROUNDPD_AZ256,
25728
25729 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25730 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25731 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25732
25733 IX86_BUILTIN_FLOORPS256,
25734 IX86_BUILTIN_CEILPS256,
25735 IX86_BUILTIN_TRUNCPS256,
25736 IX86_BUILTIN_RINTPS256,
25737 IX86_BUILTIN_ROUNDPS_AZ256,
25738
25739 IX86_BUILTIN_FLOORPS_SFIX256,
25740 IX86_BUILTIN_CEILPS_SFIX256,
25741 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25742
25743 IX86_BUILTIN_UNPCKHPD256,
25744 IX86_BUILTIN_UNPCKLPD256,
25745 IX86_BUILTIN_UNPCKHPS256,
25746 IX86_BUILTIN_UNPCKLPS256,
25747
25748 IX86_BUILTIN_SI256_SI,
25749 IX86_BUILTIN_PS256_PS,
25750 IX86_BUILTIN_PD256_PD,
25751 IX86_BUILTIN_SI_SI256,
25752 IX86_BUILTIN_PS_PS256,
25753 IX86_BUILTIN_PD_PD256,
25754
25755 IX86_BUILTIN_VTESTZPD,
25756 IX86_BUILTIN_VTESTCPD,
25757 IX86_BUILTIN_VTESTNZCPD,
25758 IX86_BUILTIN_VTESTZPS,
25759 IX86_BUILTIN_VTESTCPS,
25760 IX86_BUILTIN_VTESTNZCPS,
25761 IX86_BUILTIN_VTESTZPD256,
25762 IX86_BUILTIN_VTESTCPD256,
25763 IX86_BUILTIN_VTESTNZCPD256,
25764 IX86_BUILTIN_VTESTZPS256,
25765 IX86_BUILTIN_VTESTCPS256,
25766 IX86_BUILTIN_VTESTNZCPS256,
25767 IX86_BUILTIN_PTESTZ256,
25768 IX86_BUILTIN_PTESTC256,
25769 IX86_BUILTIN_PTESTNZC256,
25770
25771 IX86_BUILTIN_MOVMSKPD256,
25772 IX86_BUILTIN_MOVMSKPS256,
25773
25774 /* AVX2 */
25775 IX86_BUILTIN_MPSADBW256,
25776 IX86_BUILTIN_PABSB256,
25777 IX86_BUILTIN_PABSW256,
25778 IX86_BUILTIN_PABSD256,
25779 IX86_BUILTIN_PACKSSDW256,
25780 IX86_BUILTIN_PACKSSWB256,
25781 IX86_BUILTIN_PACKUSDW256,
25782 IX86_BUILTIN_PACKUSWB256,
25783 IX86_BUILTIN_PADDB256,
25784 IX86_BUILTIN_PADDW256,
25785 IX86_BUILTIN_PADDD256,
25786 IX86_BUILTIN_PADDQ256,
25787 IX86_BUILTIN_PADDSB256,
25788 IX86_BUILTIN_PADDSW256,
25789 IX86_BUILTIN_PADDUSB256,
25790 IX86_BUILTIN_PADDUSW256,
25791 IX86_BUILTIN_PALIGNR256,
25792 IX86_BUILTIN_AND256I,
25793 IX86_BUILTIN_ANDNOT256I,
25794 IX86_BUILTIN_PAVGB256,
25795 IX86_BUILTIN_PAVGW256,
25796 IX86_BUILTIN_PBLENDVB256,
25797 IX86_BUILTIN_PBLENDVW256,
25798 IX86_BUILTIN_PCMPEQB256,
25799 IX86_BUILTIN_PCMPEQW256,
25800 IX86_BUILTIN_PCMPEQD256,
25801 IX86_BUILTIN_PCMPEQQ256,
25802 IX86_BUILTIN_PCMPGTB256,
25803 IX86_BUILTIN_PCMPGTW256,
25804 IX86_BUILTIN_PCMPGTD256,
25805 IX86_BUILTIN_PCMPGTQ256,
25806 IX86_BUILTIN_PHADDW256,
25807 IX86_BUILTIN_PHADDD256,
25808 IX86_BUILTIN_PHADDSW256,
25809 IX86_BUILTIN_PHSUBW256,
25810 IX86_BUILTIN_PHSUBD256,
25811 IX86_BUILTIN_PHSUBSW256,
25812 IX86_BUILTIN_PMADDUBSW256,
25813 IX86_BUILTIN_PMADDWD256,
25814 IX86_BUILTIN_PMAXSB256,
25815 IX86_BUILTIN_PMAXSW256,
25816 IX86_BUILTIN_PMAXSD256,
25817 IX86_BUILTIN_PMAXUB256,
25818 IX86_BUILTIN_PMAXUW256,
25819 IX86_BUILTIN_PMAXUD256,
25820 IX86_BUILTIN_PMINSB256,
25821 IX86_BUILTIN_PMINSW256,
25822 IX86_BUILTIN_PMINSD256,
25823 IX86_BUILTIN_PMINUB256,
25824 IX86_BUILTIN_PMINUW256,
25825 IX86_BUILTIN_PMINUD256,
25826 IX86_BUILTIN_PMOVMSKB256,
25827 IX86_BUILTIN_PMOVSXBW256,
25828 IX86_BUILTIN_PMOVSXBD256,
25829 IX86_BUILTIN_PMOVSXBQ256,
25830 IX86_BUILTIN_PMOVSXWD256,
25831 IX86_BUILTIN_PMOVSXWQ256,
25832 IX86_BUILTIN_PMOVSXDQ256,
25833 IX86_BUILTIN_PMOVZXBW256,
25834 IX86_BUILTIN_PMOVZXBD256,
25835 IX86_BUILTIN_PMOVZXBQ256,
25836 IX86_BUILTIN_PMOVZXWD256,
25837 IX86_BUILTIN_PMOVZXWQ256,
25838 IX86_BUILTIN_PMOVZXDQ256,
25839 IX86_BUILTIN_PMULDQ256,
25840 IX86_BUILTIN_PMULHRSW256,
25841 IX86_BUILTIN_PMULHUW256,
25842 IX86_BUILTIN_PMULHW256,
25843 IX86_BUILTIN_PMULLW256,
25844 IX86_BUILTIN_PMULLD256,
25845 IX86_BUILTIN_PMULUDQ256,
25846 IX86_BUILTIN_POR256,
25847 IX86_BUILTIN_PSADBW256,
25848 IX86_BUILTIN_PSHUFB256,
25849 IX86_BUILTIN_PSHUFD256,
25850 IX86_BUILTIN_PSHUFHW256,
25851 IX86_BUILTIN_PSHUFLW256,
25852 IX86_BUILTIN_PSIGNB256,
25853 IX86_BUILTIN_PSIGNW256,
25854 IX86_BUILTIN_PSIGND256,
25855 IX86_BUILTIN_PSLLDQI256,
25856 IX86_BUILTIN_PSLLWI256,
25857 IX86_BUILTIN_PSLLW256,
25858 IX86_BUILTIN_PSLLDI256,
25859 IX86_BUILTIN_PSLLD256,
25860 IX86_BUILTIN_PSLLQI256,
25861 IX86_BUILTIN_PSLLQ256,
25862 IX86_BUILTIN_PSRAWI256,
25863 IX86_BUILTIN_PSRAW256,
25864 IX86_BUILTIN_PSRADI256,
25865 IX86_BUILTIN_PSRAD256,
25866 IX86_BUILTIN_PSRLDQI256,
25867 IX86_BUILTIN_PSRLWI256,
25868 IX86_BUILTIN_PSRLW256,
25869 IX86_BUILTIN_PSRLDI256,
25870 IX86_BUILTIN_PSRLD256,
25871 IX86_BUILTIN_PSRLQI256,
25872 IX86_BUILTIN_PSRLQ256,
25873 IX86_BUILTIN_PSUBB256,
25874 IX86_BUILTIN_PSUBW256,
25875 IX86_BUILTIN_PSUBD256,
25876 IX86_BUILTIN_PSUBQ256,
25877 IX86_BUILTIN_PSUBSB256,
25878 IX86_BUILTIN_PSUBSW256,
25879 IX86_BUILTIN_PSUBUSB256,
25880 IX86_BUILTIN_PSUBUSW256,
25881 IX86_BUILTIN_PUNPCKHBW256,
25882 IX86_BUILTIN_PUNPCKHWD256,
25883 IX86_BUILTIN_PUNPCKHDQ256,
25884 IX86_BUILTIN_PUNPCKHQDQ256,
25885 IX86_BUILTIN_PUNPCKLBW256,
25886 IX86_BUILTIN_PUNPCKLWD256,
25887 IX86_BUILTIN_PUNPCKLDQ256,
25888 IX86_BUILTIN_PUNPCKLQDQ256,
25889 IX86_BUILTIN_PXOR256,
25890 IX86_BUILTIN_MOVNTDQA256,
25891 IX86_BUILTIN_VBROADCASTSS_PS,
25892 IX86_BUILTIN_VBROADCASTSS_PS256,
25893 IX86_BUILTIN_VBROADCASTSD_PD256,
25894 IX86_BUILTIN_VBROADCASTSI256,
25895 IX86_BUILTIN_PBLENDD256,
25896 IX86_BUILTIN_PBLENDD128,
25897 IX86_BUILTIN_PBROADCASTB256,
25898 IX86_BUILTIN_PBROADCASTW256,
25899 IX86_BUILTIN_PBROADCASTD256,
25900 IX86_BUILTIN_PBROADCASTQ256,
25901 IX86_BUILTIN_PBROADCASTB128,
25902 IX86_BUILTIN_PBROADCASTW128,
25903 IX86_BUILTIN_PBROADCASTD128,
25904 IX86_BUILTIN_PBROADCASTQ128,
25905 IX86_BUILTIN_VPERMVARSI256,
25906 IX86_BUILTIN_VPERMDF256,
25907 IX86_BUILTIN_VPERMVARSF256,
25908 IX86_BUILTIN_VPERMDI256,
25909 IX86_BUILTIN_VPERMTI256,
25910 IX86_BUILTIN_VEXTRACT128I256,
25911 IX86_BUILTIN_VINSERT128I256,
25912 IX86_BUILTIN_MASKLOADD,
25913 IX86_BUILTIN_MASKLOADQ,
25914 IX86_BUILTIN_MASKLOADD256,
25915 IX86_BUILTIN_MASKLOADQ256,
25916 IX86_BUILTIN_MASKSTORED,
25917 IX86_BUILTIN_MASKSTOREQ,
25918 IX86_BUILTIN_MASKSTORED256,
25919 IX86_BUILTIN_MASKSTOREQ256,
25920 IX86_BUILTIN_PSLLVV4DI,
25921 IX86_BUILTIN_PSLLVV2DI,
25922 IX86_BUILTIN_PSLLVV8SI,
25923 IX86_BUILTIN_PSLLVV4SI,
25924 IX86_BUILTIN_PSRAVV8SI,
25925 IX86_BUILTIN_PSRAVV4SI,
25926 IX86_BUILTIN_PSRLVV4DI,
25927 IX86_BUILTIN_PSRLVV2DI,
25928 IX86_BUILTIN_PSRLVV8SI,
25929 IX86_BUILTIN_PSRLVV4SI,
25930
25931 IX86_BUILTIN_GATHERSIV2DF,
25932 IX86_BUILTIN_GATHERSIV4DF,
25933 IX86_BUILTIN_GATHERDIV2DF,
25934 IX86_BUILTIN_GATHERDIV4DF,
25935 IX86_BUILTIN_GATHERSIV4SF,
25936 IX86_BUILTIN_GATHERSIV8SF,
25937 IX86_BUILTIN_GATHERDIV4SF,
25938 IX86_BUILTIN_GATHERDIV8SF,
25939 IX86_BUILTIN_GATHERSIV2DI,
25940 IX86_BUILTIN_GATHERSIV4DI,
25941 IX86_BUILTIN_GATHERDIV2DI,
25942 IX86_BUILTIN_GATHERDIV4DI,
25943 IX86_BUILTIN_GATHERSIV4SI,
25944 IX86_BUILTIN_GATHERSIV8SI,
25945 IX86_BUILTIN_GATHERDIV4SI,
25946 IX86_BUILTIN_GATHERDIV8SI,
25947
25948 /* Alternate 4-element gather builtins for the vectorizer, where
25949 all operands are 32-byte wide. */
25950 IX86_BUILTIN_GATHERALTSIV4DF,
25951 IX86_BUILTIN_GATHERALTDIV8SF,
25952 IX86_BUILTIN_GATHERALTSIV4DI,
25953 IX86_BUILTIN_GATHERALTDIV8SI,
25954
25955 /* TFmode support builtins. */
25956 IX86_BUILTIN_INFQ,
25957 IX86_BUILTIN_HUGE_VALQ,
25958 IX86_BUILTIN_FABSQ,
25959 IX86_BUILTIN_COPYSIGNQ,
25960
25961 /* Vectorizer support builtins. */
25962 IX86_BUILTIN_CPYSGNPS,
25963 IX86_BUILTIN_CPYSGNPD,
25964 IX86_BUILTIN_CPYSGNPS256,
25965 IX86_BUILTIN_CPYSGNPD256,
25966
25967 /* FMA4 instructions. */
25968 IX86_BUILTIN_VFMADDSS,
25969 IX86_BUILTIN_VFMADDSD,
25970 IX86_BUILTIN_VFMADDPS,
25971 IX86_BUILTIN_VFMADDPD,
25972 IX86_BUILTIN_VFMADDPS256,
25973 IX86_BUILTIN_VFMADDPD256,
25974 IX86_BUILTIN_VFMADDSUBPS,
25975 IX86_BUILTIN_VFMADDSUBPD,
25976 IX86_BUILTIN_VFMADDSUBPS256,
25977 IX86_BUILTIN_VFMADDSUBPD256,
25978
25979 /* FMA3 instructions. */
25980 IX86_BUILTIN_VFMADDSS3,
25981 IX86_BUILTIN_VFMADDSD3,
25982
25983 /* XOP instructions. */
25984 IX86_BUILTIN_VPCMOV,
25985 IX86_BUILTIN_VPCMOV_V2DI,
25986 IX86_BUILTIN_VPCMOV_V4SI,
25987 IX86_BUILTIN_VPCMOV_V8HI,
25988 IX86_BUILTIN_VPCMOV_V16QI,
25989 IX86_BUILTIN_VPCMOV_V4SF,
25990 IX86_BUILTIN_VPCMOV_V2DF,
25991 IX86_BUILTIN_VPCMOV256,
25992 IX86_BUILTIN_VPCMOV_V4DI256,
25993 IX86_BUILTIN_VPCMOV_V8SI256,
25994 IX86_BUILTIN_VPCMOV_V16HI256,
25995 IX86_BUILTIN_VPCMOV_V32QI256,
25996 IX86_BUILTIN_VPCMOV_V8SF256,
25997 IX86_BUILTIN_VPCMOV_V4DF256,
25998
25999 IX86_BUILTIN_VPPERM,
26000
26001 IX86_BUILTIN_VPMACSSWW,
26002 IX86_BUILTIN_VPMACSWW,
26003 IX86_BUILTIN_VPMACSSWD,
26004 IX86_BUILTIN_VPMACSWD,
26005 IX86_BUILTIN_VPMACSSDD,
26006 IX86_BUILTIN_VPMACSDD,
26007 IX86_BUILTIN_VPMACSSDQL,
26008 IX86_BUILTIN_VPMACSSDQH,
26009 IX86_BUILTIN_VPMACSDQL,
26010 IX86_BUILTIN_VPMACSDQH,
26011 IX86_BUILTIN_VPMADCSSWD,
26012 IX86_BUILTIN_VPMADCSWD,
26013
26014 IX86_BUILTIN_VPHADDBW,
26015 IX86_BUILTIN_VPHADDBD,
26016 IX86_BUILTIN_VPHADDBQ,
26017 IX86_BUILTIN_VPHADDWD,
26018 IX86_BUILTIN_VPHADDWQ,
26019 IX86_BUILTIN_VPHADDDQ,
26020 IX86_BUILTIN_VPHADDUBW,
26021 IX86_BUILTIN_VPHADDUBD,
26022 IX86_BUILTIN_VPHADDUBQ,
26023 IX86_BUILTIN_VPHADDUWD,
26024 IX86_BUILTIN_VPHADDUWQ,
26025 IX86_BUILTIN_VPHADDUDQ,
26026 IX86_BUILTIN_VPHSUBBW,
26027 IX86_BUILTIN_VPHSUBWD,
26028 IX86_BUILTIN_VPHSUBDQ,
26029
26030 IX86_BUILTIN_VPROTB,
26031 IX86_BUILTIN_VPROTW,
26032 IX86_BUILTIN_VPROTD,
26033 IX86_BUILTIN_VPROTQ,
26034 IX86_BUILTIN_VPROTB_IMM,
26035 IX86_BUILTIN_VPROTW_IMM,
26036 IX86_BUILTIN_VPROTD_IMM,
26037 IX86_BUILTIN_VPROTQ_IMM,
26038
26039 IX86_BUILTIN_VPSHLB,
26040 IX86_BUILTIN_VPSHLW,
26041 IX86_BUILTIN_VPSHLD,
26042 IX86_BUILTIN_VPSHLQ,
26043 IX86_BUILTIN_VPSHAB,
26044 IX86_BUILTIN_VPSHAW,
26045 IX86_BUILTIN_VPSHAD,
26046 IX86_BUILTIN_VPSHAQ,
26047
26048 IX86_BUILTIN_VFRCZSS,
26049 IX86_BUILTIN_VFRCZSD,
26050 IX86_BUILTIN_VFRCZPS,
26051 IX86_BUILTIN_VFRCZPD,
26052 IX86_BUILTIN_VFRCZPS256,
26053 IX86_BUILTIN_VFRCZPD256,
26054
26055 IX86_BUILTIN_VPCOMEQUB,
26056 IX86_BUILTIN_VPCOMNEUB,
26057 IX86_BUILTIN_VPCOMLTUB,
26058 IX86_BUILTIN_VPCOMLEUB,
26059 IX86_BUILTIN_VPCOMGTUB,
26060 IX86_BUILTIN_VPCOMGEUB,
26061 IX86_BUILTIN_VPCOMFALSEUB,
26062 IX86_BUILTIN_VPCOMTRUEUB,
26063
26064 IX86_BUILTIN_VPCOMEQUW,
26065 IX86_BUILTIN_VPCOMNEUW,
26066 IX86_BUILTIN_VPCOMLTUW,
26067 IX86_BUILTIN_VPCOMLEUW,
26068 IX86_BUILTIN_VPCOMGTUW,
26069 IX86_BUILTIN_VPCOMGEUW,
26070 IX86_BUILTIN_VPCOMFALSEUW,
26071 IX86_BUILTIN_VPCOMTRUEUW,
26072
26073 IX86_BUILTIN_VPCOMEQUD,
26074 IX86_BUILTIN_VPCOMNEUD,
26075 IX86_BUILTIN_VPCOMLTUD,
26076 IX86_BUILTIN_VPCOMLEUD,
26077 IX86_BUILTIN_VPCOMGTUD,
26078 IX86_BUILTIN_VPCOMGEUD,
26079 IX86_BUILTIN_VPCOMFALSEUD,
26080 IX86_BUILTIN_VPCOMTRUEUD,
26081
26082 IX86_BUILTIN_VPCOMEQUQ,
26083 IX86_BUILTIN_VPCOMNEUQ,
26084 IX86_BUILTIN_VPCOMLTUQ,
26085 IX86_BUILTIN_VPCOMLEUQ,
26086 IX86_BUILTIN_VPCOMGTUQ,
26087 IX86_BUILTIN_VPCOMGEUQ,
26088 IX86_BUILTIN_VPCOMFALSEUQ,
26089 IX86_BUILTIN_VPCOMTRUEUQ,
26090
26091 IX86_BUILTIN_VPCOMEQB,
26092 IX86_BUILTIN_VPCOMNEB,
26093 IX86_BUILTIN_VPCOMLTB,
26094 IX86_BUILTIN_VPCOMLEB,
26095 IX86_BUILTIN_VPCOMGTB,
26096 IX86_BUILTIN_VPCOMGEB,
26097 IX86_BUILTIN_VPCOMFALSEB,
26098 IX86_BUILTIN_VPCOMTRUEB,
26099
26100 IX86_BUILTIN_VPCOMEQW,
26101 IX86_BUILTIN_VPCOMNEW,
26102 IX86_BUILTIN_VPCOMLTW,
26103 IX86_BUILTIN_VPCOMLEW,
26104 IX86_BUILTIN_VPCOMGTW,
26105 IX86_BUILTIN_VPCOMGEW,
26106 IX86_BUILTIN_VPCOMFALSEW,
26107 IX86_BUILTIN_VPCOMTRUEW,
26108
26109 IX86_BUILTIN_VPCOMEQD,
26110 IX86_BUILTIN_VPCOMNED,
26111 IX86_BUILTIN_VPCOMLTD,
26112 IX86_BUILTIN_VPCOMLED,
26113 IX86_BUILTIN_VPCOMGTD,
26114 IX86_BUILTIN_VPCOMGED,
26115 IX86_BUILTIN_VPCOMFALSED,
26116 IX86_BUILTIN_VPCOMTRUED,
26117
26118 IX86_BUILTIN_VPCOMEQQ,
26119 IX86_BUILTIN_VPCOMNEQ,
26120 IX86_BUILTIN_VPCOMLTQ,
26121 IX86_BUILTIN_VPCOMLEQ,
26122 IX86_BUILTIN_VPCOMGTQ,
26123 IX86_BUILTIN_VPCOMGEQ,
26124 IX86_BUILTIN_VPCOMFALSEQ,
26125 IX86_BUILTIN_VPCOMTRUEQ,
26126
26127 /* LWP instructions. */
26128 IX86_BUILTIN_LLWPCB,
26129 IX86_BUILTIN_SLWPCB,
26130 IX86_BUILTIN_LWPVAL32,
26131 IX86_BUILTIN_LWPVAL64,
26132 IX86_BUILTIN_LWPINS32,
26133 IX86_BUILTIN_LWPINS64,
26134
26135 IX86_BUILTIN_CLZS,
26136
26137 /* RTM */
26138 IX86_BUILTIN_XBEGIN,
26139 IX86_BUILTIN_XEND,
26140 IX86_BUILTIN_XABORT,
26141 IX86_BUILTIN_XTEST,
26142
26143 /* BMI instructions. */
26144 IX86_BUILTIN_BEXTR32,
26145 IX86_BUILTIN_BEXTR64,
26146 IX86_BUILTIN_CTZS,
26147
26148 /* TBM instructions. */
26149 IX86_BUILTIN_BEXTRI32,
26150 IX86_BUILTIN_BEXTRI64,
26151
26152 /* BMI2 instructions. */
26153 IX86_BUILTIN_BZHI32,
26154 IX86_BUILTIN_BZHI64,
26155 IX86_BUILTIN_PDEP32,
26156 IX86_BUILTIN_PDEP64,
26157 IX86_BUILTIN_PEXT32,
26158 IX86_BUILTIN_PEXT64,
26159
26160 /* ADX instructions. */
26161 IX86_BUILTIN_ADDCARRYX32,
26162 IX86_BUILTIN_ADDCARRYX64,
26163
26164 /* FSGSBASE instructions. */
26165 IX86_BUILTIN_RDFSBASE32,
26166 IX86_BUILTIN_RDFSBASE64,
26167 IX86_BUILTIN_RDGSBASE32,
26168 IX86_BUILTIN_RDGSBASE64,
26169 IX86_BUILTIN_WRFSBASE32,
26170 IX86_BUILTIN_WRFSBASE64,
26171 IX86_BUILTIN_WRGSBASE32,
26172 IX86_BUILTIN_WRGSBASE64,
26173
26174 /* RDRND instructions. */
26175 IX86_BUILTIN_RDRAND16_STEP,
26176 IX86_BUILTIN_RDRAND32_STEP,
26177 IX86_BUILTIN_RDRAND64_STEP,
26178
26179 /* RDSEED instructions. */
26180 IX86_BUILTIN_RDSEED16_STEP,
26181 IX86_BUILTIN_RDSEED32_STEP,
26182 IX86_BUILTIN_RDSEED64_STEP,
26183
26184 /* F16C instructions. */
26185 IX86_BUILTIN_CVTPH2PS,
26186 IX86_BUILTIN_CVTPH2PS256,
26187 IX86_BUILTIN_CVTPS2PH,
26188 IX86_BUILTIN_CVTPS2PH256,
26189
26190 /* CFString built-in for Darwin. */
26191 IX86_BUILTIN_CFSTRING,
26192
26193 /* Builtins to get CPU type and supported features. */
26194 IX86_BUILTIN_CPU_INIT,
26195 IX86_BUILTIN_CPU_IS,
26196 IX86_BUILTIN_CPU_SUPPORTS,
26197
26198 IX86_BUILTIN_MAX
26199 };
26200
26201 /* Table for the ix86 builtin decls. */
26202 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26203
26204 /* Table of all the builtin functions that are possible with different ISAs,
26205 but are waiting to be built until a function is declared to use that
26206 ISA. */
26207 struct builtin_isa {
26208 const char *name; /* function name */
26209 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26210 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26211 bool const_p; /* true if the declaration is constant */
26212 bool set_and_not_built_p; /* true if the builtin was recorded here but its decl has not been built yet */
26213 };
26214
26215 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26216
26217
26218 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26219 of which isa_flags to use in the ix86_builtins_isa array. Store the
26220 function decl in the ix86_builtins array. Return the function decl, or
26221 NULL_TREE if the builtin was not added.
26222
26223 If the front end has a special hook for builtin functions, delay adding
26224 builtin functions that aren't in the current ISA until the ISA is changed
26225 with function specific optimization. Doing so can save about 300K for the
26226 default compiler. When the builtin is expanded, check at that time whether
26227 it is valid.
26228
26229 If the front end doesn't have a special hook, record all builtins, even if
26230 they aren't in the current ISA, in case the user uses function specific
26231 options for a different ISA, so that we don't get scope errors if a builtin
26232 is added in the middle of a function scope. */
26233
26234 static inline tree
26235 def_builtin (HOST_WIDE_INT mask, const char *name,
26236 enum ix86_builtin_func_type tcode,
26237 enum ix86_builtins code)
26238 {
26239 tree decl = NULL_TREE;
26240
26241 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26242 {
26243 ix86_builtins_isa[(int) code].isa = mask;
26244
26245 mask &= ~OPTION_MASK_ISA_64BIT;
26246 if (mask == 0
26247 || (mask & ix86_isa_flags) != 0
26248 || (lang_hooks.builtin_function
26249 == lang_hooks.builtin_function_ext_scope))
26250
26251 {
26252 tree type = ix86_get_builtin_func_type (tcode);
26253 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26254 NULL, NULL_TREE);
26255 ix86_builtins[(int) code] = decl;
26256 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26257 }
26258 else
26259 {
26260 ix86_builtins[(int) code] = NULL_TREE;
26261 ix86_builtins_isa[(int) code].tcode = tcode;
26262 ix86_builtins_isa[(int) code].name = name;
26263 ix86_builtins_isa[(int) code].const_p = false;
26264 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26265 }
26266 }
26267
26268 return decl;
26269 }
26270
26271 /* Like def_builtin, but also marks the function decl "const". */
26272
26273 static inline tree
26274 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26275 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26276 {
26277 tree decl = def_builtin (mask, name, tcode, code);
26278 if (decl)
26279 TREE_READONLY (decl) = 1;
26280 else
26281 ix86_builtins_isa[(int) code].const_p = true;
26282
26283 return decl;
26284 }
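/* The following is an illustrative sketch, not part of the original file:
   it shows how a single call to def_builtin_const behaves when the
   requested ISA (here OPTION_MASK_ISA_AVX2, with the real paddd256
   builtin reused purely as an example) is not enabled in ix86_isa_flags.
   In that case the decl may be deferred and only the builtin_isa entry is
   filled in; ix86_add_new_builtins materializes it later.  The actual
   registrations in this file are driven by the bdesc_* tables.  */
#if 0   /* sketch only */
static void
example_def_builtin_usage (void)
{
  tree decl = def_builtin_const (OPTION_MASK_ISA_AVX2,
                                 "__builtin_ia32_paddd256",
                                 V8SI_FTYPE_V8SI_V8SI,
                                 IX86_BUILTIN_PADDD256);
  if (decl == NULL_TREE)
    {
      /* Deferred path: set_and_not_built_p is now true and const_p
         records that the decl should be marked TREE_READONLY once it
         is finally built.  */
      gcc_assert (ix86_builtins_isa[(int) IX86_BUILTIN_PADDD256]
                  .set_and_not_built_p);
    }
}
#endif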
26285
26286 /* Add any new builtin functions for a given ISA that may not have been
26287 declared yet. This saves a bit of space compared to adding all of the
26288 declarations to the tree up front, even when they are never used. */
26289
26290 static void
26291 ix86_add_new_builtins (HOST_WIDE_INT isa)
26292 {
26293 int i;
26294
26295 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26296 {
26297 if ((ix86_builtins_isa[i].isa & isa) != 0
26298 && ix86_builtins_isa[i].set_and_not_built_p)
26299 {
26300 tree decl, type;
26301
26302 /* Don't define the builtin again. */
26303 ix86_builtins_isa[i].set_and_not_built_p = false;
26304
26305 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26306 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26307 type, i, BUILT_IN_MD, NULL,
26308 NULL_TREE);
26309
26310 ix86_builtins[i] = decl;
26311 if (ix86_builtins_isa[i].const_p)
26312 TREE_READONLY (decl) = 1;
26313 }
26314 }
26315 }
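/* Illustrative only, not part of the original file: user code such as the
   sketch below is what ultimately exercises the deferral machinery.  A
   translation unit compiled without -mavx2 still records the AVX2
   builtins above as "set and not built"; when the per-function target
   switch enables the ISA, ix86_add_new_builtins turns those records into
   real declarations.  The intrinsic-to-builtin pairing is the usual
   avx2intrin.h mapping of this era, shown here only as an assumption.

       #include <immintrin.h>

       __attribute__ ((target ("avx2")))
       __m256i
       add_eight_ints (__m256i a, __m256i b)
       {
         return _mm256_add_epi32 (a, b);  // wraps __builtin_ia32_paddd256
       }
*/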
26316
26317 /* Bits for builtin_description.flag. */
26318
26319 /* Set when we don't support the comparison natively, and should
26320 swap the comparison operands in order to support it. */
26321 #define BUILTIN_DESC_SWAP_OPERANDS 1
26322
26323 struct builtin_description
26324 {
26325 const HOST_WIDE_INT mask;
26326 const enum insn_code icode;
26327 const char *const name;
26328 const enum ix86_builtins code;
26329 const enum rtx_code comparison;
26330 const int flag;
26331 };
26332
26333 static const struct builtin_description bdesc_comi[] =
26334 {
26335 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26336 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26339 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26340 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26341 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26342 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26345 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26346 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26348 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26359 };
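/* Sketch (not part of the original file) of the kind of loop that turns the
   bdesc_comi table above into declared builtins; the real loop lives in the
   builtin initialization code later in this file, so treat the exact shape
   as an assumption.  SSE2 entries take V2DF operands, the rest V4SF, and
   every comi/ucomi builtin returns int.  */
#if 0   /* sketch only */
static void
example_register_comi_builtins (void)
{
  size_t i;
  const struct builtin_description *d;

  for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
    if (d->mask == OPTION_MASK_ISA_SSE2)
      def_builtin_const (d->mask, d->name,
                         INT_FTYPE_V2DF_V2DF, d->code);
    else
      def_builtin_const (d->mask, d->name,
                         INT_FTYPE_V4SF_V4SF, d->code);
}
#endif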
26360
26361 static const struct builtin_description bdesc_pcmpestr[] =
26362 {
26363 /* SSE4.2 */
26364 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26365 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26366 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26367 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26368 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26369 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26370 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26371 };
26372
26373 static const struct builtin_description bdesc_pcmpistr[] =
26374 {
26375 /* SSE4.2 */
26376 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26377 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26378 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26379 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26380 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26381 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26382 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26383 };
26384
26385 /* Special builtins with a variable number of arguments. */
26386 static const struct builtin_description bdesc_special_args[] =
26387 {
26388 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26389 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26390 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26391
26392 /* MMX */
26393 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26394
26395 /* 3DNow! */
26396 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26397
26398 /* SSE */
26399 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26400 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26401 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26402
26403 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26404 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26405 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26406 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26407
26408 /* SSE or 3DNow!A */
26409 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26410 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26411
26412 /* SSE2 */
26413 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26414 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26418 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26420 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26421 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26423
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26426
26427 /* SSE3 */
26428 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26429
26430 /* SSE4.1 */
26431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26432
26433 /* SSE4A */
26434 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26435 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26436
26437 /* AVX */
26438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26439 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26440
26441 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26442 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26443 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26446
26447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26448 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26449 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26450 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26451 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26452 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26453 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26454
26455 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26456 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26457 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26458
26459 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26460 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26461 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26467
26468 /* AVX2 */
26469 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26470 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26471 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26472 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26473 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26474 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26475 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26476 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26477 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26478
26479 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26480 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26481 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26482 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26483 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26484 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26485
26486 /* FSGSBASE */
26487 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26488 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26489 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26490 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26491 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26492 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26493 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26494 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26495
26496 /* RTM */
26497 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26498 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26499 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26500 };
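/* Illustrative only, not part of the original file: the special-args
   builtins above are what the SSE/AVX intrinsic headers expand to.  As a
   hedged example of that pairing (the exact header text is an assumption,
   sketched from xmmintrin.h of this era), the unaligned-load entry
   "__builtin_ia32_loadups" backs _mm_loadu_ps roughly like this:

       extern __inline __m128
       __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
       _mm_loadu_ps (float const *__P)
       {
         return (__m128) __builtin_ia32_loadups (__P);
       }
*/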
26501
26502 /* Builtins with a variable number of arguments. */
26503 static const struct builtin_description bdesc_args[] =
26504 {
26505 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26506 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26507 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26508 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26509 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26510 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26511 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26512
26513 /* MMX */
26514 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26515 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26516 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26517 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26518 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26519 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26520
26521 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26522 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26523 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26524 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26525 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26526 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26527 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26528 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26529
26530 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26531 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26532
26533 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26534 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26535 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26536 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26537
26538 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26539 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26540 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26541 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26542 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26543 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26544
26545 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26546 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26547 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26548 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26549 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26550 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26551
26552 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26553 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26554 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26555
26556 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26557
26558 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26559 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26560 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26561 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26562 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26563 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26564
26565 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26566 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26567 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26568 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26569 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26570 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26571
26572 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26573 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26574 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26575 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26576
26577 /* 3DNow! */
26578 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26579 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26580 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26581 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26582
26583 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26584 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26585 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26586 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26587 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26588 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26589 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26590 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26591 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26592 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26593 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26594 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26595 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26596 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26597 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26598
26599 /* 3DNow!A */
26600 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26601 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26602 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26603 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26604 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26605 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26606
26607 /* SSE */
26608 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26609 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26610 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26611 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26612 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26613 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26614 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26615 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26616 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26617 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26618 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26619 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26620
26621 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26622
26623 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26624 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26625 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26626 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26627 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26628 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26629 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26630 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26631
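  /* In the comparison entries below the rtx code column selects the
     condition handed to the (vm)maskcmp pattern; prototypes ending in
     _SWAP appear to exchange the two operands first, which is how the
     cmpgt/cmpge forms are expressed via LT/LE and the cmpngt/cmpnge
     forms via UNGE/UNGT.  */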
26632 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26633 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26634 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26635 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26636 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26637 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26638 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26639 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26640 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26641 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26642 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26643 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26644 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26645 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26646 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26647 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26648 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26649 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26650 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26651 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26652 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26653 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26654
26655 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26656 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26657 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26658 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26659
26660 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26661 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26662 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26663 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26664
26665 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26666
26667 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26668 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26669 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26670 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26671 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26672
26673 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26674 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26675 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26676
26677 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26678
26679 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26680 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26681 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26682
26683 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26684 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26685
26686 /* SSE MMX or 3DNow!A */
26687 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26688 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26689 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26690
26691 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26692 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26693 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26694 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26695
26696 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26697 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26698
26699 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26700
26701 /* SSE2 */
26702 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26703
26704 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26705 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26706 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26707 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26708 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26709
26710 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26711 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26712 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26713 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26714 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26715
26716 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26717
26718 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26719 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26720 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26721 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26722
26723 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26724 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26725 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26726
26727 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26728 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26729 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26730 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26731 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26732 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26733 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26734 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26735
26736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26737 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26739 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26740 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26741 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26743 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26744 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26745 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26747 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26748 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26749 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26750 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26751 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26752 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26753 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26754 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26755 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26756
26757 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26758 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26759 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26760 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26761
26762 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26763 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26764 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26765 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26766
26767 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26768
26769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26770 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26771 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26772
26773 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26774
26775 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26778 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26783
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26792
26793 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26794 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26795
26796 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26797 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26798 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26799 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26800
26801 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26802 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26803
26804 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26805 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26810
26811 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26812 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26813 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26815
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26817 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26818 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26819 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26820 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26821 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26822 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26823 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26824
26825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26828
26829 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26831
26832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26833 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26834
26835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26836
26837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26838 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26839 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26840 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26841
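  /* Two prototype flavours are used for the shift builtins below: the
     *_SI_COUNT forms take the shift count as a scalar integer (the
     immediate-count instructions), while the *_V*_COUNT forms take the
     count in a vector operand.  pslldqi128/psrldqi128 shift by whole
     bytes and go through the V1TI shift patterns, hence the
     V2DI_FTYPE_V2DI_INT_CONVERT prototype.  */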
26842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26843 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26844 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26845 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26846 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26847 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26848 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26849
26850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26851 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26852 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26853 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26854 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26857
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26860 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26862
26863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26866
26867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26868
26869 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26870
26871 /* SSE2 MMX */
26872 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26873 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26874
26875 /* SSE3 */
26876 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26877 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26878
26879 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26880 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26881 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26882 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26883 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26884 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26885
26886 /* SSSE3 */
26887 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26888 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26889 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26890 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26891 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26892 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26893
26894 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26895 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26896 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26897 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26898 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26899 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26900 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26901 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26902 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26903 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26904 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26905 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26906 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26907 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26908 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26909 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26910 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26911 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26912 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26913 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26914 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26915 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26916 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26917 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26918
26919 /* SSSE3. */
26920 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26921 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26922
26923 /* SSE4.1 */
26924 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26925 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26926 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26927 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26928 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26929 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26930 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26931 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26932 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26933 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26934
26935 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26936 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26937 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26938 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26939 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26940 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26941 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26942 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26943 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26944 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26945 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26946 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26947 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26948
26949 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26950 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26951 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26952 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26953 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26954 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26955 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26956 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26957 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26958 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26959 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26960 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26961
26962 /* SSE4.1 */
26963 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26964 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26965 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26966 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26967
26968 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26969 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26970 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26971 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26972
26973 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26974 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26975
26976 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26977 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26978
26979 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26980 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26981 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26982 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26983
26984 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26985 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26986
26987 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26988 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26989
26990 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26991 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26992 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26993
26994 /* SSE4.2 */
26995 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26996 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26997 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26998 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26999 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27000
27001 /* SSE4A */
27002 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27003 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27004 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27005 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27006
27007 /* AES */
27008 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27009 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27010
27011 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27012 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27013 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27014 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27015
27016 /* PCLMUL */
27017 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27018
27019 /* AVX */
27020 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27021 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27022 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27023 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27024 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27025 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27026 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27027 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27028 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27029 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27030 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27031 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27032 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27033 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27034 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27035 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27036 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27037 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27038 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27039 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27040 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27041 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27042 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27043 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27044 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27045 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27046
27047 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27051
27052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27054 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27058 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27068 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27069 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27071 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27072 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27073 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27074 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27075 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27076 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27077 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27078 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27079 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27080 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27081 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27082 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27083 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27084 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27085 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27086
27087 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27088 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27089 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27090
27091 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27092 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27093 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27094 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27095 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27096
27097 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27098
27099 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27100 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27101
27102 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27103 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27104 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27105 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27106
27107 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27108 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27109
27110 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27111 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27112
27113 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27114 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27115 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27116 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27117
27118 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27119 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27120
27121 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27122 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27123
27124 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27125 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27128
27129 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27132 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27133 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27134 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27135
27136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27138 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27139 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27140 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27142 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27146 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27148 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27149 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27151
27152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27154
27155 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27156 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27157
27158 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27159
27160 /* AVX2 */
27161 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27162 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27163 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27164 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27165 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27166 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27167 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27168 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27169 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27170 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27171 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27172 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27173 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27174 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27175 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27176 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27177 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27178 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27179 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27180 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27181 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27182 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27183 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27184 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27185 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27186 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27187 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27188 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27189 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27190 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27191 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27194 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27196 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27200 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27201 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27202 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27203 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27204 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27205 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27206 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27207 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27208 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27209 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27210 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27211 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27212 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27213 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27214 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27215 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27216 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27217 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27218 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27219 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27220 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27221 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27222 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27223 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27224 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27225 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27226 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27227 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27228 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27229 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27230 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27231 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27232 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27233 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27234 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27235 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27236 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27237 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27238 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27239 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27240 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27241 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27242 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27243 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27244 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27245 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27246 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27247 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27248 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27249 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27250 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27251 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27253 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27254 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27255 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27256 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27257 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27258 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27259 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27260 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27261 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27262 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27263 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27264 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27265 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27266 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27267 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27268 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27269 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27270 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27271 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27272 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27273 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27274 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27275 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27276 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27277 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27278 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27286 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27290 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27291 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27292 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27293 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27294 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27295 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27296 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27297 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27298 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27299 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27300 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27301 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27302 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27303 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27304 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27305 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27306 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27307
27308 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27309
27310 /* BMI */
27311 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27312 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27313 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27314
27315 /* TBM */
27316 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27317 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27318
27319 /* F16C */
27320 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27321 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27322 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27323 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27324
27325 /* BMI2 */
27326 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27327 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27328 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27329 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27330 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27331 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27332 };
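/* Illustrative note (editor's sketch, not part of the original source):
   each bdesc_args entry ties an ISA mask and an insn pattern to a
   __builtin_ia32_* name and a function-type enum.  For instance, the entry
   { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256",
     IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }
   declares a builtin taking two 256-bit integer vectors and returning one,
   which the avx2intrin.h wrapper is assumed to surface roughly as:

     extern __inline __m256i
     __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     _mm256_add_epi32 (__m256i __A, __m256i __B)
     {
       /* Cast to the builtin's vector type and back to the intrinsic type.  */
       return (__m256i) __builtin_ia32_paddd256 ((__v8si) __A, (__v8si) __B);
     }
*/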
27333
27334 /* FMA4 and XOP. */
27335 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27336 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27337 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27338 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27339 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27340 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27341 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27342 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27343 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27344 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27345 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27346 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27347 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27348 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27349 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27350 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27351 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27352 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27353 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27354 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27355 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27356 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27357 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27358 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27359 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27360 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27361 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27362 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27363 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27364 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27365 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27366 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27367 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27368 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27369 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27370 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27371 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27372 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27373 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27374 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27375 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27376 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27377 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27378 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27379 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27380 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27381 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27382 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27383 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27384 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27385 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27386 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
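/* Editor's note (a sketch, not from the original source): the MULTI_ARG_*
   names above are shorthand for the function-type enums used by the FMA4/XOP
   table below; the suffix encodes operand count and element type.  For
   example, MULTI_ARG_3_SF is V4SF_FTYPE_V4SF_V4SF_V4SF: three 128-bit
   single-precision vector operands and a like result, as used by
   __builtin_ia32_vfmaddss.  A hypothetical user-level view of that builtin:

     __m128 a, b, c;
     /* r = a * b + c on the low element, upper elements from a.  */
     __m128 r = (__m128) __builtin_ia32_vfmaddss ((__v4sf) a, (__v4sf) b,
                                                  (__v4sf) c);
*/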
27387
27388 static const struct builtin_description bdesc_multi_arg[] =
27389 {
27390 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27391 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27392 UNKNOWN, (int)MULTI_ARG_3_SF },
27393 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27394 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27395 UNKNOWN, (int)MULTI_ARG_3_DF },
27396
27397 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27398 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27399 UNKNOWN, (int)MULTI_ARG_3_SF },
27400 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27401 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27402 UNKNOWN, (int)MULTI_ARG_3_DF },
27403
27404 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27405 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27406 UNKNOWN, (int)MULTI_ARG_3_SF },
27407 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27408 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27409 UNKNOWN, (int)MULTI_ARG_3_DF },
27410 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27411 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27412 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27413 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27414 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27415 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27416
27417 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27418 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27419 UNKNOWN, (int)MULTI_ARG_3_SF },
27420 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27421 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27422 UNKNOWN, (int)MULTI_ARG_3_DF },
27423 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27424 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27425 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27426 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27427 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27428 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27429
27430 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27431 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27432 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27433 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27434 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27435 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27436 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27437
27438 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27439 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27440 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27441 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27442 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27443 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27444 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27445
27446 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27447
27448 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27449 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27450 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27451 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27452 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27453 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27454 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27455 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27456 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27457 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27458 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27459 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27460
27461 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27462 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27463 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27464 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27465 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27466 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27467 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27468 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27469 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27470 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27471 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27472 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27473 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27474 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27475 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27476 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27477
27478 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27479 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27480 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27481 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27482 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27483 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27484
27485 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27486 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27487 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27488 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27489 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27490 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27491 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27492 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27493 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27494 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27495 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27496 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27499 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27500
27501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27503 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27504 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27507 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27508
27509 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27510 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27511 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27512 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27513 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27514 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27515 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27516
27517 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27518 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27519 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27520 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27521 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27522 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27523 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27524
27525 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27526 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27527 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27528 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27529 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27530 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27531 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27532
27533 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27535 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27539 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27540
27541 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27543 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27548
27549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27551 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27556
27557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27559 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27561 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27564
27565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27567 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27570 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27571 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27572 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27573
27574 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27575 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27576 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27577 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27578 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27579 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27580 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27581 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27582
27583 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27584 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27585 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27586 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27587
27588 };
27589 \f
27590 /* TM vector builtins. */
27591
27592 /* Reuse the existing x86-specific `struct builtin_description' because
27593 we're lazy. Add casts to make them fit. */
27594 static const struct builtin_description bdesc_tm[] =
27595 {
27596 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27597 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27598 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27599 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27600 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27601 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27602 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27603
27604 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27605 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27606 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27607 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27608 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27609 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27610 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27611
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27614 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27619
27620 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27621 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27623 };
27624
27625 /* TM callbacks. */
27626
27627 /* Return the builtin decl needed to load a vector of TYPE. */
27628
27629 static tree
27630 ix86_builtin_tm_load (tree type)
27631 {
27632 if (TREE_CODE (type) == VECTOR_TYPE)
27633 {
27634 switch (tree_low_cst (TYPE_SIZE (type), 1))
27635 {
27636 case 64:
27637 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27638 case 128:
27639 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27640 case 256:
27641 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27642 }
27643 }
27644 return NULL_TREE;
27645 }
27646
27647 /* Return the builtin decl needed to store a vector of TYPE. */
27648
27649 static tree
27650 ix86_builtin_tm_store (tree type)
27651 {
27652 if (TREE_CODE (type) == VECTOR_TYPE)
27653 {
27654 switch (tree_low_cst (TYPE_SIZE (type), 1))
27655 {
27656 case 64:
27657 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27658 case 128:
27659 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27660 case 256:
27661 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27662 }
27663 }
27664 return NULL_TREE;
27665 }
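
/* Informal example only (the vector typedef below is made up for
   illustration): inside a __transaction_atomic block, a transactional
   store of a 16-byte vector such as

     typedef int v4si __attribute__ ((vector_size (16)));
     v4si *p, x;
     *p = x;

   has TYPE_SIZE 128, so ix86_builtin_tm_store above returns the decl for
   BUILT_IN_TM_STORE_M128, i.e. __builtin__ITM_WM128 from bdesc_tm.  */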
27666 \f
27667 /* Initialize the transactional memory vector load/store builtins. */
27668
27669 static void
27670 ix86_init_tm_builtins (void)
27671 {
27672 enum ix86_builtin_func_type ftype;
27673 const struct builtin_description *d;
27674 size_t i;
27675 tree decl;
27676 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27677 tree attrs_log, attrs_type_log;
27678
27679 if (!flag_tm)
27680 return;
27681
27682 /* If there are no builtins defined, we must be compiling in a
27683 language without trans-mem support. */
27684 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27685 return;
27686
27687 /* Use whatever attributes a normal TM load has. */
27688 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27689 attrs_load = DECL_ATTRIBUTES (decl);
27690 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27691 /* Use whatever attributes a normal TM store has. */
27692 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27693 attrs_store = DECL_ATTRIBUTES (decl);
27694 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27695 /* Use whatever attributes a normal TM log has. */
27696 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27697 attrs_log = DECL_ATTRIBUTES (decl);
27698 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27699
27700 for (i = 0, d = bdesc_tm;
27701 i < ARRAY_SIZE (bdesc_tm);
27702 i++, d++)
27703 {
27704 if ((d->mask & ix86_isa_flags) != 0
27705 || (lang_hooks.builtin_function
27706 == lang_hooks.builtin_function_ext_scope))
27707 {
27708 tree type, attrs, attrs_type;
27709 enum built_in_function code = (enum built_in_function) d->code;
27710
27711 ftype = (enum ix86_builtin_func_type) d->flag;
27712 type = ix86_get_builtin_func_type (ftype);
27713
27714 if (BUILTIN_TM_LOAD_P (code))
27715 {
27716 attrs = attrs_load;
27717 attrs_type = attrs_type_load;
27718 }
27719 else if (BUILTIN_TM_STORE_P (code))
27720 {
27721 attrs = attrs_store;
27722 attrs_type = attrs_type_store;
27723 }
27724 else
27725 {
27726 attrs = attrs_log;
27727 attrs_type = attrs_type_log;
27728 }
27729 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27730 /* The builtin without the prefix for
27731 calling it directly. */
27732 d->name + strlen ("__builtin_"),
27733 attrs);
27734 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
27735 set the TYPE_ATTRIBUTES. */
27736 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27737
27738 set_builtin_decl (code, decl, false);
27739 }
27740 }
27741 }
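
/* For instance (informal sketch of what the loop above does): for the
   bdesc_tm entry "__builtin__ITM_WM128" it registers a builtin under that
   name and, via d->name + strlen ("__builtin_"), also makes it directly
   callable as "_ITM_WM128", carrying the attributes of a normal TM store.  */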
27742
27743 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
27744 in the current target ISA, so that the user can compile particular modules
27745 with target-specific options that differ from the command-line
27746 options. */
27747 static void
27748 ix86_init_mmx_sse_builtins (void)
27749 {
27750 const struct builtin_description * d;
27751 enum ix86_builtin_func_type ftype;
27752 size_t i;
27753
27754 /* Add all special builtins with variable number of operands. */
27755 for (i = 0, d = bdesc_special_args;
27756 i < ARRAY_SIZE (bdesc_special_args);
27757 i++, d++)
27758 {
27759 if (d->name == 0)
27760 continue;
27761
27762 ftype = (enum ix86_builtin_func_type) d->flag;
27763 def_builtin (d->mask, d->name, ftype, d->code);
27764 }
27765
27766 /* Add all builtins with variable number of operands. */
27767 for (i = 0, d = bdesc_args;
27768 i < ARRAY_SIZE (bdesc_args);
27769 i++, d++)
27770 {
27771 if (d->name == 0)
27772 continue;
27773
27774 ftype = (enum ix86_builtin_func_type) d->flag;
27775 def_builtin_const (d->mask, d->name, ftype, d->code);
27776 }
27777
27778 /* pcmpestr[im] insns. */
27779 for (i = 0, d = bdesc_pcmpestr;
27780 i < ARRAY_SIZE (bdesc_pcmpestr);
27781 i++, d++)
27782 {
27783 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27784 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27785 else
27786 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27787 def_builtin_const (d->mask, d->name, ftype, d->code);
27788 }
27789
27790 /* pcmpistr[im] insns. */
27791 for (i = 0, d = bdesc_pcmpistr;
27792 i < ARRAY_SIZE (bdesc_pcmpistr);
27793 i++, d++)
27794 {
27795 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27796 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27797 else
27798 ftype = INT_FTYPE_V16QI_V16QI_INT;
27799 def_builtin_const (d->mask, d->name, ftype, d->code);
27800 }
27801
27802 /* comi/ucomi insns. */
27803 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27804 {
27805 if (d->mask == OPTION_MASK_ISA_SSE2)
27806 ftype = INT_FTYPE_V2DF_V2DF;
27807 else
27808 ftype = INT_FTYPE_V4SF_V4SF;
27809 def_builtin_const (d->mask, d->name, ftype, d->code);
27810 }
27811
27812 /* SSE */
27813 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27814 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27815 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27816 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27817
27818 /* SSE or 3DNow!A */
27819 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27820 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27821 IX86_BUILTIN_MASKMOVQ);
27822
27823 /* SSE2 */
27824 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27825 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27826
27827 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27828 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27829 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27830 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27831
27832 /* SSE3. */
27833 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27834 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27835 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27836 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27837
27838 /* AES */
27839 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27840 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27841 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27842 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27843 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27844 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27845 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27846 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27847 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27848 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27849 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27850 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27851
27852 /* PCLMUL */
27853 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27854 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27855
27856 /* RDRND */
27857 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27858 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27859 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27860 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27861 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27862 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27863 IX86_BUILTIN_RDRAND64_STEP);
27864
27865 /* AVX2 */
27866 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27867 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27868 IX86_BUILTIN_GATHERSIV2DF);
27869
27870 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27871 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27872 IX86_BUILTIN_GATHERSIV4DF);
27873
27874 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27875 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27876 IX86_BUILTIN_GATHERDIV2DF);
27877
27878 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27879 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27880 IX86_BUILTIN_GATHERDIV4DF);
27881
27882 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27883 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27884 IX86_BUILTIN_GATHERSIV4SF);
27885
27886 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27887 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27888 IX86_BUILTIN_GATHERSIV8SF);
27889
27890 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27891 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27892 IX86_BUILTIN_GATHERDIV4SF);
27893
27894 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27895 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27896 IX86_BUILTIN_GATHERDIV8SF);
27897
27898 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27899 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27900 IX86_BUILTIN_GATHERSIV2DI);
27901
27902 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27903 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27904 IX86_BUILTIN_GATHERSIV4DI);
27905
27906 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27907 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27908 IX86_BUILTIN_GATHERDIV2DI);
27909
27910 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27911 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27912 IX86_BUILTIN_GATHERDIV4DI);
27913
27914 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27915 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27916 IX86_BUILTIN_GATHERSIV4SI);
27917
27918 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27919 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27920 IX86_BUILTIN_GATHERSIV8SI);
27921
27922 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27923 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27924 IX86_BUILTIN_GATHERDIV4SI);
27925
27926 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27927 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27928 IX86_BUILTIN_GATHERDIV8SI);
27929
27930 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27931 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27932 IX86_BUILTIN_GATHERALTSIV4DF);
27933
27934 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27935 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27936 IX86_BUILTIN_GATHERALTDIV8SF);
27937
27938 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27939 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27940 IX86_BUILTIN_GATHERALTSIV4DI);
27941
27942 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27943 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27944 IX86_BUILTIN_GATHERALTDIV8SI);
27945
27946 /* RTM. */
27947 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27948 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27949
27950 /* MMX access to the vec_init patterns. */
27951 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27952 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27953
27954 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27955 V4HI_FTYPE_HI_HI_HI_HI,
27956 IX86_BUILTIN_VEC_INIT_V4HI);
27957
27958 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27959 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27960 IX86_BUILTIN_VEC_INIT_V8QI);
27961
27962 /* Access to the vec_extract patterns. */
27963 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27964 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27965 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27966 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27967 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27968 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27969 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27970 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27971 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27972 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27973
27974 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27975 "__builtin_ia32_vec_ext_v4hi",
27976 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27977
27978 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27979 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27980
27981 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27982 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27983
27984 /* Access to the vec_set patterns. */
27985 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27986 "__builtin_ia32_vec_set_v2di",
27987 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27988
27989 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27990 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27991
27992 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27993 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27994
27995 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27996 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27997
27998 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27999 "__builtin_ia32_vec_set_v4hi",
28000 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28001
28002 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28003 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28004
28005 /* RDSEED */
28006 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28007 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28008 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28009 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28010 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28011 "__builtin_ia32_rdseed_di_step",
28012 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28013
28014 /* ADCX */
28015 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28016 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28017 def_builtin (OPTION_MASK_ISA_64BIT,
28018 "__builtin_ia32_addcarryx_u64",
28019 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28020 IX86_BUILTIN_ADDCARRYX64);
28021
28022 /* Add FMA4 multi-arg argument instructions */
28023 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28024 {
28025 if (d->name == 0)
28026 continue;
28027
28028 ftype = (enum ix86_builtin_func_type) d->flag;
28029 def_builtin_const (d->mask, d->name, ftype, d->code);
28030 }
28031 }
28032
28033 /* This builds the processor_model struct type defined in
28034 libgcc/config/i386/cpuinfo.c. */
28035
28036 static tree
28037 build_processor_model_struct (void)
28038 {
28039 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
28040 "__cpu_features"};
28041 tree field = NULL_TREE, field_chain = NULL_TREE;
28042 int i;
28043 tree type = make_node (RECORD_TYPE);
28044
28045 /* The first 3 fields are unsigned int. */
28046 for (i = 0; i < 3; ++i)
28047 {
28048 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28049 get_identifier (field_name[i]), unsigned_type_node);
28050 if (field_chain != NULL_TREE)
28051 DECL_CHAIN (field) = field_chain;
28052 field_chain = field;
28053 }
28054
28055 /* The last field is an array of unsigned integers of size one. */
28056 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28057 get_identifier (field_name[3]),
28058 build_array_type (unsigned_type_node,
28059 build_index_type (size_one_node)));
28060 if (field_chain != NULL_TREE)
28061 DECL_CHAIN (field) = field_chain;
28062 field_chain = field;
28063
28064 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
28065 return type;
28066 }
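
/* Illustrative sketch only: the record built above is meant to mirror the
   struct in libgcc/config/i386/cpuinfo.c, roughly

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   so that the COMPONENT_REFs built in fold_builtin_cpu line up with the
   __cpu_model variable libgcc defines.  */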
28067
28068 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
28069
28070 static tree
28071 make_var_decl (tree type, const char *name)
28072 {
28073 tree new_decl;
28074
28075 new_decl = build_decl (UNKNOWN_LOCATION,
28076 VAR_DECL,
28077 get_identifier(name),
28078 type);
28079
28080 DECL_EXTERNAL (new_decl) = 1;
28081 TREE_STATIC (new_decl) = 1;
28082 TREE_PUBLIC (new_decl) = 1;
28083 DECL_INITIAL (new_decl) = 0;
28084 DECL_ARTIFICIAL (new_decl) = 0;
28085 DECL_PRESERVE_P (new_decl) = 1;
28086
28087 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
28088 assemble_variable (new_decl, 0, 0, 0);
28089
28090 return new_decl;
28091 }
28092
28093 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
28094 into a check against the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c. */
28095
28096 static tree
28097 fold_builtin_cpu (tree fndecl, tree *args)
28098 {
28099 unsigned int i;
28100 enum ix86_builtins fn_code = (enum ix86_builtins)
28101 DECL_FUNCTION_CODE (fndecl);
28102 tree param_string_cst = NULL;
28103
28104 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
28105 enum processor_features
28106 {
28107 F_CMOV = 0,
28108 F_MMX,
28109 F_POPCNT,
28110 F_SSE,
28111 F_SSE2,
28112 F_SSE3,
28113 F_SSSE3,
28114 F_SSE4_1,
28115 F_SSE4_2,
28116 F_AVX,
28117 F_AVX2,
28118 F_MAX
28119 };
28120
28121 /* These are the values for vendor types, CPU types and CPU subtypes
28122 in cpuinfo.c. CPU types and subtypes have the corresponding start
28123 value subtracted before they are compared against the stored field. */
28124 enum processor_model
28125 {
28126 M_INTEL = 1,
28127 M_AMD,
28128 M_CPU_TYPE_START,
28129 M_INTEL_ATOM,
28130 M_INTEL_CORE2,
28131 M_INTEL_COREI7,
28132 M_AMDFAM10H,
28133 M_AMDFAM15H,
28134 M_CPU_SUBTYPE_START,
28135 M_INTEL_COREI7_NEHALEM,
28136 M_INTEL_COREI7_WESTMERE,
28137 M_INTEL_COREI7_SANDYBRIDGE,
28138 M_AMDFAM10H_BARCELONA,
28139 M_AMDFAM10H_SHANGHAI,
28140 M_AMDFAM10H_ISTANBUL,
28141 M_AMDFAM15H_BDVER1,
28142 M_AMDFAM15H_BDVER2
28143 };
28144
28145 static struct _arch_names_table
28146 {
28147 const char *const name;
28148 const enum processor_model model;
28149 }
28150 const arch_names_table[] =
28151 {
28152 {"amd", M_AMD},
28153 {"intel", M_INTEL},
28154 {"atom", M_INTEL_ATOM},
28155 {"core2", M_INTEL_CORE2},
28156 {"corei7", M_INTEL_COREI7},
28157 {"nehalem", M_INTEL_COREI7_NEHALEM},
28158 {"westmere", M_INTEL_COREI7_WESTMERE},
28159 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
28160 {"amdfam10h", M_AMDFAM10H},
28161 {"barcelona", M_AMDFAM10H_BARCELONA},
28162 {"shanghai", M_AMDFAM10H_SHANGHAI},
28163 {"istanbul", M_AMDFAM10H_ISTANBUL},
28164 {"amdfam15h", M_AMDFAM15H},
28165 {"bdver1", M_AMDFAM15H_BDVER1},
28166 {"bdver2", M_AMDFAM15H_BDVER2},
28167 };
28168
28169 static struct _isa_names_table
28170 {
28171 const char *const name;
28172 const enum processor_features feature;
28173 }
28174 const isa_names_table[] =
28175 {
28176 {"cmov", F_CMOV},
28177 {"mmx", F_MMX},
28178 {"popcnt", F_POPCNT},
28179 {"sse", F_SSE},
28180 {"sse2", F_SSE2},
28181 {"sse3", F_SSE3},
28182 {"ssse3", F_SSSE3},
28183 {"sse4.1", F_SSE4_1},
28184 {"sse4.2", F_SSE4_2},
28185 {"avx", F_AVX},
28186 {"avx2", F_AVX2}
28187 };
28188
28189 static tree __processor_model_type = NULL_TREE;
28190 static tree __cpu_model_var = NULL_TREE;
28191
28192 if (__processor_model_type == NULL_TREE)
28193 __processor_model_type = build_processor_model_struct ();
28194
28195 if (__cpu_model_var == NULL_TREE)
28196 __cpu_model_var = make_var_decl (__processor_model_type,
28197 "__cpu_model");
28198
28199 gcc_assert ((args != NULL) && (*args != NULL));
28200
28201 param_string_cst = *args;
28202 while (param_string_cst
28203 && TREE_CODE (param_string_cst) != STRING_CST)
28204 {
28205 /* *args must be an expr that can contain other EXPRs leading to a
28206 STRING_CST. */
28207 if (!EXPR_P (param_string_cst))
28208 {
28209 error ("Parameter to builtin must be a string constant or literal");
28210 return integer_zero_node;
28211 }
28212 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
28213 }
28214
28215 gcc_assert (param_string_cst);
28216
28217 if (fn_code == IX86_BUILTIN_CPU_IS)
28218 {
28219 tree ref;
28220 tree field;
28221 unsigned int field_val = 0;
28222 unsigned int NUM_ARCH_NAMES
28223 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
28224
28225 for (i = 0; i < NUM_ARCH_NAMES; i++)
28226 if (strcmp (arch_names_table[i].name,
28227 TREE_STRING_POINTER (param_string_cst)) == 0)
28228 break;
28229
28230 if (i == NUM_ARCH_NAMES)
28231 {
28232 error ("Parameter to builtin not valid: %s",
28233 TREE_STRING_POINTER (param_string_cst));
28234 return integer_zero_node;
28235 }
28236
28237 field = TYPE_FIELDS (__processor_model_type);
28238 field_val = arch_names_table[i].model;
28239
28240 /* CPU types are stored in the next field. */
28241 if (field_val > M_CPU_TYPE_START
28242 && field_val < M_CPU_SUBTYPE_START)
28243 {
28244 field = DECL_CHAIN (field);
28245 field_val -= M_CPU_TYPE_START;
28246 }
28247
28248 /* CPU subtypes are stored in the next field. */
28249 if (field_val > M_CPU_SUBTYPE_START)
28250 {
28251 field = DECL_CHAIN (DECL_CHAIN (field));
28252 field_val -= M_CPU_SUBTYPE_START;
28253 }
28254
28255 /* Get the appropriate field in __cpu_model. */
28256 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28257 field, NULL_TREE);
28258
28259 /* Check the value. */
28260 return build2 (EQ_EXPR, unsigned_type_node, ref,
28261 build_int_cstu (unsigned_type_node, field_val));
28262 }
28263 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28264 {
28265 tree ref;
28266 tree array_elt;
28267 tree field;
28268 unsigned int field_val = 0;
28269 unsigned int NUM_ISA_NAMES
28270 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28271
28272 for (i = 0; i < NUM_ISA_NAMES; i++)
28273 if (strcmp (isa_names_table[i].name,
28274 TREE_STRING_POINTER (param_string_cst)) == 0)
28275 break;
28276
28277 if (i == NUM_ISA_NAMES)
28278 {
28279 error ("Parameter to builtin not valid: %s",
28280 TREE_STRING_POINTER (param_string_cst));
28281 return integer_zero_node;
28282 }
28283
28284 field = TYPE_FIELDS (__processor_model_type);
28285 /* Get the last field, which is __cpu_features. */
28286 while (DECL_CHAIN (field))
28287 field = DECL_CHAIN (field);
28288
28289 /* Get the appropriate field: __cpu_model.__cpu_features */
28290 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28291 field, NULL_TREE);
28292
28293 /* Access the 0th element of __cpu_features array. */
28294 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28295 integer_zero_node, NULL_TREE, NULL_TREE);
28296
28297 field_val = (1 << isa_names_table[i].feature);
28298 /* Return __cpu_model.__cpu_features[0] & field_val */
28299 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28300 build_int_cstu (unsigned_type_node, field_val));
28301 }
28302 gcc_unreachable ();
28303 }
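
/* Informal examples of what fold_builtin_cpu produces, using the enum
   values above (the C-like expressions are only a sketch of the trees):

     __builtin_cpu_is ("corei7")
       -> __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("avx")
       -> __cpu_model.__cpu_features[0] & (1 << F_AVX)

   Note that __builtin_cpu_supports returns the masked bit, not a value
   normalized to 0 or 1.  */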
28304
28305 static tree
28306 ix86_fold_builtin (tree fndecl, int n_args,
28307 tree *args, bool ignore ATTRIBUTE_UNUSED)
28308 {
28309 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28310 {
28311 enum ix86_builtins fn_code = (enum ix86_builtins)
28312 DECL_FUNCTION_CODE (fndecl);
28313 if (fn_code == IX86_BUILTIN_CPU_IS
28314 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28315 {
28316 gcc_assert (n_args == 1);
28317 return fold_builtin_cpu (fndecl, args);
28318 }
28319 }
28320
28321 #ifdef SUBTARGET_FOLD_BUILTIN
28322 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
28323 #endif
28324
28325 return NULL_TREE;
28326 }
28327
28328 /* Make builtins to detect cpu type and features supported. NAME is
28329 the builtin name, CODE is the builtin code, and FTYPE is the function
28330 type of the builtin. */
28331
28332 static void
28333 make_cpu_type_builtin (const char* name, int code,
28334 enum ix86_builtin_func_type ftype, bool is_const)
28335 {
28336 tree decl;
28337 tree type;
28338
28339 type = ix86_get_builtin_func_type (ftype);
28340 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28341 NULL, NULL_TREE);
28342 gcc_assert (decl != NULL_TREE);
28343 ix86_builtins[(int) code] = decl;
28344 TREE_READONLY (decl) = is_const;
28345 }
28346
28347 /* Make builtins to get CPU type and features supported. The created
28348 builtins are:
28349
28350 __builtin_cpu_init (), to detect the CPU type and features,
28351 __builtin_cpu_is ("<CPUNAME>"), to check if the CPU is of type <CPUNAME>,
28352 __builtin_cpu_supports ("<FEATURE>"), to check if the CPU supports <FEATURE>.
28353 */
28354
28355 static void
28356 ix86_init_platform_type_builtins (void)
28357 {
28358 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28359 INT_FTYPE_VOID, false);
28360 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28361 INT_FTYPE_PCCHAR, true);
28362 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28363 INT_FTYPE_PCCHAR, true);
28364 }
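
/* A rough usage sketch of the builtins created above (user code, not part
   of the compiler; fast_path and generic_path are made-up helpers):

     void
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("amdfam15h") || __builtin_cpu_supports ("avx"))
         fast_path ();
       else
         generic_path ();
     }  */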
28365
28366 /* Internal method for ix86_init_builtins. */
28367
28368 static void
28369 ix86_init_builtins_va_builtins_abi (void)
28370 {
28371 tree ms_va_ref, sysv_va_ref;
28372 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28373 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28374 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28375 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28376
28377 if (!TARGET_64BIT)
28378 return;
28379 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28380 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28381 ms_va_ref = build_reference_type (ms_va_list_type_node);
28382 sysv_va_ref =
28383 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28384
28385 fnvoid_va_end_ms =
28386 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28387 fnvoid_va_start_ms =
28388 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28389 fnvoid_va_end_sysv =
28390 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28391 fnvoid_va_start_sysv =
28392 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28393 NULL_TREE);
28394 fnvoid_va_copy_ms =
28395 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28396 NULL_TREE);
28397 fnvoid_va_copy_sysv =
28398 build_function_type_list (void_type_node, sysv_va_ref,
28399 sysv_va_ref, NULL_TREE);
28400
28401 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28402 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28403 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28404 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28405 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28406 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28407 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28408 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28409 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28410 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28411 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28412 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28413 }
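
/* Rough usage sketch (64-bit user code; handle_int is a made-up helper):
   a function explicitly using the MS ABI can walk its arguments with the
   builtins registered above:

     void __attribute__ ((ms_abi))
     log_ints (int count, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, count);
       for (int i = 0; i < count; i++)
         handle_int (__builtin_va_arg (ap, int));
       __builtin_ms_va_end (ap);
     }  */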
28414
28415 static void
28416 ix86_init_builtin_types (void)
28417 {
28418 tree float128_type_node, float80_type_node;
28419
28420 /* The __float80 type. */
28421 float80_type_node = long_double_type_node;
28422 if (TYPE_MODE (float80_type_node) != XFmode)
28423 {
28424 /* The __float80 type. */
28425 float80_type_node = make_node (REAL_TYPE);
28426
28427 TYPE_PRECISION (float80_type_node) = 80;
28428 layout_type (float80_type_node);
28429 }
28430 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28431
28432 /* The __float128 type. */
28433 float128_type_node = make_node (REAL_TYPE);
28434 TYPE_PRECISION (float128_type_node) = 128;
28435 layout_type (float128_type_node);
28436 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28437
28438 /* This macro is built by i386-builtin-types.awk. */
28439 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28440 }
28441
28442 static void
28443 ix86_init_builtins (void)
28444 {
28445 tree t;
28446
28447 ix86_init_builtin_types ();
28448
28449 /* Builtins to get CPU type and features. */
28450 ix86_init_platform_type_builtins ();
28451
28452 /* TFmode support builtins. */
28453 def_builtin_const (0, "__builtin_infq",
28454 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28455 def_builtin_const (0, "__builtin_huge_valq",
28456 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28457
28458 /* We will expand them to a normal call if SSE isn't available, since
28459 they are used by libgcc. */
28460 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28461 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28462 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28463 TREE_READONLY (t) = 1;
28464 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28465
28466 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28467 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28468 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28469 TREE_READONLY (t) = 1;
28470 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28471
28472 ix86_init_tm_builtins ();
28473 ix86_init_mmx_sse_builtins ();
28474
28475 if (TARGET_LP64)
28476 ix86_init_builtins_va_builtins_abi ();
28477
28478 #ifdef SUBTARGET_INIT_BUILTINS
28479 SUBTARGET_INIT_BUILTINS;
28480 #endif
28481 }
28482
28483 /* Return the ix86 builtin for CODE. */
28484
28485 static tree
28486 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28487 {
28488 if (code >= IX86_BUILTIN_MAX)
28489 return error_mark_node;
28490
28491 return ix86_builtins[code];
28492 }
28493
28494 /* Errors in the source file can cause expand_expr to return const0_rtx
28495 where we expect a vector. To avoid crashing, use one of the vector
28496 clear instructions. */
28497 static rtx
28498 safe_vector_operand (rtx x, enum machine_mode mode)
28499 {
28500 if (x == const0_rtx)
28501 x = CONST0_RTX (mode);
28502 return x;
28503 }
28504
28505 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28506
28507 static rtx
28508 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28509 {
28510 rtx pat;
28511 tree arg0 = CALL_EXPR_ARG (exp, 0);
28512 tree arg1 = CALL_EXPR_ARG (exp, 1);
28513 rtx op0 = expand_normal (arg0);
28514 rtx op1 = expand_normal (arg1);
28515 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28516 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28517 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28518
28519 if (VECTOR_MODE_P (mode0))
28520 op0 = safe_vector_operand (op0, mode0);
28521 if (VECTOR_MODE_P (mode1))
28522 op1 = safe_vector_operand (op1, mode1);
28523
28524 if (optimize || !target
28525 || GET_MODE (target) != tmode
28526 || !insn_data[icode].operand[0].predicate (target, tmode))
28527 target = gen_reg_rtx (tmode);
28528
28529 if (GET_MODE (op1) == SImode && mode1 == TImode)
28530 {
28531 rtx x = gen_reg_rtx (V4SImode);
28532 emit_insn (gen_sse2_loadd (x, op1));
28533 op1 = gen_lowpart (TImode, x);
28534 }
28535
28536 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28537 op0 = copy_to_mode_reg (mode0, op0);
28538 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28539 op1 = copy_to_mode_reg (mode1, op1);
28540
28541 pat = GEN_FCN (icode) (target, op0, op1);
28542 if (! pat)
28543 return 0;
28544
28545 emit_insn (pat);
28546
28547 return target;
28548 }
28549
28550 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28551
28552 static rtx
28553 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28554 enum ix86_builtin_func_type m_type,
28555 enum rtx_code sub_code)
28556 {
28557 rtx pat;
28558 int i;
28559 int nargs;
28560 bool comparison_p = false;
28561 bool tf_p = false;
28562 bool last_arg_constant = false;
28563 int num_memory = 0;
28564 struct {
28565 rtx op;
28566 enum machine_mode mode;
28567 } args[4];
28568
28569 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28570
28571 switch (m_type)
28572 {
28573 case MULTI_ARG_4_DF2_DI_I:
28574 case MULTI_ARG_4_DF2_DI_I1:
28575 case MULTI_ARG_4_SF2_SI_I:
28576 case MULTI_ARG_4_SF2_SI_I1:
28577 nargs = 4;
28578 last_arg_constant = true;
28579 break;
28580
28581 case MULTI_ARG_3_SF:
28582 case MULTI_ARG_3_DF:
28583 case MULTI_ARG_3_SF2:
28584 case MULTI_ARG_3_DF2:
28585 case MULTI_ARG_3_DI:
28586 case MULTI_ARG_3_SI:
28587 case MULTI_ARG_3_SI_DI:
28588 case MULTI_ARG_3_HI:
28589 case MULTI_ARG_3_HI_SI:
28590 case MULTI_ARG_3_QI:
28591 case MULTI_ARG_3_DI2:
28592 case MULTI_ARG_3_SI2:
28593 case MULTI_ARG_3_HI2:
28594 case MULTI_ARG_3_QI2:
28595 nargs = 3;
28596 break;
28597
28598 case MULTI_ARG_2_SF:
28599 case MULTI_ARG_2_DF:
28600 case MULTI_ARG_2_DI:
28601 case MULTI_ARG_2_SI:
28602 case MULTI_ARG_2_HI:
28603 case MULTI_ARG_2_QI:
28604 nargs = 2;
28605 break;
28606
28607 case MULTI_ARG_2_DI_IMM:
28608 case MULTI_ARG_2_SI_IMM:
28609 case MULTI_ARG_2_HI_IMM:
28610 case MULTI_ARG_2_QI_IMM:
28611 nargs = 2;
28612 last_arg_constant = true;
28613 break;
28614
28615 case MULTI_ARG_1_SF:
28616 case MULTI_ARG_1_DF:
28617 case MULTI_ARG_1_SF2:
28618 case MULTI_ARG_1_DF2:
28619 case MULTI_ARG_1_DI:
28620 case MULTI_ARG_1_SI:
28621 case MULTI_ARG_1_HI:
28622 case MULTI_ARG_1_QI:
28623 case MULTI_ARG_1_SI_DI:
28624 case MULTI_ARG_1_HI_DI:
28625 case MULTI_ARG_1_HI_SI:
28626 case MULTI_ARG_1_QI_DI:
28627 case MULTI_ARG_1_QI_SI:
28628 case MULTI_ARG_1_QI_HI:
28629 nargs = 1;
28630 break;
28631
28632 case MULTI_ARG_2_DI_CMP:
28633 case MULTI_ARG_2_SI_CMP:
28634 case MULTI_ARG_2_HI_CMP:
28635 case MULTI_ARG_2_QI_CMP:
28636 nargs = 2;
28637 comparison_p = true;
28638 break;
28639
28640 case MULTI_ARG_2_SF_TF:
28641 case MULTI_ARG_2_DF_TF:
28642 case MULTI_ARG_2_DI_TF:
28643 case MULTI_ARG_2_SI_TF:
28644 case MULTI_ARG_2_HI_TF:
28645 case MULTI_ARG_2_QI_TF:
28646 nargs = 2;
28647 tf_p = true;
28648 break;
28649
28650 default:
28651 gcc_unreachable ();
28652 }
28653
28654 if (optimize || !target
28655 || GET_MODE (target) != tmode
28656 || !insn_data[icode].operand[0].predicate (target, tmode))
28657 target = gen_reg_rtx (tmode);
28658
28659 gcc_assert (nargs <= 4);
28660
28661 for (i = 0; i < nargs; i++)
28662 {
28663 tree arg = CALL_EXPR_ARG (exp, i);
28664 rtx op = expand_normal (arg);
28665 int adjust = (comparison_p) ? 1 : 0;
28666 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28667
28668 if (last_arg_constant && i == nargs - 1)
28669 {
28670 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28671 {
28672 enum insn_code new_icode = icode;
28673 switch (icode)
28674 {
28675 case CODE_FOR_xop_vpermil2v2df3:
28676 case CODE_FOR_xop_vpermil2v4sf3:
28677 case CODE_FOR_xop_vpermil2v4df3:
28678 case CODE_FOR_xop_vpermil2v8sf3:
28679 error ("the last argument must be a 2-bit immediate");
28680 return gen_reg_rtx (tmode);
28681 case CODE_FOR_xop_rotlv2di3:
28682 new_icode = CODE_FOR_rotlv2di3;
28683 goto xop_rotl;
28684 case CODE_FOR_xop_rotlv4si3:
28685 new_icode = CODE_FOR_rotlv4si3;
28686 goto xop_rotl;
28687 case CODE_FOR_xop_rotlv8hi3:
28688 new_icode = CODE_FOR_rotlv8hi3;
28689 goto xop_rotl;
28690 case CODE_FOR_xop_rotlv16qi3:
28691 new_icode = CODE_FOR_rotlv16qi3;
28692 xop_rotl:
28693 if (CONST_INT_P (op))
28694 {
28695 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28696 op = GEN_INT (INTVAL (op) & mask);
28697 gcc_checking_assert
28698 (insn_data[icode].operand[i + 1].predicate (op, mode));
28699 }
28700 else
28701 {
28702 gcc_checking_assert
28703 (nargs == 2
28704 && insn_data[new_icode].operand[0].mode == tmode
28705 && insn_data[new_icode].operand[1].mode == tmode
28706 && insn_data[new_icode].operand[2].mode == mode
28707 && insn_data[new_icode].operand[0].predicate
28708 == insn_data[icode].operand[0].predicate
28709 && insn_data[new_icode].operand[1].predicate
28710 == insn_data[icode].operand[1].predicate);
28711 icode = new_icode;
28712 goto non_constant;
28713 }
28714 break;
28715 default:
28716 gcc_unreachable ();
28717 }
28718 }
28719 }
28720 else
28721 {
28722 non_constant:
28723 if (VECTOR_MODE_P (mode))
28724 op = safe_vector_operand (op, mode);
28725
28726 /* If we aren't optimizing, only allow one memory operand to be
28727 generated. */
28728 if (memory_operand (op, mode))
28729 num_memory++;
28730
28731 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28732
28733 if (optimize
28734 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28735 || num_memory > 1)
28736 op = force_reg (mode, op);
28737 }
28738
28739 args[i].op = op;
28740 args[i].mode = mode;
28741 }
28742
28743 switch (nargs)
28744 {
28745 case 1:
28746 pat = GEN_FCN (icode) (target, args[0].op);
28747 break;
28748
28749 case 2:
28750 if (tf_p)
28751 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28752 GEN_INT ((int)sub_code));
28753 else if (! comparison_p)
28754 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28755 else
28756 {
28757 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28758 args[0].op,
28759 args[1].op);
28760
28761 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28762 }
28763 break;
28764
28765 case 3:
28766 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28767 break;
28768
28769 case 4:
28770 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28771 break;
28772
28773 default:
28774 gcc_unreachable ();
28775 }
28776
28777 if (! pat)
28778 return 0;
28779
28780 emit_insn (pat);
28781 return target;
28782 }
28783
28784 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28785 insns with vec_merge. */
28786
28787 static rtx
28788 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28789 rtx target)
28790 {
28791 rtx pat;
28792 tree arg0 = CALL_EXPR_ARG (exp, 0);
28793 rtx op1, op0 = expand_normal (arg0);
28794 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28795 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28796
28797 if (optimize || !target
28798 || GET_MODE (target) != tmode
28799 || !insn_data[icode].operand[0].predicate (target, tmode))
28800 target = gen_reg_rtx (tmode);
28801
28802 if (VECTOR_MODE_P (mode0))
28803 op0 = safe_vector_operand (op0, mode0);
28804
28805 if ((optimize && !register_operand (op0, mode0))
28806 || !insn_data[icode].operand[1].predicate (op0, mode0))
28807 op0 = copy_to_mode_reg (mode0, op0);
28808
28809 op1 = op0;
28810 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28811 op1 = copy_to_mode_reg (mode0, op1);
28812
28813 pat = GEN_FCN (icode) (target, op0, op1);
28814 if (! pat)
28815 return 0;
28816 emit_insn (pat);
28817 return target;
28818 }
28819
28820 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28821
28822 static rtx
28823 ix86_expand_sse_compare (const struct builtin_description *d,
28824 tree exp, rtx target, bool swap)
28825 {
28826 rtx pat;
28827 tree arg0 = CALL_EXPR_ARG (exp, 0);
28828 tree arg1 = CALL_EXPR_ARG (exp, 1);
28829 rtx op0 = expand_normal (arg0);
28830 rtx op1 = expand_normal (arg1);
28831 rtx op2;
28832 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28833 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28834 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28835 enum rtx_code comparison = d->comparison;
28836
28837 if (VECTOR_MODE_P (mode0))
28838 op0 = safe_vector_operand (op0, mode0);
28839 if (VECTOR_MODE_P (mode1))
28840 op1 = safe_vector_operand (op1, mode1);
28841
28842 /* Swap operands if we have a comparison that isn't available in
28843 hardware. */
28844 if (swap)
28845 {
28846 rtx tmp = gen_reg_rtx (mode1);
28847 emit_move_insn (tmp, op1);
28848 op1 = op0;
28849 op0 = tmp;
28850 }
28851
28852 if (optimize || !target
28853 || GET_MODE (target) != tmode
28854 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28855 target = gen_reg_rtx (tmode);
28856
28857 if ((optimize && !register_operand (op0, mode0))
28858 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28859 op0 = copy_to_mode_reg (mode0, op0);
28860 if ((optimize && !register_operand (op1, mode1))
28861 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28862 op1 = copy_to_mode_reg (mode1, op1);
28863
28864 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28865 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28866 if (! pat)
28867 return 0;
28868 emit_insn (pat);
28869 return target;
28870 }
28871
28872 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28873
28874 static rtx
28875 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28876 rtx target)
28877 {
28878 rtx pat;
28879 tree arg0 = CALL_EXPR_ARG (exp, 0);
28880 tree arg1 = CALL_EXPR_ARG (exp, 1);
28881 rtx op0 = expand_normal (arg0);
28882 rtx op1 = expand_normal (arg1);
28883 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28884 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28885 enum rtx_code comparison = d->comparison;
28886
28887 if (VECTOR_MODE_P (mode0))
28888 op0 = safe_vector_operand (op0, mode0);
28889 if (VECTOR_MODE_P (mode1))
28890 op1 = safe_vector_operand (op1, mode1);
28891
28892 /* Swap operands if we have a comparison that isn't available in
28893 hardware. */
28894 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28895 {
28896 rtx tmp = op1;
28897 op1 = op0;
28898 op0 = tmp;
28899 }
28900
28901 target = gen_reg_rtx (SImode);
28902 emit_move_insn (target, const0_rtx);
28903 target = gen_rtx_SUBREG (QImode, target, 0);
28904
28905 if ((optimize && !register_operand (op0, mode0))
28906 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28907 op0 = copy_to_mode_reg (mode0, op0);
28908 if ((optimize && !register_operand (op1, mode1))
28909 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28910 op1 = copy_to_mode_reg (mode1, op1);
28911
28912 pat = GEN_FCN (d->icode) (op0, op1);
28913 if (! pat)
28914 return 0;
28915 emit_insn (pat);
28916 emit_insn (gen_rtx_SET (VOIDmode,
28917 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28918 gen_rtx_fmt_ee (comparison, QImode,
28919 SET_DEST (pat),
28920 const0_rtx)));
28921
28922 return SUBREG_REG (target);
28923 }
28924
28925 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28926
28927 static rtx
28928 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28929 rtx target)
28930 {
28931 rtx pat;
28932 tree arg0 = CALL_EXPR_ARG (exp, 0);
28933 rtx op1, op0 = expand_normal (arg0);
28934 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28935 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28936
28937 if (optimize || target == 0
28938 || GET_MODE (target) != tmode
28939 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28940 target = gen_reg_rtx (tmode);
28941
28942 if (VECTOR_MODE_P (mode0))
28943 op0 = safe_vector_operand (op0, mode0);
28944
28945 if ((optimize && !register_operand (op0, mode0))
28946 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28947 op0 = copy_to_mode_reg (mode0, op0);
28948
28949 op1 = GEN_INT (d->comparison);
28950
28951 pat = GEN_FCN (d->icode) (target, op0, op1);
28952 if (! pat)
28953 return 0;
28954 emit_insn (pat);
28955 return target;
28956 }
28957
28958 static rtx
28959 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28960 tree exp, rtx target)
28961 {
28962 rtx pat;
28963 tree arg0 = CALL_EXPR_ARG (exp, 0);
28964 tree arg1 = CALL_EXPR_ARG (exp, 1);
28965 rtx op0 = expand_normal (arg0);
28966 rtx op1 = expand_normal (arg1);
28967 rtx op2;
28968 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28969 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28970 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28971
28972 if (optimize || target == 0
28973 || GET_MODE (target) != tmode
28974 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28975 target = gen_reg_rtx (tmode);
28976
28977 op0 = safe_vector_operand (op0, mode0);
28978 op1 = safe_vector_operand (op1, mode1);
28979
28980 if ((optimize && !register_operand (op0, mode0))
28981 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28982 op0 = copy_to_mode_reg (mode0, op0);
28983 if ((optimize && !register_operand (op1, mode1))
28984 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28985 op1 = copy_to_mode_reg (mode1, op1);
28986
28987 op2 = GEN_INT (d->comparison);
28988
28989 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28990 if (! pat)
28991 return 0;
28992 emit_insn (pat);
28993 return target;
28994 }
28995
28996 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28997
28998 static rtx
28999 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
29000 rtx target)
29001 {
29002 rtx pat;
29003 tree arg0 = CALL_EXPR_ARG (exp, 0);
29004 tree arg1 = CALL_EXPR_ARG (exp, 1);
29005 rtx op0 = expand_normal (arg0);
29006 rtx op1 = expand_normal (arg1);
29007 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29008 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29009 enum rtx_code comparison = d->comparison;
29010
29011 if (VECTOR_MODE_P (mode0))
29012 op0 = safe_vector_operand (op0, mode0);
29013 if (VECTOR_MODE_P (mode1))
29014 op1 = safe_vector_operand (op1, mode1);
29015
29016 target = gen_reg_rtx (SImode);
29017 emit_move_insn (target, const0_rtx);
29018 target = gen_rtx_SUBREG (QImode, target, 0);
29019
29020 if ((optimize && !register_operand (op0, mode0))
29021 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29022 op0 = copy_to_mode_reg (mode0, op0);
29023 if ((optimize && !register_operand (op1, mode1))
29024 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29025 op1 = copy_to_mode_reg (mode1, op1);
29026
29027 pat = GEN_FCN (d->icode) (op0, op1);
29028 if (! pat)
29029 return 0;
29030 emit_insn (pat);
29031 emit_insn (gen_rtx_SET (VOIDmode,
29032 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29033 gen_rtx_fmt_ee (comparison, QImode,
29034 SET_DEST (pat),
29035 const0_rtx)));
29036
29037 return SUBREG_REG (target);
29038 }
29039
29040 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
29041
29042 static rtx
29043 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
29044 tree exp, rtx target)
29045 {
29046 rtx pat;
29047 tree arg0 = CALL_EXPR_ARG (exp, 0);
29048 tree arg1 = CALL_EXPR_ARG (exp, 1);
29049 tree arg2 = CALL_EXPR_ARG (exp, 2);
29050 tree arg3 = CALL_EXPR_ARG (exp, 3);
29051 tree arg4 = CALL_EXPR_ARG (exp, 4);
29052 rtx scratch0, scratch1;
29053 rtx op0 = expand_normal (arg0);
29054 rtx op1 = expand_normal (arg1);
29055 rtx op2 = expand_normal (arg2);
29056 rtx op3 = expand_normal (arg3);
29057 rtx op4 = expand_normal (arg4);
29058 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
29059
29060 tmode0 = insn_data[d->icode].operand[0].mode;
29061 tmode1 = insn_data[d->icode].operand[1].mode;
29062 modev2 = insn_data[d->icode].operand[2].mode;
29063 modei3 = insn_data[d->icode].operand[3].mode;
29064 modev4 = insn_data[d->icode].operand[4].mode;
29065 modei5 = insn_data[d->icode].operand[5].mode;
29066 modeimm = insn_data[d->icode].operand[6].mode;
29067
29068 if (VECTOR_MODE_P (modev2))
29069 op0 = safe_vector_operand (op0, modev2);
29070 if (VECTOR_MODE_P (modev4))
29071 op2 = safe_vector_operand (op2, modev4);
29072
29073 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29074 op0 = copy_to_mode_reg (modev2, op0);
29075 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
29076 op1 = copy_to_mode_reg (modei3, op1);
29077 if ((optimize && !register_operand (op2, modev4))
29078 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
29079 op2 = copy_to_mode_reg (modev4, op2);
29080 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
29081 op3 = copy_to_mode_reg (modei5, op3);
29082
29083 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
29084 {
29085 error ("the fifth argument must be an 8-bit immediate");
29086 return const0_rtx;
29087 }
29088
29089 if (d->code == IX86_BUILTIN_PCMPESTRI128)
29090 {
29091 if (optimize || !target
29092 || GET_MODE (target) != tmode0
29093 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29094 target = gen_reg_rtx (tmode0);
29095
29096 scratch1 = gen_reg_rtx (tmode1);
29097
29098 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
29099 }
29100 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
29101 {
29102 if (optimize || !target
29103 || GET_MODE (target) != tmode1
29104 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29105 target = gen_reg_rtx (tmode1);
29106
29107 scratch0 = gen_reg_rtx (tmode0);
29108
29109 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
29110 }
29111 else
29112 {
29113 gcc_assert (d->flag);
29114
29115 scratch0 = gen_reg_rtx (tmode0);
29116 scratch1 = gen_reg_rtx (tmode1);
29117
29118 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
29119 }
29120
29121 if (! pat)
29122 return 0;
29123
29124 emit_insn (pat);
29125
29126 if (d->flag)
29127 {
29128 target = gen_reg_rtx (SImode);
29129 emit_move_insn (target, const0_rtx);
29130 target = gen_rtx_SUBREG (QImode, target, 0);
29131
29132 emit_insn
29133 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29134 gen_rtx_fmt_ee (EQ, QImode,
29135 gen_rtx_REG ((enum machine_mode) d->flag,
29136 FLAGS_REG),
29137 const0_rtx)));
29138 return SUBREG_REG (target);
29139 }
29140 else
29141 return target;
29142 }
29143
29144
29145 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
29146
29147 static rtx
29148 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
29149 tree exp, rtx target)
29150 {
29151 rtx pat;
29152 tree arg0 = CALL_EXPR_ARG (exp, 0);
29153 tree arg1 = CALL_EXPR_ARG (exp, 1);
29154 tree arg2 = CALL_EXPR_ARG (exp, 2);
29155 rtx scratch0, scratch1;
29156 rtx op0 = expand_normal (arg0);
29157 rtx op1 = expand_normal (arg1);
29158 rtx op2 = expand_normal (arg2);
29159 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
29160
29161 tmode0 = insn_data[d->icode].operand[0].mode;
29162 tmode1 = insn_data[d->icode].operand[1].mode;
29163 modev2 = insn_data[d->icode].operand[2].mode;
29164 modev3 = insn_data[d->icode].operand[3].mode;
29165 modeimm = insn_data[d->icode].operand[4].mode;
29166
29167 if (VECTOR_MODE_P (modev2))
29168 op0 = safe_vector_operand (op0, modev2);
29169 if (VECTOR_MODE_P (modev3))
29170 op1 = safe_vector_operand (op1, modev3);
29171
29172 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29173 op0 = copy_to_mode_reg (modev2, op0);
29174 if ((optimize && !register_operand (op1, modev3))
29175 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
29176 op1 = copy_to_mode_reg (modev3, op1);
29177
29178 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
29179 {
29180 error ("the third argument must be an 8-bit immediate");
29181 return const0_rtx;
29182 }
29183
29184 if (d->code == IX86_BUILTIN_PCMPISTRI128)
29185 {
29186 if (optimize || !target
29187 || GET_MODE (target) != tmode0
29188 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29189 target = gen_reg_rtx (tmode0);
29190
29191 scratch1 = gen_reg_rtx (tmode1);
29192
29193 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
29194 }
29195 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
29196 {
29197 if (optimize || !target
29198 || GET_MODE (target) != tmode1
29199 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29200 target = gen_reg_rtx (tmode1);
29201
29202 scratch0 = gen_reg_rtx (tmode0);
29203
29204 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
29205 }
29206 else
29207 {
29208 gcc_assert (d->flag);
29209
29210 scratch0 = gen_reg_rtx (tmode0);
29211 scratch1 = gen_reg_rtx (tmode1);
29212
29213 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
29214 }
29215
29216 if (! pat)
29217 return 0;
29218
29219 emit_insn (pat);
29220
29221 if (d->flag)
29222 {
29223 target = gen_reg_rtx (SImode);
29224 emit_move_insn (target, const0_rtx);
29225 target = gen_rtx_SUBREG (QImode, target, 0);
29226
29227 emit_insn
29228 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29229 gen_rtx_fmt_ee (EQ, QImode,
29230 gen_rtx_REG ((enum machine_mode) d->flag,
29231 FLAGS_REG),
29232 const0_rtx)));
29233 return SUBREG_REG (target);
29234 }
29235 else
29236 return target;
29237 }
29238
29239 /* Subroutine of ix86_expand_builtin to take care of insns with
29240 variable number of operands. */
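   A rough sketch of the mechanism: d->flag encodes an
   ix86_builtin_func_type such as V4SF_FTYPE_V4SF_V4SF, from which the
   switch below derives the number of arguments, how many trailing
   arguments must be immediates, and whether the last argument is a
   shift count; the operands are then fixed up to satisfy the insn
   predicates and the pattern is emitted.  */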
29241
29242 static rtx
29243 ix86_expand_args_builtin (const struct builtin_description *d,
29244 tree exp, rtx target)
29245 {
29246 rtx pat, real_target;
29247 unsigned int i, nargs;
29248 unsigned int nargs_constant = 0;
29249 int num_memory = 0;
29250 struct
29251 {
29252 rtx op;
29253 enum machine_mode mode;
29254 } args[4];
29255 bool last_arg_count = false;
29256 enum insn_code icode = d->icode;
29257 const struct insn_data_d *insn_p = &insn_data[icode];
29258 enum machine_mode tmode = insn_p->operand[0].mode;
29259 enum machine_mode rmode = VOIDmode;
29260 bool swap = false;
29261 enum rtx_code comparison = d->comparison;
29262
29263 switch ((enum ix86_builtin_func_type) d->flag)
29264 {
29265 case V2DF_FTYPE_V2DF_ROUND:
29266 case V4DF_FTYPE_V4DF_ROUND:
29267 case V4SF_FTYPE_V4SF_ROUND:
29268 case V8SF_FTYPE_V8SF_ROUND:
29269 case V4SI_FTYPE_V4SF_ROUND:
29270 case V8SI_FTYPE_V8SF_ROUND:
29271 return ix86_expand_sse_round (d, exp, target);
29272 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29273 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29274 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29275 case INT_FTYPE_V8SF_V8SF_PTEST:
29276 case INT_FTYPE_V4DI_V4DI_PTEST:
29277 case INT_FTYPE_V4DF_V4DF_PTEST:
29278 case INT_FTYPE_V4SF_V4SF_PTEST:
29279 case INT_FTYPE_V2DI_V2DI_PTEST:
29280 case INT_FTYPE_V2DF_V2DF_PTEST:
29281 return ix86_expand_sse_ptest (d, exp, target);
29282 case FLOAT128_FTYPE_FLOAT128:
29283 case FLOAT_FTYPE_FLOAT:
29284 case INT_FTYPE_INT:
29285 case UINT64_FTYPE_INT:
29286 case UINT16_FTYPE_UINT16:
29287 case INT64_FTYPE_INT64:
29288 case INT64_FTYPE_V4SF:
29289 case INT64_FTYPE_V2DF:
29290 case INT_FTYPE_V16QI:
29291 case INT_FTYPE_V8QI:
29292 case INT_FTYPE_V8SF:
29293 case INT_FTYPE_V4DF:
29294 case INT_FTYPE_V4SF:
29295 case INT_FTYPE_V2DF:
29296 case INT_FTYPE_V32QI:
29297 case V16QI_FTYPE_V16QI:
29298 case V8SI_FTYPE_V8SF:
29299 case V8SI_FTYPE_V4SI:
29300 case V8HI_FTYPE_V8HI:
29301 case V8HI_FTYPE_V16QI:
29302 case V8QI_FTYPE_V8QI:
29303 case V8SF_FTYPE_V8SF:
29304 case V8SF_FTYPE_V8SI:
29305 case V8SF_FTYPE_V4SF:
29306 case V8SF_FTYPE_V8HI:
29307 case V4SI_FTYPE_V4SI:
29308 case V4SI_FTYPE_V16QI:
29309 case V4SI_FTYPE_V4SF:
29310 case V4SI_FTYPE_V8SI:
29311 case V4SI_FTYPE_V8HI:
29312 case V4SI_FTYPE_V4DF:
29313 case V4SI_FTYPE_V2DF:
29314 case V4HI_FTYPE_V4HI:
29315 case V4DF_FTYPE_V4DF:
29316 case V4DF_FTYPE_V4SI:
29317 case V4DF_FTYPE_V4SF:
29318 case V4DF_FTYPE_V2DF:
29319 case V4SF_FTYPE_V4SF:
29320 case V4SF_FTYPE_V4SI:
29321 case V4SF_FTYPE_V8SF:
29322 case V4SF_FTYPE_V4DF:
29323 case V4SF_FTYPE_V8HI:
29324 case V4SF_FTYPE_V2DF:
29325 case V2DI_FTYPE_V2DI:
29326 case V2DI_FTYPE_V16QI:
29327 case V2DI_FTYPE_V8HI:
29328 case V2DI_FTYPE_V4SI:
29329 case V2DF_FTYPE_V2DF:
29330 case V2DF_FTYPE_V4SI:
29331 case V2DF_FTYPE_V4DF:
29332 case V2DF_FTYPE_V4SF:
29333 case V2DF_FTYPE_V2SI:
29334 case V2SI_FTYPE_V2SI:
29335 case V2SI_FTYPE_V4SF:
29336 case V2SI_FTYPE_V2SF:
29337 case V2SI_FTYPE_V2DF:
29338 case V2SF_FTYPE_V2SF:
29339 case V2SF_FTYPE_V2SI:
29340 case V32QI_FTYPE_V32QI:
29341 case V32QI_FTYPE_V16QI:
29342 case V16HI_FTYPE_V16HI:
29343 case V16HI_FTYPE_V8HI:
29344 case V8SI_FTYPE_V8SI:
29345 case V16HI_FTYPE_V16QI:
29346 case V8SI_FTYPE_V16QI:
29347 case V4DI_FTYPE_V16QI:
29348 case V8SI_FTYPE_V8HI:
29349 case V4DI_FTYPE_V8HI:
29350 case V4DI_FTYPE_V4SI:
29351 case V4DI_FTYPE_V2DI:
29352 nargs = 1;
29353 break;
29354 case V4SF_FTYPE_V4SF_VEC_MERGE:
29355 case V2DF_FTYPE_V2DF_VEC_MERGE:
29356 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29357 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29358 case V16QI_FTYPE_V16QI_V16QI:
29359 case V16QI_FTYPE_V8HI_V8HI:
29360 case V8QI_FTYPE_V8QI_V8QI:
29361 case V8QI_FTYPE_V4HI_V4HI:
29362 case V8HI_FTYPE_V8HI_V8HI:
29363 case V8HI_FTYPE_V16QI_V16QI:
29364 case V8HI_FTYPE_V4SI_V4SI:
29365 case V8SF_FTYPE_V8SF_V8SF:
29366 case V8SF_FTYPE_V8SF_V8SI:
29367 case V4SI_FTYPE_V4SI_V4SI:
29368 case V4SI_FTYPE_V8HI_V8HI:
29369 case V4SI_FTYPE_V4SF_V4SF:
29370 case V4SI_FTYPE_V2DF_V2DF:
29371 case V4HI_FTYPE_V4HI_V4HI:
29372 case V4HI_FTYPE_V8QI_V8QI:
29373 case V4HI_FTYPE_V2SI_V2SI:
29374 case V4DF_FTYPE_V4DF_V4DF:
29375 case V4DF_FTYPE_V4DF_V4DI:
29376 case V4SF_FTYPE_V4SF_V4SF:
29377 case V4SF_FTYPE_V4SF_V4SI:
29378 case V4SF_FTYPE_V4SF_V2SI:
29379 case V4SF_FTYPE_V4SF_V2DF:
29380 case V4SF_FTYPE_V4SF_DI:
29381 case V4SF_FTYPE_V4SF_SI:
29382 case V2DI_FTYPE_V2DI_V2DI:
29383 case V2DI_FTYPE_V16QI_V16QI:
29384 case V2DI_FTYPE_V4SI_V4SI:
29385 case V2UDI_FTYPE_V4USI_V4USI:
29386 case V2DI_FTYPE_V2DI_V16QI:
29387 case V2DI_FTYPE_V2DF_V2DF:
29388 case V2SI_FTYPE_V2SI_V2SI:
29389 case V2SI_FTYPE_V4HI_V4HI:
29390 case V2SI_FTYPE_V2SF_V2SF:
29391 case V2DF_FTYPE_V2DF_V2DF:
29392 case V2DF_FTYPE_V2DF_V4SF:
29393 case V2DF_FTYPE_V2DF_V2DI:
29394 case V2DF_FTYPE_V2DF_DI:
29395 case V2DF_FTYPE_V2DF_SI:
29396 case V2SF_FTYPE_V2SF_V2SF:
29397 case V1DI_FTYPE_V1DI_V1DI:
29398 case V1DI_FTYPE_V8QI_V8QI:
29399 case V1DI_FTYPE_V2SI_V2SI:
29400 case V32QI_FTYPE_V16HI_V16HI:
29401 case V16HI_FTYPE_V8SI_V8SI:
29402 case V32QI_FTYPE_V32QI_V32QI:
29403 case V16HI_FTYPE_V32QI_V32QI:
29404 case V16HI_FTYPE_V16HI_V16HI:
29405 case V8SI_FTYPE_V4DF_V4DF:
29406 case V8SI_FTYPE_V8SI_V8SI:
29407 case V8SI_FTYPE_V16HI_V16HI:
29408 case V4DI_FTYPE_V4DI_V4DI:
29409 case V4DI_FTYPE_V8SI_V8SI:
29410 case V4UDI_FTYPE_V8USI_V8USI:
29411 if (comparison == UNKNOWN)
29412 return ix86_expand_binop_builtin (icode, exp, target);
29413 nargs = 2;
29414 break;
29415 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29416 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29417 gcc_assert (comparison != UNKNOWN);
29418 nargs = 2;
29419 swap = true;
29420 break;
29421 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29422 case V16HI_FTYPE_V16HI_SI_COUNT:
29423 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29424 case V8SI_FTYPE_V8SI_SI_COUNT:
29425 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29426 case V4DI_FTYPE_V4DI_INT_COUNT:
29427 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29428 case V8HI_FTYPE_V8HI_SI_COUNT:
29429 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29430 case V4SI_FTYPE_V4SI_SI_COUNT:
29431 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29432 case V4HI_FTYPE_V4HI_SI_COUNT:
29433 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29434 case V2DI_FTYPE_V2DI_SI_COUNT:
29435 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29436 case V2SI_FTYPE_V2SI_SI_COUNT:
29437 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29438 case V1DI_FTYPE_V1DI_SI_COUNT:
29439 nargs = 2;
29440 last_arg_count = true;
29441 break;
29442 case UINT64_FTYPE_UINT64_UINT64:
29443 case UINT_FTYPE_UINT_UINT:
29444 case UINT_FTYPE_UINT_USHORT:
29445 case UINT_FTYPE_UINT_UCHAR:
29446 case UINT16_FTYPE_UINT16_INT:
29447 case UINT8_FTYPE_UINT8_INT:
29448 nargs = 2;
29449 break;
29450 case V2DI_FTYPE_V2DI_INT_CONVERT:
29451 nargs = 2;
29452 rmode = V1TImode;
29453 nargs_constant = 1;
29454 break;
29455 case V4DI_FTYPE_V4DI_INT_CONVERT:
29456 nargs = 2;
29457 rmode = V2TImode;
29458 nargs_constant = 1;
29459 break;
29460 case V8HI_FTYPE_V8HI_INT:
29461 case V8HI_FTYPE_V8SF_INT:
29462 case V8HI_FTYPE_V4SF_INT:
29463 case V8SF_FTYPE_V8SF_INT:
29464 case V4SI_FTYPE_V4SI_INT:
29465 case V4SI_FTYPE_V8SI_INT:
29466 case V4HI_FTYPE_V4HI_INT:
29467 case V4DF_FTYPE_V4DF_INT:
29468 case V4SF_FTYPE_V4SF_INT:
29469 case V4SF_FTYPE_V8SF_INT:
29470 case V2DI_FTYPE_V2DI_INT:
29471 case V2DF_FTYPE_V2DF_INT:
29472 case V2DF_FTYPE_V4DF_INT:
29473 case V16HI_FTYPE_V16HI_INT:
29474 case V8SI_FTYPE_V8SI_INT:
29475 case V4DI_FTYPE_V4DI_INT:
29476 case V2DI_FTYPE_V4DI_INT:
29477 nargs = 2;
29478 nargs_constant = 1;
29479 break;
29480 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29481 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29482 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29483 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29484 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29485 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29486 nargs = 3;
29487 break;
29488 case V32QI_FTYPE_V32QI_V32QI_INT:
29489 case V16HI_FTYPE_V16HI_V16HI_INT:
29490 case V16QI_FTYPE_V16QI_V16QI_INT:
29491 case V4DI_FTYPE_V4DI_V4DI_INT:
29492 case V8HI_FTYPE_V8HI_V8HI_INT:
29493 case V8SI_FTYPE_V8SI_V8SI_INT:
29494 case V8SI_FTYPE_V8SI_V4SI_INT:
29495 case V8SF_FTYPE_V8SF_V8SF_INT:
29496 case V8SF_FTYPE_V8SF_V4SF_INT:
29497 case V4SI_FTYPE_V4SI_V4SI_INT:
29498 case V4DF_FTYPE_V4DF_V4DF_INT:
29499 case V4DF_FTYPE_V4DF_V2DF_INT:
29500 case V4SF_FTYPE_V4SF_V4SF_INT:
29501 case V2DI_FTYPE_V2DI_V2DI_INT:
29502 case V4DI_FTYPE_V4DI_V2DI_INT:
29503 case V2DF_FTYPE_V2DF_V2DF_INT:
29504 nargs = 3;
29505 nargs_constant = 1;
29506 break;
29507 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29508 nargs = 3;
29509 rmode = V4DImode;
29510 nargs_constant = 1;
29511 break;
29512 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29513 nargs = 3;
29514 rmode = V2DImode;
29515 nargs_constant = 1;
29516 break;
29517 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29518 nargs = 3;
29519 rmode = DImode;
29520 nargs_constant = 1;
29521 break;
29522 case V2DI_FTYPE_V2DI_UINT_UINT:
29523 nargs = 3;
29524 nargs_constant = 2;
29525 break;
29526 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29527 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29528 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29529 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29530 nargs = 4;
29531 nargs_constant = 1;
29532 break;
29533 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29534 nargs = 4;
29535 nargs_constant = 2;
29536 break;
29537 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
29538 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
29539 nargs = 4;
29540 break;
29541 default:
29542 gcc_unreachable ();
29543 }
29544
29545 gcc_assert (nargs <= ARRAY_SIZE (args));
29546
29547 if (comparison != UNKNOWN)
29548 {
29549 gcc_assert (nargs == 2);
29550 return ix86_expand_sse_compare (d, exp, target, swap);
29551 }
29552
29553 if (rmode == VOIDmode || rmode == tmode)
29554 {
29555 if (optimize
29556 || target == 0
29557 || GET_MODE (target) != tmode
29558 || !insn_p->operand[0].predicate (target, tmode))
29559 target = gen_reg_rtx (tmode);
29560 real_target = target;
29561 }
29562 else
29563 {
29564 target = gen_reg_rtx (rmode);
29565 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29566 }
29567
29568 for (i = 0; i < nargs; i++)
29569 {
29570 tree arg = CALL_EXPR_ARG (exp, i);
29571 rtx op = expand_normal (arg);
29572 enum machine_mode mode = insn_p->operand[i + 1].mode;
29573 bool match = insn_p->operand[i + 1].predicate (op, mode);
29574
29575 if (last_arg_count && (i + 1) == nargs)
29576 {
29577 /* SIMD shift insns take either an 8-bit immediate or a
29578    register as the count, but the builtin functions take an
29579    int.  If the count doesn't match, we put it in a register. */
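/* For example (assuming the usual intrinsic mapping), _mm_slli_epi32
   takes its count as a plain int; when that operand does not already
   satisfy the predicate it is narrowed to SImode and, if necessary,
   copied into a register here.  */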
29580 if (!match)
29581 {
29582 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29583 if (!insn_p->operand[i + 1].predicate (op, mode))
29584 op = copy_to_reg (op);
29585 }
29586 }
29587 else if ((nargs - i) <= nargs_constant)
29588 {
29589 if (!match)
29590 switch (icode)
29591 {
29592 case CODE_FOR_avx2_inserti128:
29593 case CODE_FOR_avx2_extracti128:
29594 error ("the last argument must be a 1-bit immediate");
29595 return const0_rtx;
29596
29597 case CODE_FOR_sse4_1_roundsd:
29598 case CODE_FOR_sse4_1_roundss:
29599
29600 case CODE_FOR_sse4_1_roundpd:
29601 case CODE_FOR_sse4_1_roundps:
29602 case CODE_FOR_avx_roundpd256:
29603 case CODE_FOR_avx_roundps256:
29604
29605 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29606 case CODE_FOR_sse4_1_roundps_sfix:
29607 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29608 case CODE_FOR_avx_roundps_sfix256:
29609
29610 case CODE_FOR_sse4_1_blendps:
29611 case CODE_FOR_avx_blendpd256:
29612 case CODE_FOR_avx_vpermilv4df:
29613 error ("the last argument must be a 4-bit immediate");
29614 return const0_rtx;
29615
29616 case CODE_FOR_sse4_1_blendpd:
29617 case CODE_FOR_avx_vpermilv2df:
29618 case CODE_FOR_xop_vpermil2v2df3:
29619 case CODE_FOR_xop_vpermil2v4sf3:
29620 case CODE_FOR_xop_vpermil2v4df3:
29621 case CODE_FOR_xop_vpermil2v8sf3:
29622 error ("the last argument must be a 2-bit immediate");
29623 return const0_rtx;
29624
29625 case CODE_FOR_avx_vextractf128v4df:
29626 case CODE_FOR_avx_vextractf128v8sf:
29627 case CODE_FOR_avx_vextractf128v8si:
29628 case CODE_FOR_avx_vinsertf128v4df:
29629 case CODE_FOR_avx_vinsertf128v8sf:
29630 case CODE_FOR_avx_vinsertf128v8si:
29631 error ("the last argument must be a 1-bit immediate");
29632 return const0_rtx;
29633
29634 case CODE_FOR_avx_vmcmpv2df3:
29635 case CODE_FOR_avx_vmcmpv4sf3:
29636 case CODE_FOR_avx_cmpv2df3:
29637 case CODE_FOR_avx_cmpv4sf3:
29638 case CODE_FOR_avx_cmpv4df3:
29639 case CODE_FOR_avx_cmpv8sf3:
29640 error ("the last argument must be a 5-bit immediate");
29641 return const0_rtx;
29642
29643 default:
29644 switch (nargs_constant)
29645 {
29646 case 2:
29647 if ((nargs - i) == nargs_constant)
29648 {
29649 error ("the next to last argument must be an 8-bit immediate");
29650 break;
29651 }
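/* FALLTHRU */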
29652 case 1:
29653 error ("the last argument must be an 8-bit immediate");
29654 break;
29655 default:
29656 gcc_unreachable ();
29657 }
29658 return const0_rtx;
29659 }
29660 }
29661 else
29662 {
29663 if (VECTOR_MODE_P (mode))
29664 op = safe_vector_operand (op, mode);
29665
29666 /* If we aren't optimizing, only allow one memory operand to
29667 be generated. */
29668 if (memory_operand (op, mode))
29669 num_memory++;
29670
29671 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29672 {
29673 if (optimize || !match || num_memory > 1)
29674 op = copy_to_mode_reg (mode, op);
29675 }
29676 else
29677 {
29678 op = copy_to_reg (op);
29679 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29680 }
29681 }
29682
29683 args[i].op = op;
29684 args[i].mode = mode;
29685 }
29686
29687 switch (nargs)
29688 {
29689 case 1:
29690 pat = GEN_FCN (icode) (real_target, args[0].op);
29691 break;
29692 case 2:
29693 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29694 break;
29695 case 3:
29696 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29697 args[2].op);
29698 break;
29699 case 4:
29700 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29701 args[2].op, args[3].op);
29702 break;
29703 default:
29704 gcc_unreachable ();
29705 }
29706
29707 if (! pat)
29708 return 0;
29709
29710 emit_insn (pat);
29711 return target;
29712 }
29713
29714 /* Subroutine of ix86_expand_builtin to take care of special insns
29715 with variable number of operands. */
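   These cover the load/store style builtins.  For example (assuming the
   usual intrinsic mapping), _mm_loadu_ps uses a V4SF_FTYPE_PCFLOAT
   builtin and is classified below as a load whose first argument is the
   memory operand, while _mm_storeu_ps uses VOID_FTYPE_PFLOAT_V4SF and
   is classified as a store whose target is the memory operand.  */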
29716
29717 static rtx
29718 ix86_expand_special_args_builtin (const struct builtin_description *d,
29719 tree exp, rtx target)
29720 {
29721 tree arg;
29722 rtx pat, op;
29723 unsigned int i, nargs, arg_adjust, memory;
29724 struct
29725 {
29726 rtx op;
29727 enum machine_mode mode;
29728 } args[3];
29729 enum insn_code icode = d->icode;
29730 bool last_arg_constant = false;
29731 const struct insn_data_d *insn_p = &insn_data[icode];
29732 enum machine_mode tmode = insn_p->operand[0].mode;
29733 enum { load, store } klass;
29734
29735 switch ((enum ix86_builtin_func_type) d->flag)
29736 {
29737 case VOID_FTYPE_VOID:
29738 if (icode == CODE_FOR_avx_vzeroupper)
29739 target = GEN_INT (vzeroupper_intrinsic);
29740 emit_insn (GEN_FCN (icode) (target));
29741 return 0;
29742 case VOID_FTYPE_UINT64:
29743 case VOID_FTYPE_UNSIGNED:
29744 nargs = 0;
29745 klass = store;
29746 memory = 0;
29747 break;
29748
29749 case INT_FTYPE_VOID:
29750 case UINT64_FTYPE_VOID:
29751 case UNSIGNED_FTYPE_VOID:
29752 nargs = 0;
29753 klass = load;
29754 memory = 0;
29755 break;
29756 case UINT64_FTYPE_PUNSIGNED:
29757 case V2DI_FTYPE_PV2DI:
29758 case V4DI_FTYPE_PV4DI:
29759 case V32QI_FTYPE_PCCHAR:
29760 case V16QI_FTYPE_PCCHAR:
29761 case V8SF_FTYPE_PCV4SF:
29762 case V8SF_FTYPE_PCFLOAT:
29763 case V4SF_FTYPE_PCFLOAT:
29764 case V4DF_FTYPE_PCV2DF:
29765 case V4DF_FTYPE_PCDOUBLE:
29766 case V2DF_FTYPE_PCDOUBLE:
29767 case VOID_FTYPE_PVOID:
29768 nargs = 1;
29769 klass = load;
29770 memory = 0;
29771 break;
29772 case VOID_FTYPE_PV2SF_V4SF:
29773 case VOID_FTYPE_PV4DI_V4DI:
29774 case VOID_FTYPE_PV2DI_V2DI:
29775 case VOID_FTYPE_PCHAR_V32QI:
29776 case VOID_FTYPE_PCHAR_V16QI:
29777 case VOID_FTYPE_PFLOAT_V8SF:
29778 case VOID_FTYPE_PFLOAT_V4SF:
29779 case VOID_FTYPE_PDOUBLE_V4DF:
29780 case VOID_FTYPE_PDOUBLE_V2DF:
29781 case VOID_FTYPE_PLONGLONG_LONGLONG:
29782 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29783 case VOID_FTYPE_PINT_INT:
29784 nargs = 1;
29785 klass = store;
29786 /* Reserve memory operand for target. */
29787 memory = ARRAY_SIZE (args);
29788 break;
29789 case V4SF_FTYPE_V4SF_PCV2SF:
29790 case V2DF_FTYPE_V2DF_PCDOUBLE:
29791 nargs = 2;
29792 klass = load;
29793 memory = 1;
29794 break;
29795 case V8SF_FTYPE_PCV8SF_V8SI:
29796 case V4DF_FTYPE_PCV4DF_V4DI:
29797 case V4SF_FTYPE_PCV4SF_V4SI:
29798 case V2DF_FTYPE_PCV2DF_V2DI:
29799 case V8SI_FTYPE_PCV8SI_V8SI:
29800 case V4DI_FTYPE_PCV4DI_V4DI:
29801 case V4SI_FTYPE_PCV4SI_V4SI:
29802 case V2DI_FTYPE_PCV2DI_V2DI:
29803 nargs = 2;
29804 klass = load;
29805 memory = 0;
29806 break;
29807 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29808 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29809 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29810 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29811 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29812 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29813 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29814 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29815 nargs = 2;
29816 klass = store;
29817 /* Reserve memory operand for target. */
29818 memory = ARRAY_SIZE (args);
29819 break;
29820 case VOID_FTYPE_UINT_UINT_UINT:
29821 case VOID_FTYPE_UINT64_UINT_UINT:
29822 case UCHAR_FTYPE_UINT_UINT_UINT:
29823 case UCHAR_FTYPE_UINT64_UINT_UINT:
29824 nargs = 3;
29825 klass = load;
29826 memory = ARRAY_SIZE (args);
29827 last_arg_constant = true;
29828 break;
29829 default:
29830 gcc_unreachable ();
29831 }
29832
29833 gcc_assert (nargs <= ARRAY_SIZE (args));
29834
29835 if (klass == store)
29836 {
29837 arg = CALL_EXPR_ARG (exp, 0);
29838 op = expand_normal (arg);
29839 gcc_assert (target == 0);
29840 if (memory)
29841 {
29842 if (GET_MODE (op) != Pmode)
29843 op = convert_to_mode (Pmode, op, 1);
29844 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29845 }
29846 else
29847 target = force_reg (tmode, op);
29848 arg_adjust = 1;
29849 }
29850 else
29851 {
29852 arg_adjust = 0;
29853 if (optimize
29854 || target == 0
29855 || !register_operand (target, tmode)
29856 || GET_MODE (target) != tmode)
29857 target = gen_reg_rtx (tmode);
29858 }
29859
29860 for (i = 0; i < nargs; i++)
29861 {
29862 enum machine_mode mode = insn_p->operand[i + 1].mode;
29863 bool match;
29864
29865 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29866 op = expand_normal (arg);
29867 match = insn_p->operand[i + 1].predicate (op, mode);
29868
29869 if (last_arg_constant && (i + 1) == nargs)
29870 {
29871 if (!match)
29872 {
29873 if (icode == CODE_FOR_lwp_lwpvalsi3
29874 || icode == CODE_FOR_lwp_lwpinssi3
29875 || icode == CODE_FOR_lwp_lwpvaldi3
29876 || icode == CODE_FOR_lwp_lwpinsdi3)
29877 error ("the last argument must be a 32-bit immediate");
29878 else
29879 error ("the last argument must be an 8-bit immediate");
29880 return const0_rtx;
29881 }
29882 }
29883 else
29884 {
29885 if (i == memory)
29886 {
29887 /* This must be the memory operand. */
29888 if (GET_MODE (op) != Pmode)
29889 op = convert_to_mode (Pmode, op, 1);
29890 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29891 gcc_assert (GET_MODE (op) == mode
29892 || GET_MODE (op) == VOIDmode);
29893 }
29894 else
29895 {
29896 /* This must be a register. */
29897 if (VECTOR_MODE_P (mode))
29898 op = safe_vector_operand (op, mode);
29899
29900 gcc_assert (GET_MODE (op) == mode
29901 || GET_MODE (op) == VOIDmode);
29902 op = copy_to_mode_reg (mode, op);
29903 }
29904 }
29905
29906 args[i].op = op;
29907 args[i].mode = mode;
29908 }
29909
29910 switch (nargs)
29911 {
29912 case 0:
29913 pat = GEN_FCN (icode) (target);
29914 break;
29915 case 1:
29916 pat = GEN_FCN (icode) (target, args[0].op);
29917 break;
29918 case 2:
29919 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29920 break;
29921 case 3:
29922 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29923 break;
29924 default:
29925 gcc_unreachable ();
29926 }
29927
29928 if (! pat)
29929 return 0;
29930 emit_insn (pat);
29931 return klass == store ? 0 : target;
29932 }
29933
29934 /* Return the integer constant in ARG. Constrain it to be in the range
29935 of the subparts of VEC_TYPE; issue an error if not. */
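   For example, with a V4SI vector type the valid selectors are 0..3; a
   non-constant or out-of-range selector is diagnosed and 0 is returned
   instead.  */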
29936
29937 static int
29938 get_element_number (tree vec_type, tree arg)
29939 {
29940 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29941
29942 if (!host_integerp (arg, 1)
29943 || (elt = tree_low_cst (arg, 1), elt > max))
29944 {
29945 error ("selector must be an integer constant in the range 0..%wi", max);
29946 return 0;
29947 }
29948
29949 return elt;
29950 }
29951
29952 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29953 ix86_expand_vector_init. We DO have language-level syntax for this, in
29954 the form of (type){ init-list }. Except that since we can't place emms
29955 instructions from inside the compiler, we can't allow the use of MMX
29956 registers unless the user explicitly asks for it. So we do *not* define
29957 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29958 we have builtins invoked by mmintrin.h that give us license to emit
29959 these sorts of instructions. */
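   For example (assuming the usual mmintrin.h mapping), _mm_set_pi32
   expands to __builtin_ia32_vec_init_v2si, which is handled here.  */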
29960
29961 static rtx
29962 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29963 {
29964 enum machine_mode tmode = TYPE_MODE (type);
29965 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29966 int i, n_elt = GET_MODE_NUNITS (tmode);
29967 rtvec v = rtvec_alloc (n_elt);
29968
29969 gcc_assert (VECTOR_MODE_P (tmode));
29970 gcc_assert (call_expr_nargs (exp) == n_elt);
29971
29972 for (i = 0; i < n_elt; ++i)
29973 {
29974 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29975 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29976 }
29977
29978 if (!target || !register_operand (target, tmode))
29979 target = gen_reg_rtx (tmode);
29980
29981 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29982 return target;
29983 }
29984
29985 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29986 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29987 had a language-level syntax for referencing vector elements. */
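   For example (assuming the usual intrinsic mapping), _mm_extract_epi32
   uses __builtin_ia32_vec_ext_v4si, which is expanded here.  */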
29988
29989 static rtx
29990 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29991 {
29992 enum machine_mode tmode, mode0;
29993 tree arg0, arg1;
29994 int elt;
29995 rtx op0;
29996
29997 arg0 = CALL_EXPR_ARG (exp, 0);
29998 arg1 = CALL_EXPR_ARG (exp, 1);
29999
30000 op0 = expand_normal (arg0);
30001 elt = get_element_number (TREE_TYPE (arg0), arg1);
30002
30003 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30004 mode0 = TYPE_MODE (TREE_TYPE (arg0));
30005 gcc_assert (VECTOR_MODE_P (mode0));
30006
30007 op0 = force_reg (mode0, op0);
30008
30009 if (optimize || !target || !register_operand (target, tmode))
30010 target = gen_reg_rtx (tmode);
30011
30012 ix86_expand_vector_extract (true, target, op0, elt);
30013
30014 return target;
30015 }
30016
30017 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30018 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
30019 a language-level syntax for referencing vector elements. */
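   For example (assuming the usual intrinsic mapping), _mm_insert_epi32
   uses __builtin_ia32_vec_set_v4si, which is expanded here.  */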
30020
30021 static rtx
30022 ix86_expand_vec_set_builtin (tree exp)
30023 {
30024 enum machine_mode tmode, mode1;
30025 tree arg0, arg1, arg2;
30026 int elt;
30027 rtx op0, op1, target;
30028
30029 arg0 = CALL_EXPR_ARG (exp, 0);
30030 arg1 = CALL_EXPR_ARG (exp, 1);
30031 arg2 = CALL_EXPR_ARG (exp, 2);
30032
30033 tmode = TYPE_MODE (TREE_TYPE (arg0));
30034 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30035 gcc_assert (VECTOR_MODE_P (tmode));
30036
30037 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
30038 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
30039 elt = get_element_number (TREE_TYPE (arg0), arg2);
30040
30041 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
30042 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
30043
30044 op0 = force_reg (tmode, op0);
30045 op1 = force_reg (mode1, op1);
30046
30047 /* OP0 is the source of these builtin functions and shouldn't be
30048 modified. Create a copy, use it and return it as target. */
30049 target = gen_reg_rtx (tmode);
30050 emit_move_insn (target, op0);
30051 ix86_expand_vector_set (true, target, op1, elt);
30052
30053 return target;
30054 }
30055
30056 /* Expand an expression EXP that calls a built-in function,
30057 with result going to TARGET if that's convenient
30058 (and in mode MODE if that's convenient).
30059 SUBTARGET may be used as the target for computing one of EXP's operands.
30060 IGNORE is nonzero if the value is to be ignored. */
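   Dispatch summary: fold the __builtin_cpu_* builtins first, then check
   that the builtin is available in the current ISA, expand the builtins
   that need hand-written sequences, and finally look the code up in the
   bdesc_* tables and use one of the generic expanders above.  */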
30061
30062 static rtx
30063 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
30064 enum machine_mode mode ATTRIBUTE_UNUSED,
30065 int ignore ATTRIBUTE_UNUSED)
30066 {
30067 const struct builtin_description *d;
30068 size_t i;
30069 enum insn_code icode;
30070 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
30071 tree arg0, arg1, arg2, arg3, arg4;
30072 rtx op0, op1, op2, op3, op4, pat;
30073 enum machine_mode mode0, mode1, mode2, mode3, mode4;
30074 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
30075
30076 /* For CPU builtins that can be folded, fold first and expand the fold. */
30077 switch (fcode)
30078 {
30079 case IX86_BUILTIN_CPU_INIT:
30080 {
30081 /* Make it call __cpu_indicator_init in libgcc. */
30082 tree call_expr, fndecl, type;
30083 type = build_function_type_list (integer_type_node, NULL_TREE);
30084 fndecl = build_fn_decl ("__cpu_indicator_init", type);
30085 call_expr = build_call_expr (fndecl, 0);
30086 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
30087 }
30088 case IX86_BUILTIN_CPU_IS:
30089 case IX86_BUILTIN_CPU_SUPPORTS:
30090 {
30091 tree arg0 = CALL_EXPR_ARG (exp, 0);
30092 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
30093 gcc_assert (fold_expr != NULL_TREE);
30094 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
30095 }
30096 }
30097
30098 /* Determine whether the builtin function is available under the current ISA.
30099 Originally the builtin was not created if it wasn't applicable to the
30100 current ISA based on the command line switches. With function specific
30101 options, we need to check in the context of the function making the call
30102 whether it is supported. */
30103 if (ix86_builtins_isa[fcode].isa
30104 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
30105 {
30106 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
30107 NULL, (enum fpmath_unit) 0, false);
30108
30109 if (!opts)
30110 error ("%qE needs unknown isa option", fndecl);
30111 else
30112 {
30113 gcc_assert (opts != NULL);
30114 error ("%qE needs isa option %s", fndecl, opts);
30115 free (opts);
30116 }
30117 return const0_rtx;
30118 }
30119
30120 switch (fcode)
30121 {
30122 case IX86_BUILTIN_MASKMOVQ:
30123 case IX86_BUILTIN_MASKMOVDQU:
30124 icode = (fcode == IX86_BUILTIN_MASKMOVQ
30125 ? CODE_FOR_mmx_maskmovq
30126 : CODE_FOR_sse2_maskmovdqu);
30127 /* Note the arg order is different from the operand order. */
30128 arg1 = CALL_EXPR_ARG (exp, 0);
30129 arg2 = CALL_EXPR_ARG (exp, 1);
30130 arg0 = CALL_EXPR_ARG (exp, 2);
30131 op0 = expand_normal (arg0);
30132 op1 = expand_normal (arg1);
30133 op2 = expand_normal (arg2);
30134 mode0 = insn_data[icode].operand[0].mode;
30135 mode1 = insn_data[icode].operand[1].mode;
30136 mode2 = insn_data[icode].operand[2].mode;
30137
30138 if (GET_MODE (op0) != Pmode)
30139 op0 = convert_to_mode (Pmode, op0, 1);
30140 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
30141
30142 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30143 op0 = copy_to_mode_reg (mode0, op0);
30144 if (!insn_data[icode].operand[1].predicate (op1, mode1))
30145 op1 = copy_to_mode_reg (mode1, op1);
30146 if (!insn_data[icode].operand[2].predicate (op2, mode2))
30147 op2 = copy_to_mode_reg (mode2, op2);
30148 pat = GEN_FCN (icode) (op0, op1, op2);
30149 if (! pat)
30150 return 0;
30151 emit_insn (pat);
30152 return 0;
30153
30154 case IX86_BUILTIN_LDMXCSR:
30155 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
30156 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30157 emit_move_insn (target, op0);
30158 emit_insn (gen_sse_ldmxcsr (target));
30159 return 0;
30160
30161 case IX86_BUILTIN_STMXCSR:
30162 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30163 emit_insn (gen_sse_stmxcsr (target));
30164 return copy_to_mode_reg (SImode, target);
30165
30166 case IX86_BUILTIN_CLFLUSH:
30167 arg0 = CALL_EXPR_ARG (exp, 0);
30168 op0 = expand_normal (arg0);
30169 icode = CODE_FOR_sse2_clflush;
30170 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30171 {
30172 if (GET_MODE (op0) != Pmode)
30173 op0 = convert_to_mode (Pmode, op0, 1);
30174 op0 = force_reg (Pmode, op0);
30175 }
30176
30177 emit_insn (gen_sse2_clflush (op0));
30178 return 0;
30179
30180 case IX86_BUILTIN_MONITOR:
30181 arg0 = CALL_EXPR_ARG (exp, 0);
30182 arg1 = CALL_EXPR_ARG (exp, 1);
30183 arg2 = CALL_EXPR_ARG (exp, 2);
30184 op0 = expand_normal (arg0);
30185 op1 = expand_normal (arg1);
30186 op2 = expand_normal (arg2);
30187 if (!REG_P (op0))
30188 {
30189 if (GET_MODE (op0) != Pmode)
30190 op0 = convert_to_mode (Pmode, op0, 1);
30191 op0 = force_reg (Pmode, op0);
30192 }
30193 if (!REG_P (op1))
30194 op1 = copy_to_mode_reg (SImode, op1);
30195 if (!REG_P (op2))
30196 op2 = copy_to_mode_reg (SImode, op2);
30197 emit_insn (ix86_gen_monitor (op0, op1, op2));
30198 return 0;
30199
30200 case IX86_BUILTIN_MWAIT:
30201 arg0 = CALL_EXPR_ARG (exp, 0);
30202 arg1 = CALL_EXPR_ARG (exp, 1);
30203 op0 = expand_normal (arg0);
30204 op1 = expand_normal (arg1);
30205 if (!REG_P (op0))
30206 op0 = copy_to_mode_reg (SImode, op0);
30207 if (!REG_P (op1))
30208 op1 = copy_to_mode_reg (SImode, op1);
30209 emit_insn (gen_sse3_mwait (op0, op1));
30210 return 0;
30211
30212 case IX86_BUILTIN_VEC_INIT_V2SI:
30213 case IX86_BUILTIN_VEC_INIT_V4HI:
30214 case IX86_BUILTIN_VEC_INIT_V8QI:
30215 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
30216
30217 case IX86_BUILTIN_VEC_EXT_V2DF:
30218 case IX86_BUILTIN_VEC_EXT_V2DI:
30219 case IX86_BUILTIN_VEC_EXT_V4SF:
30220 case IX86_BUILTIN_VEC_EXT_V4SI:
30221 case IX86_BUILTIN_VEC_EXT_V8HI:
30222 case IX86_BUILTIN_VEC_EXT_V2SI:
30223 case IX86_BUILTIN_VEC_EXT_V4HI:
30224 case IX86_BUILTIN_VEC_EXT_V16QI:
30225 return ix86_expand_vec_ext_builtin (exp, target);
30226
30227 case IX86_BUILTIN_VEC_SET_V2DI:
30228 case IX86_BUILTIN_VEC_SET_V4SF:
30229 case IX86_BUILTIN_VEC_SET_V4SI:
30230 case IX86_BUILTIN_VEC_SET_V8HI:
30231 case IX86_BUILTIN_VEC_SET_V4HI:
30232 case IX86_BUILTIN_VEC_SET_V16QI:
30233 return ix86_expand_vec_set_builtin (exp);
30234
30235 case IX86_BUILTIN_INFQ:
30236 case IX86_BUILTIN_HUGE_VALQ:
30237 {
30238 REAL_VALUE_TYPE inf;
30239 rtx tmp;
30240
30241 real_inf (&inf);
30242 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
30243
30244 tmp = validize_mem (force_const_mem (mode, tmp));
30245
30246 if (target == 0)
30247 target = gen_reg_rtx (mode);
30248
30249 emit_move_insn (target, tmp);
30250 return target;
30251 }
30252
30253 case IX86_BUILTIN_LLWPCB:
30254 arg0 = CALL_EXPR_ARG (exp, 0);
30255 op0 = expand_normal (arg0);
30256 icode = CODE_FOR_lwp_llwpcb;
30257 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30258 {
30259 if (GET_MODE (op0) != Pmode)
30260 op0 = convert_to_mode (Pmode, op0, 1);
30261 op0 = force_reg (Pmode, op0);
30262 }
30263 emit_insn (gen_lwp_llwpcb (op0));
30264 return 0;
30265
30266 case IX86_BUILTIN_SLWPCB:
30267 icode = CODE_FOR_lwp_slwpcb;
30268 if (!target
30269 || !insn_data[icode].operand[0].predicate (target, Pmode))
30270 target = gen_reg_rtx (Pmode);
30271 emit_insn (gen_lwp_slwpcb (target));
30272 return target;
30273
30274 case IX86_BUILTIN_BEXTRI32:
30275 case IX86_BUILTIN_BEXTRI64:
30276 arg0 = CALL_EXPR_ARG (exp, 0);
30277 arg1 = CALL_EXPR_ARG (exp, 1);
30278 op0 = expand_normal (arg0);
30279 op1 = expand_normal (arg1);
30280 icode = (fcode == IX86_BUILTIN_BEXTRI32
30281 ? CODE_FOR_tbm_bextri_si
30282 : CODE_FOR_tbm_bextri_di);
30283 if (!CONST_INT_P (op1))
30284 {
30285 error ("last argument must be an immediate");
30286 return const0_rtx;
30287 }
30288 else
30289 {
30290 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30291 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30292 op1 = GEN_INT (length);
30293 op2 = GEN_INT (lsb_index);
30294 pat = GEN_FCN (icode) (target, op0, op1, op2);
30295 if (pat)
30296 emit_insn (pat);
30297 return target;
30298 }
30299
30300 case IX86_BUILTIN_RDRAND16_STEP:
30301 icode = CODE_FOR_rdrandhi_1;
30302 mode0 = HImode;
30303 goto rdrand_step;
30304
30305 case IX86_BUILTIN_RDRAND32_STEP:
30306 icode = CODE_FOR_rdrandsi_1;
30307 mode0 = SImode;
30308 goto rdrand_step;
30309
30310 case IX86_BUILTIN_RDRAND64_STEP:
30311 icode = CODE_FOR_rdranddi_1;
30312 mode0 = DImode;
30313
30314 rdrand_step:
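      /* Expansion sketch: emit rdrand into a scratch register, store the
	 value through the pointer argument, and return 1 when the carry
	 flag indicates success and the (then zeroed) destination value
	 otherwise, matching the _rdrand16/32/64_step intrinsics.  */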
30315 op0 = gen_reg_rtx (mode0);
30316 emit_insn (GEN_FCN (icode) (op0));
30317
30318 arg0 = CALL_EXPR_ARG (exp, 0);
30319 op1 = expand_normal (arg0);
30320 if (!address_operand (op1, VOIDmode))
30321 {
30322 op1 = convert_memory_address (Pmode, op1);
30323 op1 = copy_addr_to_reg (op1);
30324 }
30325 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30326
30327 op1 = gen_reg_rtx (SImode);
30328 emit_move_insn (op1, CONST1_RTX (SImode));
30329
30330 /* Emit SImode conditional move. */
30331 if (mode0 == HImode)
30332 {
30333 op2 = gen_reg_rtx (SImode);
30334 emit_insn (gen_zero_extendhisi2 (op2, op0));
30335 }
30336 else if (mode0 == SImode)
30337 op2 = op0;
30338 else
30339 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30340
30341 if (target == 0)
30342 target = gen_reg_rtx (SImode);
30343
30344 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30345 const0_rtx);
30346 emit_insn (gen_rtx_SET (VOIDmode, target,
30347 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30348 return target;
30349
30350 case IX86_BUILTIN_RDSEED16_STEP:
30351 icode = CODE_FOR_rdseedhi_1;
30352 mode0 = HImode;
30353 goto rdseed_step;
30354
30355 case IX86_BUILTIN_RDSEED32_STEP:
30356 icode = CODE_FOR_rdseedsi_1;
30357 mode0 = SImode;
30358 goto rdseed_step;
30359
30360 case IX86_BUILTIN_RDSEED64_STEP:
30361 icode = CODE_FOR_rdseeddi_1;
30362 mode0 = DImode;
30363
30364 rdseed_step:
30365 op0 = gen_reg_rtx (mode0);
30366 emit_insn (GEN_FCN (icode) (op0));
30367
30368 arg0 = CALL_EXPR_ARG (exp, 0);
30369 op1 = expand_normal (arg0);
30370 if (!address_operand (op1, VOIDmode))
30371 {
30372 op1 = convert_memory_address (Pmode, op1);
30373 op1 = copy_addr_to_reg (op1);
30374 }
30375 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30376
30377 op2 = gen_reg_rtx (QImode);
30378
30379 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
30380 const0_rtx);
30381 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
30382
30383 if (target == 0)
30384 target = gen_reg_rtx (SImode);
30385
30386 emit_insn (gen_zero_extendqisi2 (target, op2));
30387 return target;
30388
30389 case IX86_BUILTIN_ADDCARRYX32:
30390 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
30391 mode0 = SImode;
30392 goto addcarryx;
30393
30394 case IX86_BUILTIN_ADDCARRYX64:
30395 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
30396 mode0 = DImode;
30397
30398 addcarryx:
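      /* Expansion sketch: set CF from the incoming carry byte by adding -1
	 to it, emit adcx (or plain adc) to compute src1 + src2 + CF, store
	 the sum through *sum_out, and return the resulting carry byte, as
	 the _addcarryx_u32/_addcarryx_u64 intrinsics expect.  */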
30399 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
30400 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
30401 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
30402 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
30403
30404 op0 = gen_reg_rtx (QImode);
30405
30406 /* Generate CF from input operand. */
30407 op1 = expand_normal (arg0);
30408 if (GET_MODE (op1) != QImode)
30409 op1 = convert_to_mode (QImode, op1, 1);
30410 op1 = copy_to_mode_reg (QImode, op1);
30411 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
30412
30413 /* Generate an ADCX (or plain ADC) instruction to compute X+Y+CF. */
30414 op2 = expand_normal (arg1);
30415 op3 = expand_normal (arg2);
30416
30417 if (!REG_P (op2))
30418 op2 = copy_to_mode_reg (mode0, op2);
30419 if (!REG_P (op3))
30420 op3 = copy_to_mode_reg (mode0, op3);
30421
30422 op0 = gen_reg_rtx (mode0);
30423
30424 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
30425 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
30426 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
30427
30428 /* Store the result. */
30429 op4 = expand_normal (arg3);
30430 if (!address_operand (op4, VOIDmode))
30431 {
30432 op4 = convert_memory_address (Pmode, op4);
30433 op4 = copy_addr_to_reg (op4);
30434 }
30435 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
30436
30437 /* Return current CF value. */
30438 if (target == 0)
30439 target = gen_reg_rtx (QImode);
30440
30441 PUT_MODE (pat, QImode);
30442 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
30443 return target;
30444
30445 case IX86_BUILTIN_GATHERSIV2DF:
30446 icode = CODE_FOR_avx2_gathersiv2df;
30447 goto gather_gen;
30448 case IX86_BUILTIN_GATHERSIV4DF:
30449 icode = CODE_FOR_avx2_gathersiv4df;
30450 goto gather_gen;
30451 case IX86_BUILTIN_GATHERDIV2DF:
30452 icode = CODE_FOR_avx2_gatherdiv2df;
30453 goto gather_gen;
30454 case IX86_BUILTIN_GATHERDIV4DF:
30455 icode = CODE_FOR_avx2_gatherdiv4df;
30456 goto gather_gen;
30457 case IX86_BUILTIN_GATHERSIV4SF:
30458 icode = CODE_FOR_avx2_gathersiv4sf;
30459 goto gather_gen;
30460 case IX86_BUILTIN_GATHERSIV8SF:
30461 icode = CODE_FOR_avx2_gathersiv8sf;
30462 goto gather_gen;
30463 case IX86_BUILTIN_GATHERDIV4SF:
30464 icode = CODE_FOR_avx2_gatherdiv4sf;
30465 goto gather_gen;
30466 case IX86_BUILTIN_GATHERDIV8SF:
30467 icode = CODE_FOR_avx2_gatherdiv8sf;
30468 goto gather_gen;
30469 case IX86_BUILTIN_GATHERSIV2DI:
30470 icode = CODE_FOR_avx2_gathersiv2di;
30471 goto gather_gen;
30472 case IX86_BUILTIN_GATHERSIV4DI:
30473 icode = CODE_FOR_avx2_gathersiv4di;
30474 goto gather_gen;
30475 case IX86_BUILTIN_GATHERDIV2DI:
30476 icode = CODE_FOR_avx2_gatherdiv2di;
30477 goto gather_gen;
30478 case IX86_BUILTIN_GATHERDIV4DI:
30479 icode = CODE_FOR_avx2_gatherdiv4di;
30480 goto gather_gen;
30481 case IX86_BUILTIN_GATHERSIV4SI:
30482 icode = CODE_FOR_avx2_gathersiv4si;
30483 goto gather_gen;
30484 case IX86_BUILTIN_GATHERSIV8SI:
30485 icode = CODE_FOR_avx2_gathersiv8si;
30486 goto gather_gen;
30487 case IX86_BUILTIN_GATHERDIV4SI:
30488 icode = CODE_FOR_avx2_gatherdiv4si;
30489 goto gather_gen;
30490 case IX86_BUILTIN_GATHERDIV8SI:
30491 icode = CODE_FOR_avx2_gatherdiv8si;
30492 goto gather_gen;
30493 case IX86_BUILTIN_GATHERALTSIV4DF:
30494 icode = CODE_FOR_avx2_gathersiv4df;
30495 goto gather_gen;
30496 case IX86_BUILTIN_GATHERALTDIV8SF:
30497 icode = CODE_FOR_avx2_gatherdiv8sf;
30498 goto gather_gen;
30499 case IX86_BUILTIN_GATHERALTSIV4DI:
30500 icode = CODE_FOR_avx2_gathersiv4di;
30501 goto gather_gen;
30502 case IX86_BUILTIN_GATHERALTDIV8SI:
30503 icode = CODE_FOR_avx2_gatherdiv8si;
30504 goto gather_gen;
30505
30506 gather_gen:
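      /* The five call arguments are source, base pointer, index vector,
	 mask and scale; the GATHERALT variants additionally reduce the
	 index or the source/mask vectors to their low halves before
	 emitting the gather pattern.  */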
30507 arg0 = CALL_EXPR_ARG (exp, 0);
30508 arg1 = CALL_EXPR_ARG (exp, 1);
30509 arg2 = CALL_EXPR_ARG (exp, 2);
30510 arg3 = CALL_EXPR_ARG (exp, 3);
30511 arg4 = CALL_EXPR_ARG (exp, 4);
30512 op0 = expand_normal (arg0);
30513 op1 = expand_normal (arg1);
30514 op2 = expand_normal (arg2);
30515 op3 = expand_normal (arg3);
30516 op4 = expand_normal (arg4);
30517 /* Note the arg order is different from the operand order. */
30518 mode0 = insn_data[icode].operand[1].mode;
30519 mode2 = insn_data[icode].operand[3].mode;
30520 mode3 = insn_data[icode].operand[4].mode;
30521 mode4 = insn_data[icode].operand[5].mode;
30522
30523 if (target == NULL_RTX
30524 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30525 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30526 else
30527 subtarget = target;
30528
30529 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30530 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30531 {
30532 rtx half = gen_reg_rtx (V4SImode);
30533 if (!nonimmediate_operand (op2, V8SImode))
30534 op2 = copy_to_mode_reg (V8SImode, op2);
30535 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30536 op2 = half;
30537 }
30538 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30539 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30540 {
30541 rtx (*gen) (rtx, rtx);
30542 rtx half = gen_reg_rtx (mode0);
30543 if (mode0 == V4SFmode)
30544 gen = gen_vec_extract_lo_v8sf;
30545 else
30546 gen = gen_vec_extract_lo_v8si;
30547 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30548 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30549 emit_insn (gen (half, op0));
30550 op0 = half;
30551 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30552 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30553 emit_insn (gen (half, op3));
30554 op3 = half;
30555 }
30556
30557 /* Force the memory operand to use only a base register here.  We
30558    don't want to do this for the memory operands of other builtin
30559    functions. */
30560 if (GET_MODE (op1) != Pmode)
30561 op1 = convert_to_mode (Pmode, op1, 1);
30562 op1 = force_reg (Pmode, op1);
30563
30564 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30565 op0 = copy_to_mode_reg (mode0, op0);
30566 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30567 op1 = copy_to_mode_reg (Pmode, op1);
30568 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30569 op2 = copy_to_mode_reg (mode2, op2);
30570 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30571 op3 = copy_to_mode_reg (mode3, op3);
30572 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30573 {
30574 error ("last argument must be scale 1, 2, 4, 8");
30575 return const0_rtx;
30576 }
30577
30578 /* Optimize. If mask is known to have all high bits set,
30579 replace op0 with pc_rtx to signal that the instruction
30580 overwrites the whole destination and doesn't use its
30581 previous contents. */
30582 if (optimize)
30583 {
30584 if (TREE_CODE (arg3) == VECTOR_CST)
30585 {
30586 unsigned int negative = 0;
30587 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30588 {
30589 tree cst = VECTOR_CST_ELT (arg3, i);
30590 if (TREE_CODE (cst) == INTEGER_CST
30591 && tree_int_cst_sign_bit (cst))
30592 negative++;
30593 else if (TREE_CODE (cst) == REAL_CST
30594 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30595 negative++;
30596 }
30597 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30598 op0 = pc_rtx;
30599 }
30600 else if (TREE_CODE (arg3) == SSA_NAME)
30601 {
30602 /* Recognize also when mask is like:
30603 __v2df src = _mm_setzero_pd ();
30604 __v2df mask = _mm_cmpeq_pd (src, src);
30605 or
30606 __v8sf src = _mm256_setzero_ps ();
30607 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30608 as that is a cheaper way to load all ones into
30609 a register than having to load a constant from
30610 memory. */
30611 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30612 if (is_gimple_call (def_stmt))
30613 {
30614 tree fndecl = gimple_call_fndecl (def_stmt);
30615 if (fndecl
30616 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30617 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30618 {
30619 case IX86_BUILTIN_CMPPD:
30620 case IX86_BUILTIN_CMPPS:
30621 case IX86_BUILTIN_CMPPD256:
30622 case IX86_BUILTIN_CMPPS256:
30623 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30624 break;
30625 /* FALLTHRU */
30626 case IX86_BUILTIN_CMPEQPD:
30627 case IX86_BUILTIN_CMPEQPS:
30628 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30629 && initializer_zerop (gimple_call_arg (def_stmt,
30630 1)))
30631 op0 = pc_rtx;
30632 break;
30633 default:
30634 break;
30635 }
30636 }
30637 }
30638 }
30639
30640 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30641 if (! pat)
30642 return const0_rtx;
30643 emit_insn (pat);
30644
30645 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30646 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30647 {
30648 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30649 ? V4SFmode : V4SImode;
30650 if (target == NULL_RTX)
30651 target = gen_reg_rtx (tmode);
30652 if (tmode == V4SFmode)
30653 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30654 else
30655 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30656 }
30657 else
30658 target = subtarget;
30659
30660 return target;
30661
30662 case IX86_BUILTIN_XABORT:
30663 icode = CODE_FOR_xabort;
30664 arg0 = CALL_EXPR_ARG (exp, 0);
30665 op0 = expand_normal (arg0);
30666 mode0 = insn_data[icode].operand[0].mode;
30667 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30668 {
30669 error ("the xabort's argument must be an 8-bit immediate");
30670 return const0_rtx;
30671 }
30672 emit_insn (gen_xabort (op0));
30673 return 0;
30674
30675 default:
30676 break;
30677 }
30678
30679 for (i = 0, d = bdesc_special_args;
30680 i < ARRAY_SIZE (bdesc_special_args);
30681 i++, d++)
30682 if (d->code == fcode)
30683 return ix86_expand_special_args_builtin (d, exp, target);
30684
30685 for (i = 0, d = bdesc_args;
30686 i < ARRAY_SIZE (bdesc_args);
30687 i++, d++)
30688 if (d->code == fcode)
30689 switch (fcode)
30690 {
30691 case IX86_BUILTIN_FABSQ:
30692 case IX86_BUILTIN_COPYSIGNQ:
30693 if (!TARGET_SSE)
30694 /* Emit a normal call if SSE isn't available. */
30695 return expand_call (exp, target, ignore);
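/* FALLTHRU */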
30696 default:
30697 return ix86_expand_args_builtin (d, exp, target);
30698 }
30699
30700 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30701 if (d->code == fcode)
30702 return ix86_expand_sse_comi (d, exp, target);
30703
30704 for (i = 0, d = bdesc_pcmpestr;
30705 i < ARRAY_SIZE (bdesc_pcmpestr);
30706 i++, d++)
30707 if (d->code == fcode)
30708 return ix86_expand_sse_pcmpestr (d, exp, target);
30709
30710 for (i = 0, d = bdesc_pcmpistr;
30711 i < ARRAY_SIZE (bdesc_pcmpistr);
30712 i++, d++)
30713 if (d->code == fcode)
30714 return ix86_expand_sse_pcmpistr (d, exp, target);
30715
30716 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30717 if (d->code == fcode)
30718 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30719 (enum ix86_builtin_func_type)
30720 d->flag, d->comparison);
30721
30722 gcc_unreachable ();
30723 }
30724
30725 /* Returns a function decl for a vectorized version of the builtin function
30726 with builtin function code FN and the result vector type TYPE, or NULL_TREE
30727 if it is not available. */
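   For example, a vectorized sqrt call with V2DF argument and result is
   mapped to IX86_BUILTIN_SQRTPD in the switch below.  */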
30728
30729 static tree
30730 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30731 tree type_in)
30732 {
30733 enum machine_mode in_mode, out_mode;
30734 int in_n, out_n;
30735 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30736
30737 if (TREE_CODE (type_out) != VECTOR_TYPE
30738 || TREE_CODE (type_in) != VECTOR_TYPE
30739 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30740 return NULL_TREE;
30741
30742 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30743 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30744 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30745 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30746
30747 switch (fn)
30748 {
30749 case BUILT_IN_SQRT:
30750 if (out_mode == DFmode && in_mode == DFmode)
30751 {
30752 if (out_n == 2 && in_n == 2)
30753 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30754 else if (out_n == 4 && in_n == 4)
30755 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30756 }
30757 break;
30758
30759 case BUILT_IN_SQRTF:
30760 if (out_mode == SFmode && in_mode == SFmode)
30761 {
30762 if (out_n == 4 && in_n == 4)
30763 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30764 else if (out_n == 8 && in_n == 8)
30765 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30766 }
30767 break;
30768
30769 case BUILT_IN_IFLOOR:
30770 case BUILT_IN_LFLOOR:
30771 case BUILT_IN_LLFLOOR:
30772 /* The round insn does not trap on denormals. */
30773 if (flag_trapping_math || !TARGET_ROUND)
30774 break;
30775
30776 if (out_mode == SImode && in_mode == DFmode)
30777 {
30778 if (out_n == 4 && in_n == 2)
30779 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30780 else if (out_n == 8 && in_n == 4)
30781 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30782 }
30783 break;
30784
30785 case BUILT_IN_IFLOORF:
30786 case BUILT_IN_LFLOORF:
30787 case BUILT_IN_LLFLOORF:
30788 /* The round insn does not trap on denormals. */
30789 if (flag_trapping_math || !TARGET_ROUND)
30790 break;
30791
30792 if (out_mode == SImode && in_mode == SFmode)
30793 {
30794 if (out_n == 4 && in_n == 4)
30795 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30796 else if (out_n == 8 && in_n == 8)
30797 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30798 }
30799 break;
30800
30801 case BUILT_IN_ICEIL:
30802 case BUILT_IN_LCEIL:
30803 case BUILT_IN_LLCEIL:
30804 /* The round insn does not trap on denormals. */
30805 if (flag_trapping_math || !TARGET_ROUND)
30806 break;
30807
30808 if (out_mode == SImode && in_mode == DFmode)
30809 {
30810 if (out_n == 4 && in_n == 2)
30811 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30812 else if (out_n == 8 && in_n == 4)
30813 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30814 }
30815 break;
30816
30817 case BUILT_IN_ICEILF:
30818 case BUILT_IN_LCEILF:
30819 case BUILT_IN_LLCEILF:
30820 /* The round insn does not trap on denormals. */
30821 if (flag_trapping_math || !TARGET_ROUND)
30822 break;
30823
30824 if (out_mode == SImode && in_mode == SFmode)
30825 {
30826 if (out_n == 4 && in_n == 4)
30827 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30828 else if (out_n == 8 && in_n == 8)
30829 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30830 }
30831 break;
30832
30833 case BUILT_IN_IRINT:
30834 case BUILT_IN_LRINT:
30835 case BUILT_IN_LLRINT:
30836 if (out_mode == SImode && in_mode == DFmode)
30837 {
30838 if (out_n == 4 && in_n == 2)
30839 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30840 else if (out_n == 8 && in_n == 4)
30841 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30842 }
30843 break;
30844
30845 case BUILT_IN_IRINTF:
30846 case BUILT_IN_LRINTF:
30847 case BUILT_IN_LLRINTF:
30848 if (out_mode == SImode && in_mode == SFmode)
30849 {
30850 if (out_n == 4 && in_n == 4)
30851 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30852 else if (out_n == 8 && in_n == 8)
30853 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30854 }
30855 break;
30856
30857 case BUILT_IN_IROUND:
30858 case BUILT_IN_LROUND:
30859 case BUILT_IN_LLROUND:
30860 /* The round insn does not trap on denormals. */
30861 if (flag_trapping_math || !TARGET_ROUND)
30862 break;
30863
30864 if (out_mode == SImode && in_mode == DFmode)
30865 {
30866 if (out_n == 4 && in_n == 2)
30867 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30868 else if (out_n == 8 && in_n == 4)
30869 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30870 }
30871 break;
30872
30873 case BUILT_IN_IROUNDF:
30874 case BUILT_IN_LROUNDF:
30875 case BUILT_IN_LLROUNDF:
30876 /* The round insn does not trap on denormals. */
30877 if (flag_trapping_math || !TARGET_ROUND)
30878 break;
30879
30880 if (out_mode == SImode && in_mode == SFmode)
30881 {
30882 if (out_n == 4 && in_n == 4)
30883 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30884 else if (out_n == 8 && in_n == 8)
30885 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30886 }
30887 break;
30888
30889 case BUILT_IN_COPYSIGN:
30890 if (out_mode == DFmode && in_mode == DFmode)
30891 {
30892 if (out_n == 2 && in_n == 2)
30893 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30894 else if (out_n == 4 && in_n == 4)
30895 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30896 }
30897 break;
30898
30899 case BUILT_IN_COPYSIGNF:
30900 if (out_mode == SFmode && in_mode == SFmode)
30901 {
30902 if (out_n == 4 && in_n == 4)
30903 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30904 else if (out_n == 8 && in_n == 8)
30905 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30906 }
30907 break;
30908
30909 case BUILT_IN_FLOOR:
30910 /* The round insn does not trap on denormals. */
30911 if (flag_trapping_math || !TARGET_ROUND)
30912 break;
30913
30914 if (out_mode == DFmode && in_mode == DFmode)
30915 {
30916 if (out_n == 2 && in_n == 2)
30917 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30918 else if (out_n == 4 && in_n == 4)
30919 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30920 }
30921 break;
30922
30923 case BUILT_IN_FLOORF:
30924 /* The round insn does not trap on denormals. */
30925 if (flag_trapping_math || !TARGET_ROUND)
30926 break;
30927
30928 if (out_mode == SFmode && in_mode == SFmode)
30929 {
30930 if (out_n == 4 && in_n == 4)
30931 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30932 else if (out_n == 8 && in_n == 8)
30933 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30934 }
30935 break;
30936
30937 case BUILT_IN_CEIL:
30938 /* The round insn does not trap on denormals. */
30939 if (flag_trapping_math || !TARGET_ROUND)
30940 break;
30941
30942 if (out_mode == DFmode && in_mode == DFmode)
30943 {
30944 if (out_n == 2 && in_n == 2)
30945 return ix86_builtins[IX86_BUILTIN_CEILPD];
30946 else if (out_n == 4 && in_n == 4)
30947 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30948 }
30949 break;
30950
30951 case BUILT_IN_CEILF:
30952 /* The round insn does not trap on denormals. */
30953 if (flag_trapping_math || !TARGET_ROUND)
30954 break;
30955
30956 if (out_mode == SFmode && in_mode == SFmode)
30957 {
30958 if (out_n == 4 && in_n == 4)
30959 return ix86_builtins[IX86_BUILTIN_CEILPS];
30960 else if (out_n == 8 && in_n == 8)
30961 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30962 }
30963 break;
30964
30965 case BUILT_IN_TRUNC:
30966 /* The round insn does not trap on denormals. */
30967 if (flag_trapping_math || !TARGET_ROUND)
30968 break;
30969
30970 if (out_mode == DFmode && in_mode == DFmode)
30971 {
30972 if (out_n == 2 && in_n == 2)
30973 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30974 else if (out_n == 4 && in_n == 4)
30975 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30976 }
30977 break;
30978
30979 case BUILT_IN_TRUNCF:
30980 /* The round insn does not trap on denormals. */
30981 if (flag_trapping_math || !TARGET_ROUND)
30982 break;
30983
30984 if (out_mode == SFmode && in_mode == SFmode)
30985 {
30986 if (out_n == 4 && in_n == 4)
30987 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30988 else if (out_n == 8 && in_n == 8)
30989 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30990 }
30991 break;
30992
30993 case BUILT_IN_RINT:
30994 /* The round insn does not trap on denormals. */
30995 if (flag_trapping_math || !TARGET_ROUND)
30996 break;
30997
30998 if (out_mode == DFmode && in_mode == DFmode)
30999 {
31000 if (out_n == 2 && in_n == 2)
31001 return ix86_builtins[IX86_BUILTIN_RINTPD];
31002 else if (out_n == 4 && in_n == 4)
31003 return ix86_builtins[IX86_BUILTIN_RINTPD256];
31004 }
31005 break;
31006
31007 case BUILT_IN_RINTF:
31008 /* The round insn does not trap on denormals. */
31009 if (flag_trapping_math || !TARGET_ROUND)
31010 break;
31011
31012 if (out_mode == SFmode && in_mode == SFmode)
31013 {
31014 if (out_n == 4 && in_n == 4)
31015 return ix86_builtins[IX86_BUILTIN_RINTPS];
31016 else if (out_n == 8 && in_n == 8)
31017 return ix86_builtins[IX86_BUILTIN_RINTPS256];
31018 }
31019 break;
31020
31021 case BUILT_IN_ROUND:
31022 /* The round insn does not trap on denormals. */
31023 if (flag_trapping_math || !TARGET_ROUND)
31024 break;
31025
31026 if (out_mode == DFmode && in_mode == DFmode)
31027 {
31028 if (out_n == 2 && in_n == 2)
31029 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
31030 else if (out_n == 4 && in_n == 4)
31031 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
31032 }
31033 break;
31034
31035 case BUILT_IN_ROUNDF:
31036 /* The round insn does not trap on denormals. */
31037 if (flag_trapping_math || !TARGET_ROUND)
31038 break;
31039
31040 if (out_mode == SFmode && in_mode == SFmode)
31041 {
31042 if (out_n == 4 && in_n == 4)
31043 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
31044 else if (out_n == 8 && in_n == 8)
31045 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
31046 }
31047 break;
31048
31049 case BUILT_IN_FMA:
31050 if (out_mode == DFmode && in_mode == DFmode)
31051 {
31052 if (out_n == 2 && in_n == 2)
31053 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
31054 if (out_n == 4 && in_n == 4)
31055 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
31056 }
31057 break;
31058
31059 case BUILT_IN_FMAF:
31060 if (out_mode == SFmode && in_mode == SFmode)
31061 {
31062 if (out_n == 4 && in_n == 4)
31063 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
31064 if (out_n == 8 && in_n == 8)
31065 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
31066 }
31067 break;
31068
31069 default:
31070 break;
31071 }
31072
31073 /* Dispatch to a handler for a vectorization library. */
31074 if (ix86_veclib_handler)
31075 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
31076 type_in);
31077
31078 return NULL_TREE;
31079 }
31080
31081 /* Handler for an SVML-style interface to
31082 a library with vectorized intrinsics. */
31083
31084 static tree
31085 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
31086 {
31087 char name[20];
31088 tree fntype, new_fndecl, args;
31089 unsigned arity;
31090 const char *bname;
31091 enum machine_mode el_mode, in_mode;
31092 int n, in_n;
31093
31094 /* The SVML is suitable for unsafe math only. */
31095 if (!flag_unsafe_math_optimizations)
31096 return NULL_TREE;
31097
31098 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31099 n = TYPE_VECTOR_SUBPARTS (type_out);
31100 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31101 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31102 if (el_mode != in_mode
31103 || n != in_n)
31104 return NULL_TREE;
31105
31106 switch (fn)
31107 {
31108 case BUILT_IN_EXP:
31109 case BUILT_IN_LOG:
31110 case BUILT_IN_LOG10:
31111 case BUILT_IN_POW:
31112 case BUILT_IN_TANH:
31113 case BUILT_IN_TAN:
31114 case BUILT_IN_ATAN:
31115 case BUILT_IN_ATAN2:
31116 case BUILT_IN_ATANH:
31117 case BUILT_IN_CBRT:
31118 case BUILT_IN_SINH:
31119 case BUILT_IN_SIN:
31120 case BUILT_IN_ASINH:
31121 case BUILT_IN_ASIN:
31122 case BUILT_IN_COSH:
31123 case BUILT_IN_COS:
31124 case BUILT_IN_ACOSH:
31125 case BUILT_IN_ACOS:
31126 if (el_mode != DFmode || n != 2)
31127 return NULL_TREE;
31128 break;
31129
31130 case BUILT_IN_EXPF:
31131 case BUILT_IN_LOGF:
31132 case BUILT_IN_LOG10F:
31133 case BUILT_IN_POWF:
31134 case BUILT_IN_TANHF:
31135 case BUILT_IN_TANF:
31136 case BUILT_IN_ATANF:
31137 case BUILT_IN_ATAN2F:
31138 case BUILT_IN_ATANHF:
31139 case BUILT_IN_CBRTF:
31140 case BUILT_IN_SINHF:
31141 case BUILT_IN_SINF:
31142 case BUILT_IN_ASINHF:
31143 case BUILT_IN_ASINF:
31144 case BUILT_IN_COSHF:
31145 case BUILT_IN_COSF:
31146 case BUILT_IN_ACOSHF:
31147 case BUILT_IN_ACOSF:
31148 if (el_mode != SFmode || n != 4)
31149 return NULL_TREE;
31150 break;
31151
31152 default:
31153 return NULL_TREE;
31154 }
31155
31156 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31157
31158 if (fn == BUILT_IN_LOGF)
31159 strcpy (name, "vmlsLn4");
31160 else if (fn == BUILT_IN_LOG)
31161 strcpy (name, "vmldLn2");
31162 else if (n == 4)
31163 {
31164 sprintf (name, "vmls%s", bname+10);
31165 name[strlen (name)-1] = '4';
31166 }
31167 else
31168 sprintf (name, "vmld%s2", bname+10);
31169
31170 /* Convert to uppercase. */
31171 name[4] &= ~0x20;
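  /* For example (illustration derived from the code above): BUILT_IN_SINF
     ("__builtin_sinf") yields the name "vmlsSin4", while BUILT_IN_SIN
     ("__builtin_sin") yields "vmldSin2".  */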
31172
31173 arity = 0;
31174 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31175 args;
31176 args = TREE_CHAIN (args))
31177 arity++;
31178
31179 if (arity == 1)
31180 fntype = build_function_type_list (type_out, type_in, NULL);
31181 else
31182 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31183
31184 /* Build a function declaration for the vectorized function. */
31185 new_fndecl = build_decl (BUILTINS_LOCATION,
31186 FUNCTION_DECL, get_identifier (name), fntype);
31187 TREE_PUBLIC (new_fndecl) = 1;
31188 DECL_EXTERNAL (new_fndecl) = 1;
31189 DECL_IS_NOVOPS (new_fndecl) = 1;
31190 TREE_READONLY (new_fndecl) = 1;
31191
31192 return new_fndecl;
31193 }
31194
31195 /* Handler for an ACML-style interface to
31196 a library with vectorized intrinsics. */
31197
31198 static tree
31199 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
31200 {
31201 char name[20] = "__vr.._";
31202 tree fntype, new_fndecl, args;
31203 unsigned arity;
31204 const char *bname;
31205 enum machine_mode el_mode, in_mode;
31206 int n, in_n;
31207
 31208   /* The ACML is 64-bit only and suitable for unsafe math only, as
 31209      it does not correctly support parts of IEEE arithmetic with the
 31210      required precision, such as denormals.  */
31211 if (!TARGET_64BIT
31212 || !flag_unsafe_math_optimizations)
31213 return NULL_TREE;
31214
31215 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31216 n = TYPE_VECTOR_SUBPARTS (type_out);
31217 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31218 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31219 if (el_mode != in_mode
31220 || n != in_n)
31221 return NULL_TREE;
31222
31223 switch (fn)
31224 {
31225 case BUILT_IN_SIN:
31226 case BUILT_IN_COS:
31227 case BUILT_IN_EXP:
31228 case BUILT_IN_LOG:
31229 case BUILT_IN_LOG2:
31230 case BUILT_IN_LOG10:
31231 name[4] = 'd';
31232 name[5] = '2';
31233 if (el_mode != DFmode
31234 || n != 2)
31235 return NULL_TREE;
31236 break;
31237
31238 case BUILT_IN_SINF:
31239 case BUILT_IN_COSF:
31240 case BUILT_IN_EXPF:
31241 case BUILT_IN_POWF:
31242 case BUILT_IN_LOGF:
31243 case BUILT_IN_LOG2F:
31244 case BUILT_IN_LOG10F:
31245 name[4] = 's';
31246 name[5] = '4';
31247 if (el_mode != SFmode
31248 || n != 4)
31249 return NULL_TREE;
31250 break;
31251
31252 default:
31253 return NULL_TREE;
31254 }
31255
31256 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31257 sprintf (name + 7, "%s", bname+10);
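  /* For example (illustration derived from the code above): BUILT_IN_SIN
     maps to "__vrd2_sin" and BUILT_IN_SINF maps to "__vrs4_sinf".  */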
31258
31259 arity = 0;
31260 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31261 args;
31262 args = TREE_CHAIN (args))
31263 arity++;
31264
31265 if (arity == 1)
31266 fntype = build_function_type_list (type_out, type_in, NULL);
31267 else
31268 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31269
31270 /* Build a function declaration for the vectorized function. */
31271 new_fndecl = build_decl (BUILTINS_LOCATION,
31272 FUNCTION_DECL, get_identifier (name), fntype);
31273 TREE_PUBLIC (new_fndecl) = 1;
31274 DECL_EXTERNAL (new_fndecl) = 1;
31275 DECL_IS_NOVOPS (new_fndecl) = 1;
31276 TREE_READONLY (new_fndecl) = 1;
31277
31278 return new_fndecl;
31279 }
31280
 31281 /* Returns a decl of a function that implements a gather load with
 31282    memory type MEM_VECTYPE, index type INDEX_TYPE and scale factor SCALE.
 31283    Return NULL_TREE if it is not available.  */
31284
31285 static tree
31286 ix86_vectorize_builtin_gather (const_tree mem_vectype,
31287 const_tree index_type, int scale)
31288 {
31289 bool si;
31290 enum ix86_builtins code;
31291
31292 if (! TARGET_AVX2)
31293 return NULL_TREE;
31294
31295 if ((TREE_CODE (index_type) != INTEGER_TYPE
31296 && !POINTER_TYPE_P (index_type))
31297 || (TYPE_MODE (index_type) != SImode
31298 && TYPE_MODE (index_type) != DImode))
31299 return NULL_TREE;
31300
31301 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
31302 return NULL_TREE;
31303
31304 /* v*gather* insn sign extends index to pointer mode. */
31305 if (TYPE_PRECISION (index_type) < POINTER_SIZE
31306 && TYPE_UNSIGNED (index_type))
31307 return NULL_TREE;
31308
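  /* Only power-of-two scale factors of 1, 2, 4 and 8 are valid for
     v*gather* instructions.  */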
31309 if (scale <= 0
31310 || scale > 8
31311 || (scale & (scale - 1)) != 0)
31312 return NULL_TREE;
31313
31314 si = TYPE_MODE (index_type) == SImode;
31315 switch (TYPE_MODE (mem_vectype))
31316 {
31317 case V2DFmode:
31318 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
31319 break;
31320 case V4DFmode:
31321 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
31322 break;
31323 case V2DImode:
31324 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
31325 break;
31326 case V4DImode:
31327 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
31328 break;
31329 case V4SFmode:
31330 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
31331 break;
31332 case V8SFmode:
31333 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
31334 break;
31335 case V4SImode:
31336 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
31337 break;
31338 case V8SImode:
31339 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
31340 break;
31341 default:
31342 return NULL_TREE;
31343 }
31344
31345 return ix86_builtins[code];
31346 }
31347
 31348 /* Returns a decl of a target-specific builtin that implements the
 31349    reciprocal of the function, or NULL_TREE if not available.  */
31350
31351 static tree
31352 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31353 bool sqrt ATTRIBUTE_UNUSED)
31354 {
31355 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31356 && flag_finite_math_only && !flag_trapping_math
31357 && flag_unsafe_math_optimizations))
31358 return NULL_TREE;
31359
31360 if (md_fn)
31361 /* Machine dependent builtins. */
31362 switch (fn)
31363 {
31364 /* Vectorized version of sqrt to rsqrt conversion. */
31365 case IX86_BUILTIN_SQRTPS_NR:
31366 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31367
31368 case IX86_BUILTIN_SQRTPS_NR256:
31369 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31370
31371 default:
31372 return NULL_TREE;
31373 }
31374 else
31375 /* Normal builtins. */
31376 switch (fn)
31377 {
31378 /* Sqrt to rsqrt conversion. */
31379 case BUILT_IN_SQRTF:
31380 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31381
31382 default:
31383 return NULL_TREE;
31384 }
31385 }
31386 \f
31387 /* Helper for avx_vpermilps256_operand et al. This is also used by
31388 the expansion functions to turn the parallel back into a mask.
31389 The return value is 0 for no match and the imm8+1 for a match. */
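/* For example (illustration only): with V4SFmode, a parallel selecting
   elements 3,1,2,0 packs each 2-bit selector into the imm8 as
   3 | 1<<2 | 2<<4 | 0<<6 = 0x27, so the function returns 0x28.  */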
31390
31391 int
31392 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
31393 {
31394 unsigned i, nelt = GET_MODE_NUNITS (mode);
31395 unsigned mask = 0;
31396 unsigned char ipar[8];
31397
31398 if (XVECLEN (par, 0) != (int) nelt)
31399 return 0;
31400
31401 /* Validate that all of the elements are constants, and not totally
31402 out of range. Copy the data into an integral array to make the
31403 subsequent checks easier. */
31404 for (i = 0; i < nelt; ++i)
31405 {
31406 rtx er = XVECEXP (par, 0, i);
31407 unsigned HOST_WIDE_INT ei;
31408
31409 if (!CONST_INT_P (er))
31410 return 0;
31411 ei = INTVAL (er);
31412 if (ei >= nelt)
31413 return 0;
31414 ipar[i] = ei;
31415 }
31416
31417 switch (mode)
31418 {
31419 case V4DFmode:
31420 /* In the 256-bit DFmode case, we can only move elements within
31421 a 128-bit lane. */
31422 for (i = 0; i < 2; ++i)
31423 {
31424 if (ipar[i] >= 2)
31425 return 0;
31426 mask |= ipar[i] << i;
31427 }
31428 for (i = 2; i < 4; ++i)
31429 {
31430 if (ipar[i] < 2)
31431 return 0;
31432 mask |= (ipar[i] - 2) << i;
31433 }
31434 break;
31435
31436 case V8SFmode:
31437 /* In the 256-bit SFmode case, we have full freedom of movement
31438 within the low 128-bit lane, but the high 128-bit lane must
31439 mirror the exact same pattern. */
31440 for (i = 0; i < 4; ++i)
31441 if (ipar[i] + 4 != ipar[i + 4])
31442 return 0;
31443 nelt = 4;
31444 /* FALLTHRU */
31445
31446 case V2DFmode:
31447 case V4SFmode:
 31448       /* In the 128-bit case, we have full freedom in the placement of
31449 the elements from the source operand. */
31450 for (i = 0; i < nelt; ++i)
31451 mask |= ipar[i] << (i * (nelt / 2));
31452 break;
31453
31454 default:
31455 gcc_unreachable ();
31456 }
31457
31458 /* Make sure success has a non-zero value by adding one. */
31459 return mask + 1;
31460 }
31461
31462 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
31463 the expansion functions to turn the parallel back into a mask.
31464 The return value is 0 for no match and the imm8+1 for a match. */
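/* For example (illustration only): with V8SFmode, the parallel
   (4 5 6 7 0 1 2 3) selects concatenated lane 1 for the low half and
   lane 0 for the high half, giving imm8 0x01 and a return value of 0x02.  */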
31465
31466 int
31467 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
31468 {
31469 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
31470 unsigned mask = 0;
31471 unsigned char ipar[8];
31472
31473 if (XVECLEN (par, 0) != (int) nelt)
31474 return 0;
31475
31476 /* Validate that all of the elements are constants, and not totally
31477 out of range. Copy the data into an integral array to make the
31478 subsequent checks easier. */
31479 for (i = 0; i < nelt; ++i)
31480 {
31481 rtx er = XVECEXP (par, 0, i);
31482 unsigned HOST_WIDE_INT ei;
31483
31484 if (!CONST_INT_P (er))
31485 return 0;
31486 ei = INTVAL (er);
31487 if (ei >= 2 * nelt)
31488 return 0;
31489 ipar[i] = ei;
31490 }
31491
31492 /* Validate that the halves of the permute are halves. */
31493 for (i = 0; i < nelt2 - 1; ++i)
31494 if (ipar[i] + 1 != ipar[i + 1])
31495 return 0;
31496 for (i = nelt2; i < nelt - 1; ++i)
31497 if (ipar[i] + 1 != ipar[i + 1])
31498 return 0;
31499
31500 /* Reconstruct the mask. */
31501 for (i = 0; i < 2; ++i)
31502 {
31503 unsigned e = ipar[i * nelt2];
31504 if (e % nelt2)
31505 return 0;
31506 e /= nelt2;
31507 mask |= e << (i * 4);
31508 }
31509
31510 /* Make sure success has a non-zero value by adding one. */
31511 return mask + 1;
31512 }
31513 \f
31514 /* Store OPERAND to the memory after reload is completed. This means
31515 that we can't easily use assign_stack_local. */
31516 rtx
31517 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31518 {
31519 rtx result;
31520
31521 gcc_assert (reload_completed);
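  /* With a red zone we can simply store the value below the stack pointer
     without adjusting it; the ABI guarantees that region is not clobbered
     asynchronously.  */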
31522 if (ix86_using_red_zone ())
31523 {
31524 result = gen_rtx_MEM (mode,
31525 gen_rtx_PLUS (Pmode,
31526 stack_pointer_rtx,
31527 GEN_INT (-RED_ZONE_SIZE)));
31528 emit_move_insn (result, operand);
31529 }
31530 else if (TARGET_64BIT)
31531 {
31532 switch (mode)
31533 {
31534 case HImode:
31535 case SImode:
31536 operand = gen_lowpart (DImode, operand);
31537 /* FALLTHRU */
31538 case DImode:
31539 emit_insn (
31540 gen_rtx_SET (VOIDmode,
31541 gen_rtx_MEM (DImode,
31542 gen_rtx_PRE_DEC (DImode,
31543 stack_pointer_rtx)),
31544 operand));
31545 break;
31546 default:
31547 gcc_unreachable ();
31548 }
31549 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31550 }
31551 else
31552 {
31553 switch (mode)
31554 {
31555 case DImode:
31556 {
31557 rtx operands[2];
31558 split_double_mode (mode, &operand, 1, operands, operands + 1);
31559 emit_insn (
31560 gen_rtx_SET (VOIDmode,
31561 gen_rtx_MEM (SImode,
31562 gen_rtx_PRE_DEC (Pmode,
31563 stack_pointer_rtx)),
31564 operands[1]));
31565 emit_insn (
31566 gen_rtx_SET (VOIDmode,
31567 gen_rtx_MEM (SImode,
31568 gen_rtx_PRE_DEC (Pmode,
31569 stack_pointer_rtx)),
31570 operands[0]));
31571 }
31572 break;
31573 case HImode:
31574 /* Store HImodes as SImodes. */
31575 operand = gen_lowpart (SImode, operand);
31576 /* FALLTHRU */
31577 case SImode:
31578 emit_insn (
31579 gen_rtx_SET (VOIDmode,
31580 gen_rtx_MEM (GET_MODE (operand),
31581 gen_rtx_PRE_DEC (SImode,
31582 stack_pointer_rtx)),
31583 operand));
31584 break;
31585 default:
31586 gcc_unreachable ();
31587 }
31588 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31589 }
31590 return result;
31591 }
31592
 31593 /* Free the operand from memory.  */
31594 void
31595 ix86_free_from_memory (enum machine_mode mode)
31596 {
31597 if (!ix86_using_red_zone ())
31598 {
31599 int size;
31600
31601 if (mode == DImode || TARGET_64BIT)
31602 size = 8;
31603 else
31604 size = 4;
31605 /* Use LEA to deallocate stack space. In peephole2 it will be converted
 31606 	 to a pop or add instruction if registers are available.  */
31607 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31608 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31609 GEN_INT (size))));
31610 }
31611 }
31612
31613 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31614
31615 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31616 QImode must go into class Q_REGS.
31617 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31618 movdf to do mem-to-mem moves through integer regs. */
31619
31620 static reg_class_t
31621 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31622 {
31623 enum machine_mode mode = GET_MODE (x);
31624
31625 /* We're only allowed to return a subclass of CLASS. Many of the
31626 following checks fail for NO_REGS, so eliminate that early. */
31627 if (regclass == NO_REGS)
31628 return NO_REGS;
31629
31630 /* All classes can load zeros. */
31631 if (x == CONST0_RTX (mode))
31632 return regclass;
31633
31634 /* Force constants into memory if we are loading a (nonzero) constant into
31635 an MMX or SSE register. This is because there are no MMX/SSE instructions
31636 to load from a constant. */
31637 if (CONSTANT_P (x)
31638 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31639 return NO_REGS;
31640
31641 /* Prefer SSE regs only, if we can use them for math. */
31642 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31643 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31644
31645 /* Floating-point constants need more complex checks. */
31646 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31647 {
31648 /* General regs can load everything. */
31649 if (reg_class_subset_p (regclass, GENERAL_REGS))
31650 return regclass;
31651
31652 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31653 zero above. We only want to wind up preferring 80387 registers if
31654 we plan on doing computation with them. */
31655 if (TARGET_80387
31656 && standard_80387_constant_p (x) > 0)
31657 {
31658 /* Limit class to non-sse. */
31659 if (regclass == FLOAT_SSE_REGS)
31660 return FLOAT_REGS;
31661 if (regclass == FP_TOP_SSE_REGS)
31662 return FP_TOP_REG;
31663 if (regclass == FP_SECOND_SSE_REGS)
31664 return FP_SECOND_REG;
31665 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31666 return regclass;
31667 }
31668
31669 return NO_REGS;
31670 }
31671
 31672   /* Generally when we see PLUS here, it's the function invariant
 31673      (plus soft-fp const_int), which can only be computed into general
 31674      regs.  */
31675 if (GET_CODE (x) == PLUS)
31676 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31677
31678 /* QImode constants are easy to load, but non-constant QImode data
31679 must go into Q_REGS. */
31680 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31681 {
31682 if (reg_class_subset_p (regclass, Q_REGS))
31683 return regclass;
31684 if (reg_class_subset_p (Q_REGS, regclass))
31685 return Q_REGS;
31686 return NO_REGS;
31687 }
31688
31689 return regclass;
31690 }
31691
31692 /* Discourage putting floating-point values in SSE registers unless
31693 SSE math is being used, and likewise for the 387 registers. */
31694 static reg_class_t
31695 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31696 {
31697 enum machine_mode mode = GET_MODE (x);
31698
31699 /* Restrict the output reload class to the register bank that we are doing
31700 math on. If we would like not to return a subset of CLASS, reject this
31701 alternative: if reload cannot do this, it will still use its choice. */
31702 mode = GET_MODE (x);
31703 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31704 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31705
31706 if (X87_FLOAT_MODE_P (mode))
31707 {
31708 if (regclass == FP_TOP_SSE_REGS)
31709 return FP_TOP_REG;
31710 else if (regclass == FP_SECOND_SSE_REGS)
31711 return FP_SECOND_REG;
31712 else
31713 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31714 }
31715
31716 return regclass;
31717 }
31718
31719 static reg_class_t
31720 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31721 enum machine_mode mode, secondary_reload_info *sri)
31722 {
31723 /* Double-word spills from general registers to non-offsettable memory
31724 references (zero-extended addresses) require special handling. */
31725 if (TARGET_64BIT
31726 && MEM_P (x)
31727 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31728 && rclass == GENERAL_REGS
31729 && !offsettable_memref_p (x))
31730 {
31731 sri->icode = (in_p
31732 ? CODE_FOR_reload_noff_load
31733 : CODE_FOR_reload_noff_store);
 31734       /* Add the cost of moving the address to a temporary.  */
31735 sri->extra_cost = 1;
31736
31737 return NO_REGS;
31738 }
31739
 31740   /* QImode spills from non-QI registers require an
 31741      intermediate register on 32-bit targets.  */
31742 if (!TARGET_64BIT
31743 && !in_p && mode == QImode
31744 && (rclass == GENERAL_REGS
31745 || rclass == LEGACY_REGS
31746 || rclass == INDEX_REGS))
31747 {
31748 int regno;
31749
31750 if (REG_P (x))
31751 regno = REGNO (x);
31752 else
31753 regno = -1;
31754
31755 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31756 regno = true_regnum (x);
31757
31758 /* Return Q_REGS if the operand is in memory. */
31759 if (regno == -1)
31760 return Q_REGS;
31761 }
31762
 31763   /* This condition handles the corner case where an expression involving
31764 pointers gets vectorized. We're trying to use the address of a
31765 stack slot as a vector initializer.
31766
31767 (set (reg:V2DI 74 [ vect_cst_.2 ])
31768 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31769
31770 Eventually frame gets turned into sp+offset like this:
31771
31772 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31773 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31774 (const_int 392 [0x188]))))
31775
31776 That later gets turned into:
31777
31778 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31779 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31780 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31781
31782 We'll have the following reload recorded:
31783
31784 Reload 0: reload_in (DI) =
31785 (plus:DI (reg/f:DI 7 sp)
31786 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31787 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31788 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31789 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31790 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31791 reload_reg_rtx: (reg:V2DI 22 xmm1)
31792
31793 Which isn't going to work since SSE instructions can't handle scalar
31794 additions. Returning GENERAL_REGS forces the addition into integer
31795 register and reload can handle subsequent reloads without problems. */
31796
31797 if (in_p && GET_CODE (x) == PLUS
31798 && SSE_CLASS_P (rclass)
31799 && SCALAR_INT_MODE_P (mode))
31800 return GENERAL_REGS;
31801
31802 return NO_REGS;
31803 }
31804
31805 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31806
31807 static bool
31808 ix86_class_likely_spilled_p (reg_class_t rclass)
31809 {
31810 switch (rclass)
31811 {
31812 case AREG:
31813 case DREG:
31814 case CREG:
31815 case BREG:
31816 case AD_REGS:
31817 case SIREG:
31818 case DIREG:
31819 case SSE_FIRST_REG:
31820 case FP_TOP_REG:
31821 case FP_SECOND_REG:
31822 return true;
31823
31824 default:
31825 break;
31826 }
31827
31828 return false;
31829 }
31830
31831 /* If we are copying between general and FP registers, we need a memory
31832 location. The same is true for SSE and MMX registers.
31833
31834 To optimize register_move_cost performance, allow inline variant.
31835
 31836    The macro can't work reliably when one of the CLASSES is a class containing
 31837    registers from multiple units (SSE, MMX, integer).  We avoid this by never
 31838    combining those units in a single alternative in the machine description.
31839 Ensure that this constraint holds to avoid unexpected surprises.
31840
31841 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31842 enforce these sanity checks. */
31843
31844 static inline bool
31845 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31846 enum machine_mode mode, int strict)
31847 {
31848 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31849 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31850 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31851 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31852 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31853 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31854 {
31855 gcc_assert (!strict);
31856 return true;
31857 }
31858
31859 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31860 return true;
31861
31862 /* ??? This is a lie. We do have moves between mmx/general, and for
31863 mmx/sse2. But by saying we need secondary memory we discourage the
31864 register allocator from using the mmx registers unless needed. */
31865 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31866 return true;
31867
31868 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31869 {
31870 /* SSE1 doesn't have any direct moves from other classes. */
31871 if (!TARGET_SSE2)
31872 return true;
31873
31874 /* If the target says that inter-unit moves are more expensive
31875 than moving through memory, then don't generate them. */
31876 if (!TARGET_INTER_UNIT_MOVES)
31877 return true;
31878
31879 /* Between SSE and general, we have moves no larger than word size. */
31880 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31881 return true;
31882 }
31883
31884 return false;
31885 }
31886
31887 bool
31888 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31889 enum machine_mode mode, int strict)
31890 {
31891 return inline_secondary_memory_needed (class1, class2, mode, strict);
31892 }
31893
31894 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31895
31896 On the 80386, this is the size of MODE in words,
31897 except in the FP regs, where a single reg is always enough. */
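/* For example, an XFmode value occupies 3 word-sized integer registers
   with -m32 (2 with -m64), while a single x87 register always suffices.  */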
31898
31899 static unsigned char
31900 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31901 {
31902 if (MAYBE_INTEGER_CLASS_P (rclass))
31903 {
31904 if (mode == XFmode)
31905 return (TARGET_64BIT ? 2 : 3);
31906 else if (mode == XCmode)
31907 return (TARGET_64BIT ? 4 : 6);
31908 else
31909 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31910 }
31911 else
31912 {
31913 if (COMPLEX_MODE_P (mode))
31914 return 2;
31915 else
31916 return 1;
31917 }
31918 }
31919
31920 /* Return true if the registers in CLASS cannot represent the change from
31921 modes FROM to TO. */
31922
31923 bool
31924 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31925 enum reg_class regclass)
31926 {
31927 if (from == to)
31928 return false;
31929
31930 /* x87 registers can't do subreg at all, as all values are reformatted
31931 to extended precision. */
31932 if (MAYBE_FLOAT_CLASS_P (regclass))
31933 return true;
31934
31935 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31936 {
31937 /* Vector registers do not support QI or HImode loads. If we don't
31938 disallow a change to these modes, reload will assume it's ok to
31939 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31940 the vec_dupv4hi pattern. */
31941 if (GET_MODE_SIZE (from) < 4)
31942 return true;
31943
31944 /* Vector registers do not support subreg with nonzero offsets, which
31945 are otherwise valid for integer registers. Since we can't see
31946 whether we have a nonzero offset from here, prohibit all
31947 nonparadoxical subregs changing size. */
31948 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31949 return true;
31950 }
31951
31952 return false;
31953 }
31954
31955 /* Return the cost of moving data of mode M between a
31956 register and memory. A value of 2 is the default; this cost is
31957 relative to those in `REGISTER_MOVE_COST'.
31958
 31959    This function is used extensively by register_move_cost, which is used to
 31960    build tables at startup.  Make it inline in this case.
 31961    When IN is 2, return the maximum of the in and out move costs.
31962
31963 If moving between registers and memory is more expensive than
31964 between two registers, you should define this macro to express the
31965 relative cost.
31966
 31967    Also model the increased cost of moving QImode registers in
 31968    non-Q_REGS classes.
31969 */
31970 static inline int
31971 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31972 int in)
31973 {
31974 int cost;
31975 if (FLOAT_CLASS_P (regclass))
31976 {
31977 int index;
31978 switch (mode)
31979 {
31980 case SFmode:
31981 index = 0;
31982 break;
31983 case DFmode:
31984 index = 1;
31985 break;
31986 case XFmode:
31987 index = 2;
31988 break;
31989 default:
31990 return 100;
31991 }
31992 if (in == 2)
31993 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31994 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31995 }
31996 if (SSE_CLASS_P (regclass))
31997 {
31998 int index;
31999 switch (GET_MODE_SIZE (mode))
32000 {
32001 case 4:
32002 index = 0;
32003 break;
32004 case 8:
32005 index = 1;
32006 break;
32007 case 16:
32008 index = 2;
32009 break;
32010 default:
32011 return 100;
32012 }
32013 if (in == 2)
32014 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
32015 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
32016 }
32017 if (MMX_CLASS_P (regclass))
32018 {
32019 int index;
32020 switch (GET_MODE_SIZE (mode))
32021 {
32022 case 4:
32023 index = 0;
32024 break;
32025 case 8:
32026 index = 1;
32027 break;
32028 default:
32029 return 100;
32030 }
 32031       if (in == 2)
32032 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
32033 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
32034 }
32035 switch (GET_MODE_SIZE (mode))
32036 {
32037 case 1:
32038 if (Q_CLASS_P (regclass) || TARGET_64BIT)
32039 {
32040 if (!in)
32041 return ix86_cost->int_store[0];
32042 if (TARGET_PARTIAL_REG_DEPENDENCY
32043 && optimize_function_for_speed_p (cfun))
32044 cost = ix86_cost->movzbl_load;
32045 else
32046 cost = ix86_cost->int_load[0];
32047 if (in == 2)
32048 return MAX (cost, ix86_cost->int_store[0]);
32049 return cost;
32050 }
32051 else
32052 {
32053 if (in == 2)
32054 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
32055 if (in)
32056 return ix86_cost->movzbl_load;
32057 else
32058 return ix86_cost->int_store[0] + 4;
32059 }
32060 break;
32061 case 2:
32062 if (in == 2)
32063 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
32064 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
32065 default:
32066 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
32067 if (mode == TFmode)
32068 mode = XFmode;
32069 if (in == 2)
32070 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
32071 else if (in)
32072 cost = ix86_cost->int_load[2];
32073 else
32074 cost = ix86_cost->int_store[2];
32075 return (cost * (((int) GET_MODE_SIZE (mode)
32076 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
32077 }
32078 }
32079
32080 static int
32081 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
32082 bool in)
32083 {
32084 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
32085 }
32086
32087
32088 /* Return the cost of moving data from a register in class CLASS1 to
32089 one in class CLASS2.
32090
32091 It is not required that the cost always equal 2 when FROM is the same as TO;
32092 on some machines it is expensive to move between registers if they are not
32093 general registers. */
32094
32095 static int
32096 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
32097 reg_class_t class2_i)
32098 {
32099 enum reg_class class1 = (enum reg_class) class1_i;
32100 enum reg_class class2 = (enum reg_class) class2_i;
32101
 32102   /* In case we require secondary memory, compute the cost of the store
 32103      followed by the load.  In order to avoid bad register allocation choices,
 32104      we need this to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
32105
32106 if (inline_secondary_memory_needed (class1, class2, mode, 0))
32107 {
32108 int cost = 1;
32109
32110 cost += inline_memory_move_cost (mode, class1, 2);
32111 cost += inline_memory_move_cost (mode, class2, 2);
32112
 32113       /* In case of copying from a general purpose register we may emit multiple
 32114          stores followed by a single load, causing a memory size mismatch stall.
 32115          Count this as an arbitrarily high cost of 20.  */
32116 if (targetm.class_max_nregs (class1, mode)
32117 > targetm.class_max_nregs (class2, mode))
32118 cost += 20;
32119
32120 /* In the case of FP/MMX moves, the registers actually overlap, and we
32121 have to switch modes in order to treat them differently. */
32122 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
32123 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
32124 cost += 20;
32125
32126 return cost;
32127 }
32128
32129 /* Moves between SSE/MMX and integer unit are expensive. */
32130 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
32131 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32132
32133 /* ??? By keeping returned value relatively high, we limit the number
32134 of moves between integer and MMX/SSE registers for all targets.
32135 Additionally, high value prevents problem with x86_modes_tieable_p(),
32136 where integer modes in MMX/SSE registers are not tieable
32137 because of missing QImode and HImode moves to, from or between
32138 MMX/SSE registers. */
32139 return MAX (8, ix86_cost->mmxsse_to_integer);
32140
32141 if (MAYBE_FLOAT_CLASS_P (class1))
32142 return ix86_cost->fp_move;
32143 if (MAYBE_SSE_CLASS_P (class1))
32144 return ix86_cost->sse_move;
32145 if (MAYBE_MMX_CLASS_P (class1))
32146 return ix86_cost->mmx_move;
32147 return 2;
32148 }
32149
32150 /* Return TRUE if hard register REGNO can hold a value of machine-mode
32151 MODE. */
32152
32153 bool
32154 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
32155 {
 32156   /* The flags registers, and only the flags registers, can hold CCmode values.  */
32157 if (CC_REGNO_P (regno))
32158 return GET_MODE_CLASS (mode) == MODE_CC;
32159 if (GET_MODE_CLASS (mode) == MODE_CC
32160 || GET_MODE_CLASS (mode) == MODE_RANDOM
32161 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
32162 return false;
32163 if (FP_REGNO_P (regno))
32164 return VALID_FP_MODE_P (mode);
32165 if (SSE_REGNO_P (regno))
32166 {
32167 /* We implement the move patterns for all vector modes into and
32168 out of SSE registers, even when no operation instructions
32169 are available. OImode move is available only when AVX is
32170 enabled. */
32171 return ((TARGET_AVX && mode == OImode)
32172 || VALID_AVX256_REG_MODE (mode)
32173 || VALID_SSE_REG_MODE (mode)
32174 || VALID_SSE2_REG_MODE (mode)
32175 || VALID_MMX_REG_MODE (mode)
32176 || VALID_MMX_REG_MODE_3DNOW (mode));
32177 }
32178 if (MMX_REGNO_P (regno))
32179 {
32180 /* We implement the move patterns for 3DNOW modes even in MMX mode,
32181 so if the register is available at all, then we can move data of
32182 the given mode into or out of it. */
32183 return (VALID_MMX_REG_MODE (mode)
32184 || VALID_MMX_REG_MODE_3DNOW (mode));
32185 }
32186
32187 if (mode == QImode)
32188 {
32189 /* Take care for QImode values - they can be in non-QI regs,
32190 but then they do cause partial register stalls. */
32191 if (TARGET_64BIT || QI_REGNO_P (regno))
32192 return true;
32193 if (!TARGET_PARTIAL_REG_STALL)
32194 return true;
32195 return !can_create_pseudo_p ();
32196 }
32197 /* We handle both integer and floats in the general purpose registers. */
32198 else if (VALID_INT_MODE_P (mode))
32199 return true;
32200 else if (VALID_FP_MODE_P (mode))
32201 return true;
32202 else if (VALID_DFP_MODE_P (mode))
32203 return true;
32204 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
32205 on to use that value in smaller contexts, this can easily force a
32206 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
32207 supporting DImode, allow it. */
32208 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
32209 return true;
32210
32211 return false;
32212 }
32213
32214 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
32215 tieable integer mode. */
32216
32217 static bool
32218 ix86_tieable_integer_mode_p (enum machine_mode mode)
32219 {
32220 switch (mode)
32221 {
32222 case HImode:
32223 case SImode:
32224 return true;
32225
32226 case QImode:
32227 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
32228
32229 case DImode:
32230 return TARGET_64BIT;
32231
32232 default:
32233 return false;
32234 }
32235 }
32236
32237 /* Return true if MODE1 is accessible in a register that can hold MODE2
32238 without copying. That is, all register classes that can hold MODE2
32239 can also hold MODE1. */
32240
32241 bool
32242 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
32243 {
32244 if (mode1 == mode2)
32245 return true;
32246
32247 if (ix86_tieable_integer_mode_p (mode1)
32248 && ix86_tieable_integer_mode_p (mode2))
32249 return true;
32250
32251 /* MODE2 being XFmode implies fp stack or general regs, which means we
32252 can tie any smaller floating point modes to it. Note that we do not
32253 tie this with TFmode. */
32254 if (mode2 == XFmode)
32255 return mode1 == SFmode || mode1 == DFmode;
32256
32257 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
32258 that we can tie it with SFmode. */
32259 if (mode2 == DFmode)
32260 return mode1 == SFmode;
32261
32262 /* If MODE2 is only appropriate for an SSE register, then tie with
32263 any other mode acceptable to SSE registers. */
32264 if (GET_MODE_SIZE (mode2) == 32
32265 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32266 return (GET_MODE_SIZE (mode1) == 32
32267 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32268 if (GET_MODE_SIZE (mode2) == 16
32269 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32270 return (GET_MODE_SIZE (mode1) == 16
32271 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32272
32273 /* If MODE2 is appropriate for an MMX register, then tie
32274 with any other mode acceptable to MMX registers. */
32275 if (GET_MODE_SIZE (mode2) == 8
32276 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
32277 return (GET_MODE_SIZE (mode1) == 8
32278 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
32279
32280 return false;
32281 }
32282
32283 /* Return the cost of moving between two registers of mode MODE. */
32284
32285 static int
32286 ix86_set_reg_reg_cost (enum machine_mode mode)
32287 {
32288 unsigned int units = UNITS_PER_WORD;
32289
32290 switch (GET_MODE_CLASS (mode))
32291 {
32292 default:
32293 break;
32294
32295 case MODE_CC:
32296 units = GET_MODE_SIZE (CCmode);
32297 break;
32298
32299 case MODE_FLOAT:
32300 if ((TARGET_SSE && mode == TFmode)
32301 || (TARGET_80387 && mode == XFmode)
32302 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
32303 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
32304 units = GET_MODE_SIZE (mode);
32305 break;
32306
32307 case MODE_COMPLEX_FLOAT:
32308 if ((TARGET_SSE && mode == TCmode)
32309 || (TARGET_80387 && mode == XCmode)
32310 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
32311 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
32312 units = GET_MODE_SIZE (mode);
32313 break;
32314
32315 case MODE_VECTOR_INT:
32316 case MODE_VECTOR_FLOAT:
32317 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
32318 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
32319 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
32320 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
32321 units = GET_MODE_SIZE (mode);
32322 }
32323
32324 /* Return the cost of moving between two registers of mode MODE,
32325 assuming that the move will be in pieces of at most UNITS bytes. */
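  /* For example (illustration derived from the switch above): a 32-byte
     V8SF move costs a single insn when TARGET_AVX raises UNITS to 32, but
     four word-sized moves on a 64-bit target without AVX.  */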
32326 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
32327 }
32328
32329 /* Compute a (partial) cost for rtx X. Return true if the complete
32330 cost has been computed, and false if subexpressions should be
32331 scanned. In either case, *TOTAL contains the cost result. */
32332
32333 static bool
32334 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
32335 bool speed)
32336 {
32337 enum rtx_code code = (enum rtx_code) code_i;
32338 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
32339 enum machine_mode mode = GET_MODE (x);
32340 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32341
32342 switch (code)
32343 {
32344 case SET:
32345 if (register_operand (SET_DEST (x), VOIDmode)
32346 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32347 {
32348 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
32349 return true;
32350 }
32351 return false;
32352
32353 case CONST_INT:
32354 case CONST:
32355 case LABEL_REF:
32356 case SYMBOL_REF:
32357 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
32358 *total = 3;
32359 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
32360 *total = 2;
32361 else if (flag_pic && SYMBOLIC_CONST (x)
32362 && (!TARGET_64BIT
 32363 	           || (GET_CODE (x) != LABEL_REF
32364 && (GET_CODE (x) != SYMBOL_REF
32365 || !SYMBOL_REF_LOCAL_P (x)))))
32366 *total = 1;
32367 else
32368 *total = 0;
32369 return true;
32370
32371 case CONST_DOUBLE:
32372 if (mode == VOIDmode)
32373 {
32374 *total = 0;
32375 return true;
32376 }
32377 switch (standard_80387_constant_p (x))
32378 {
32379 case 1: /* 0.0 */
32380 *total = 1;
32381 return true;
32382 default: /* Other constants */
32383 *total = 2;
32384 return true;
32385 case 0:
32386 case -1:
32387 break;
32388 }
32389 if (SSE_FLOAT_MODE_P (mode))
32390 {
32391 case CONST_VECTOR:
32392 switch (standard_sse_constant_p (x))
32393 {
32394 case 0:
32395 break;
32396 case 1: /* 0: xor eliminates false dependency */
32397 *total = 0;
32398 return true;
32399 default: /* -1: cmp contains false dependency */
32400 *total = 1;
32401 return true;
32402 }
32403 }
32404 /* Fall back to (MEM (SYMBOL_REF)), since that's where
32405 it'll probably end up. Add a penalty for size. */
32406 *total = (COSTS_N_INSNS (1)
32407 + (flag_pic != 0 && !TARGET_64BIT)
32408 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
32409 return true;
32410
32411 case ZERO_EXTEND:
 32412       /* Zero extension is often completely free on x86_64, so make
32413 it as cheap as possible. */
32414 if (TARGET_64BIT && mode == DImode
32415 && GET_MODE (XEXP (x, 0)) == SImode)
32416 *total = 1;
32417 else if (TARGET_ZERO_EXTEND_WITH_AND)
32418 *total = cost->add;
32419 else
32420 *total = cost->movzx;
32421 return false;
32422
32423 case SIGN_EXTEND:
32424 *total = cost->movsx;
32425 return false;
32426
32427 case ASHIFT:
32428 if (SCALAR_INT_MODE_P (mode)
32429 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
32430 && CONST_INT_P (XEXP (x, 1)))
32431 {
32432 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32433 if (value == 1)
32434 {
32435 *total = cost->add;
32436 return false;
32437 }
32438 if ((value == 2 || value == 3)
32439 && cost->lea <= cost->shift_const)
32440 {
32441 *total = cost->lea;
32442 return false;
32443 }
32444 }
32445 /* FALLTHRU */
32446
32447 case ROTATE:
32448 case ASHIFTRT:
32449 case LSHIFTRT:
32450 case ROTATERT:
32451 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32452 {
32453 /* ??? Should be SSE vector operation cost. */
32454 /* At least for published AMD latencies, this really is the same
32455 as the latency for a simple fpu operation like fabs. */
32456 /* V*QImode is emulated with 1-11 insns. */
32457 if (mode == V16QImode || mode == V32QImode)
32458 {
32459 int count = 11;
32460 if (TARGET_XOP && mode == V16QImode)
32461 {
32462 /* For XOP we use vpshab, which requires a broadcast of the
32463 value to the variable shift insn. For constants this
32464 means a V16Q const in mem; even when we can perform the
32465 shift with one insn set the cost to prefer paddb. */
32466 if (CONSTANT_P (XEXP (x, 1)))
32467 {
32468 *total = (cost->fabs
32469 + rtx_cost (XEXP (x, 0), code, 0, speed)
32470 + (speed ? 2 : COSTS_N_BYTES (16)));
32471 return true;
32472 }
32473 count = 3;
32474 }
32475 else if (TARGET_SSSE3)
32476 count = 7;
32477 *total = cost->fabs * count;
32478 }
32479 else
32480 *total = cost->fabs;
32481 }
32482 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32483 {
32484 if (CONST_INT_P (XEXP (x, 1)))
32485 {
32486 if (INTVAL (XEXP (x, 1)) > 32)
32487 *total = cost->shift_const + COSTS_N_INSNS (2);
32488 else
32489 *total = cost->shift_const * 2;
32490 }
32491 else
32492 {
32493 if (GET_CODE (XEXP (x, 1)) == AND)
32494 *total = cost->shift_var * 2;
32495 else
32496 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
32497 }
32498 }
32499 else
32500 {
32501 if (CONST_INT_P (XEXP (x, 1)))
32502 *total = cost->shift_const;
32503 else
32504 *total = cost->shift_var;
32505 }
32506 return false;
32507
32508 case FMA:
32509 {
32510 rtx sub;
32511
32512 gcc_assert (FLOAT_MODE_P (mode));
32513 gcc_assert (TARGET_FMA || TARGET_FMA4);
32514
32515 /* ??? SSE scalar/vector cost should be used here. */
32516 /* ??? Bald assumption that fma has the same cost as fmul. */
32517 *total = cost->fmul;
32518 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
32519
32520 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
32521 sub = XEXP (x, 0);
32522 if (GET_CODE (sub) == NEG)
32523 sub = XEXP (sub, 0);
32524 *total += rtx_cost (sub, FMA, 0, speed);
32525
32526 sub = XEXP (x, 2);
32527 if (GET_CODE (sub) == NEG)
32528 sub = XEXP (sub, 0);
32529 *total += rtx_cost (sub, FMA, 2, speed);
32530 return true;
32531 }
32532
32533 case MULT:
32534 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32535 {
32536 /* ??? SSE scalar cost should be used here. */
32537 *total = cost->fmul;
32538 return false;
32539 }
32540 else if (X87_FLOAT_MODE_P (mode))
32541 {
32542 *total = cost->fmul;
32543 return false;
32544 }
32545 else if (FLOAT_MODE_P (mode))
32546 {
32547 /* ??? SSE vector cost should be used here. */
32548 *total = cost->fmul;
32549 return false;
32550 }
32551 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32552 {
32553 /* V*QImode is emulated with 7-13 insns. */
32554 if (mode == V16QImode || mode == V32QImode)
32555 {
32556 int extra = 11;
32557 if (TARGET_XOP && mode == V16QImode)
32558 extra = 5;
32559 else if (TARGET_SSSE3)
32560 extra = 6;
32561 *total = cost->fmul * 2 + cost->fabs * extra;
32562 }
32563 /* V*DImode is emulated with 5-8 insns. */
32564 else if (mode == V2DImode || mode == V4DImode)
32565 {
32566 if (TARGET_XOP && mode == V2DImode)
32567 *total = cost->fmul * 2 + cost->fabs * 3;
32568 else
32569 *total = cost->fmul * 3 + cost->fabs * 5;
32570 }
32571 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
32572 insns, including two PMULUDQ. */
32573 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
32574 *total = cost->fmul * 2 + cost->fabs * 5;
32575 else
32576 *total = cost->fmul;
32577 return false;
32578 }
32579 else
32580 {
32581 rtx op0 = XEXP (x, 0);
32582 rtx op1 = XEXP (x, 1);
32583 int nbits;
32584 if (CONST_INT_P (XEXP (x, 1)))
32585 {
32586 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
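	      /* Count the set bits in the constant multiplier; each set bit
	         contributes cost->mult_bit below (value &= value - 1 clears
	         the lowest set bit).  */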
32587 for (nbits = 0; value != 0; value &= value - 1)
32588 nbits++;
32589 }
32590 else
32591 /* This is arbitrary. */
32592 nbits = 7;
32593
32594 /* Compute costs correctly for widening multiplication. */
32595 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32596 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32597 == GET_MODE_SIZE (mode))
32598 {
32599 int is_mulwiden = 0;
32600 enum machine_mode inner_mode = GET_MODE (op0);
32601
32602 if (GET_CODE (op0) == GET_CODE (op1))
32603 is_mulwiden = 1, op1 = XEXP (op1, 0);
32604 else if (CONST_INT_P (op1))
32605 {
32606 if (GET_CODE (op0) == SIGN_EXTEND)
32607 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32608 == INTVAL (op1);
32609 else
32610 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32611 }
32612
32613 if (is_mulwiden)
32614 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32615 }
32616
32617 *total = (cost->mult_init[MODE_INDEX (mode)]
32618 + nbits * cost->mult_bit
32619 + rtx_cost (op0, outer_code, opno, speed)
32620 + rtx_cost (op1, outer_code, opno, speed));
32621
32622 return true;
32623 }
32624
32625 case DIV:
32626 case UDIV:
32627 case MOD:
32628 case UMOD:
32629 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32630 /* ??? SSE cost should be used here. */
32631 *total = cost->fdiv;
32632 else if (X87_FLOAT_MODE_P (mode))
32633 *total = cost->fdiv;
32634 else if (FLOAT_MODE_P (mode))
32635 /* ??? SSE vector cost should be used here. */
32636 *total = cost->fdiv;
32637 else
32638 *total = cost->divide[MODE_INDEX (mode)];
32639 return false;
32640
32641 case PLUS:
32642 if (GET_MODE_CLASS (mode) == MODE_INT
32643 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
32644 {
32645 if (GET_CODE (XEXP (x, 0)) == PLUS
32646 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32647 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32648 && CONSTANT_P (XEXP (x, 1)))
32649 {
32650 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32651 if (val == 2 || val == 4 || val == 8)
32652 {
32653 *total = cost->lea;
32654 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32655 outer_code, opno, speed);
32656 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32657 outer_code, opno, speed);
32658 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32659 return true;
32660 }
32661 }
32662 else if (GET_CODE (XEXP (x, 0)) == MULT
32663 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32664 {
32665 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32666 if (val == 2 || val == 4 || val == 8)
32667 {
32668 *total = cost->lea;
32669 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32670 outer_code, opno, speed);
32671 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32672 return true;
32673 }
32674 }
32675 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32676 {
32677 *total = cost->lea;
32678 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32679 outer_code, opno, speed);
32680 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32681 outer_code, opno, speed);
32682 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32683 return true;
32684 }
32685 }
32686 /* FALLTHRU */
32687
32688 case MINUS:
32689 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32690 {
32691 /* ??? SSE cost should be used here. */
32692 *total = cost->fadd;
32693 return false;
32694 }
32695 else if (X87_FLOAT_MODE_P (mode))
32696 {
32697 *total = cost->fadd;
32698 return false;
32699 }
32700 else if (FLOAT_MODE_P (mode))
32701 {
32702 /* ??? SSE vector cost should be used here. */
32703 *total = cost->fadd;
32704 return false;
32705 }
32706 /* FALLTHRU */
32707
32708 case AND:
32709 case IOR:
32710 case XOR:
32711 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32712 {
32713 *total = (cost->add * 2
32714 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32715 << (GET_MODE (XEXP (x, 0)) != DImode))
32716 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32717 << (GET_MODE (XEXP (x, 1)) != DImode)));
32718 return true;
32719 }
32720 /* FALLTHRU */
32721
32722 case NEG:
32723 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32724 {
32725 /* ??? SSE cost should be used here. */
32726 *total = cost->fchs;
32727 return false;
32728 }
32729 else if (X87_FLOAT_MODE_P (mode))
32730 {
32731 *total = cost->fchs;
32732 return false;
32733 }
32734 else if (FLOAT_MODE_P (mode))
32735 {
32736 /* ??? SSE vector cost should be used here. */
32737 *total = cost->fchs;
32738 return false;
32739 }
32740 /* FALLTHRU */
32741
32742 case NOT:
32743 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32744 {
32745 /* ??? Should be SSE vector operation cost. */
32746 /* At least for published AMD latencies, this really is the same
32747 as the latency for a simple fpu operation like fabs. */
32748 *total = cost->fabs;
32749 }
32750 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32751 *total = cost->add * 2;
32752 else
32753 *total = cost->add;
32754 return false;
32755
32756 case COMPARE:
32757 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32758 && XEXP (XEXP (x, 0), 1) == const1_rtx
32759 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32760 && XEXP (x, 1) == const0_rtx)
32761 {
32762 /* This kind of construct is implemented using test[bwl].
32763 Treat it as if we had an AND. */
32764 *total = (cost->add
32765 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32766 + rtx_cost (const1_rtx, outer_code, opno, speed));
32767 return true;
32768 }
32769 return false;
32770
32771 case FLOAT_EXTEND:
32772 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32773 *total = 0;
32774 return false;
32775
32776 case ABS:
32777 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32778 /* ??? SSE cost should be used here. */
32779 *total = cost->fabs;
32780 else if (X87_FLOAT_MODE_P (mode))
32781 *total = cost->fabs;
32782 else if (FLOAT_MODE_P (mode))
32783 /* ??? SSE vector cost should be used here. */
32784 *total = cost->fabs;
32785 return false;
32786
32787 case SQRT:
32788 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32789 /* ??? SSE cost should be used here. */
32790 *total = cost->fsqrt;
32791 else if (X87_FLOAT_MODE_P (mode))
32792 *total = cost->fsqrt;
32793 else if (FLOAT_MODE_P (mode))
32794 /* ??? SSE vector cost should be used here. */
32795 *total = cost->fsqrt;
32796 return false;
32797
32798 case UNSPEC:
32799 if (XINT (x, 1) == UNSPEC_TP)
32800 *total = 0;
32801 return false;
32802
32803 case VEC_SELECT:
32804 case VEC_CONCAT:
32805 case VEC_MERGE:
32806 case VEC_DUPLICATE:
32807 /* ??? Assume all of these vector manipulation patterns are
32808 recognizable. In which case they all pretty much have the
32809 same cost. */
32810 *total = cost->fabs;
32811 return true;
32812
32813 default:
32814 return false;
32815 }
32816 }
32817
32818 #if TARGET_MACHO
32819
32820 static int current_machopic_label_num;
32821
32822 /* Given a symbol name and its associated stub, write out the
32823 definition of the stub. */
32824
32825 void
32826 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32827 {
32828 unsigned int length;
32829 char *binder_name, *symbol_name, lazy_ptr_name[32];
32830 int label = ++current_machopic_label_num;
32831
32832 /* For 64-bit we shouldn't get here. */
32833 gcc_assert (!TARGET_64BIT);
32834
32835 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32836 symb = targetm.strip_name_encoding (symb);
32837
32838 length = strlen (stub);
32839 binder_name = XALLOCAVEC (char, length + 32);
32840 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32841
32842 length = strlen (symb);
32843 symbol_name = XALLOCAVEC (char, length + 32);
32844 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32845
32846 sprintf (lazy_ptr_name, "L%d$lz", label);
32847
32848 if (MACHOPIC_ATT_STUB)
32849 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32850 else if (MACHOPIC_PURE)
32851 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32852 else
32853 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32854
32855 fprintf (file, "%s:\n", stub);
32856 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32857
32858 if (MACHOPIC_ATT_STUB)
32859 {
32860 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32861 }
32862 else if (MACHOPIC_PURE)
32863 {
32864 /* PIC stub. */
32865 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32866 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32867 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32868 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32869 label, lazy_ptr_name, label);
32870 fprintf (file, "\tjmp\t*%%ecx\n");
32871 }
32872 else
32873 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32874
32875 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32876 it needs no stub-binding-helper. */
32877 if (MACHOPIC_ATT_STUB)
32878 return;
32879
32880 fprintf (file, "%s:\n", binder_name);
32881
32882 if (MACHOPIC_PURE)
32883 {
32884 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32885 fprintf (file, "\tpushl\t%%ecx\n");
32886 }
32887 else
32888 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32889
32890 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32891
32892 /* N.B. Keep the correspondence of these
32893 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32894 old-pic/new-pic/non-pic stubs; altering this will break
32895 compatibility with existing dylibs. */
32896 if (MACHOPIC_PURE)
32897 {
32898 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32899 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32900 }
32901 else
32902 /* 16-byte -mdynamic-no-pic stub. */
32903 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32904
32905 fprintf (file, "%s:\n", lazy_ptr_name);
32906 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32907 fprintf (file, ASM_LONG "%s\n", binder_name);
32908 }
32909 #endif /* TARGET_MACHO */
32910
32911 /* Order the registers for register allocator. */
32912
32913 void
32914 x86_order_regs_for_local_alloc (void)
32915 {
32916 int pos = 0;
32917 int i;
32918
32919 /* First allocate the local general purpose registers. */
32920 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32921 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32922 reg_alloc_order [pos++] = i;
32923
32924 /* Global general purpose registers. */
32925 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32926 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32927 reg_alloc_order [pos++] = i;
32928
32929 /* x87 registers come first in case we are doing FP math
32930 using them. */
32931 if (!TARGET_SSE_MATH)
32932 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32933 reg_alloc_order [pos++] = i;
32934
32935 /* SSE registers. */
32936 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32937 reg_alloc_order [pos++] = i;
32938 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32939 reg_alloc_order [pos++] = i;
32940
32941 /* x87 registers. */
32942 if (TARGET_SSE_MATH)
32943 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32944 reg_alloc_order [pos++] = i;
32945
32946 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32947 reg_alloc_order [pos++] = i;
32948
32949 /* Initialize the rest of array as we do not allocate some registers
32950 at all. */
32951 while (pos < FIRST_PSEUDO_REGISTER)
32952 reg_alloc_order [pos++] = 0;
32953 }
32954
32955 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32956 in struct attribute_spec handler. */
32957 static tree
32958 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32959 tree args,
32960 int flags ATTRIBUTE_UNUSED,
32961 bool *no_add_attrs)
32962 {
32963 if (TREE_CODE (*node) != FUNCTION_TYPE
32964 && TREE_CODE (*node) != METHOD_TYPE
32965 && TREE_CODE (*node) != FIELD_DECL
32966 && TREE_CODE (*node) != TYPE_DECL)
32967 {
32968 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32969 name);
32970 *no_add_attrs = true;
32971 return NULL_TREE;
32972 }
32973 if (TARGET_64BIT)
32974 {
32975 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32976 name);
32977 *no_add_attrs = true;
32978 return NULL_TREE;
32979 }
32980 if (is_attribute_p ("callee_pop_aggregate_return", name))
32981 {
32982 tree cst;
32983
32984 cst = TREE_VALUE (args);
32985 if (TREE_CODE (cst) != INTEGER_CST)
32986 {
32987 warning (OPT_Wattributes,
32988 "%qE attribute requires an integer constant argument",
32989 name);
32990 *no_add_attrs = true;
32991 }
32992 else if (compare_tree_int (cst, 0) != 0
32993 && compare_tree_int (cst, 1) != 0)
32994 {
32995 warning (OPT_Wattributes,
32996 "argument to %qE attribute is neither zero, nor one",
32997 name);
32998 *no_add_attrs = true;
32999 }
33000
33001 return NULL_TREE;
33002 }
33003
33004 return NULL_TREE;
33005 }
33006
33007 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
33008 struct attribute_spec.handler. */
33009 static tree
33010 ix86_handle_abi_attribute (tree *node, tree name,
33011 tree args ATTRIBUTE_UNUSED,
33012 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33013 {
33014 if (TREE_CODE (*node) != FUNCTION_TYPE
33015 && TREE_CODE (*node) != METHOD_TYPE
33016 && TREE_CODE (*node) != FIELD_DECL
33017 && TREE_CODE (*node) != TYPE_DECL)
33018 {
33019 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33020 name);
33021 *no_add_attrs = true;
33022 return NULL_TREE;
33023 }
33024
33025 /* Can combine regparm with all attributes but fastcall. */
33026 if (is_attribute_p ("ms_abi", name))
33027 {
33028 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
33029 {
33030 error ("ms_abi and sysv_abi attributes are not compatible");
33031 }
33032
33033 return NULL_TREE;
33034 }
33035 else if (is_attribute_p ("sysv_abi", name))
33036 {
33037 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
33038 {
33039 error ("ms_abi and sysv_abi attributes are not compatible");
33040 }
33041
33042 return NULL_TREE;
33043 }
33044
33045 return NULL_TREE;
33046 }
33047
33048 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
33049 struct attribute_spec.handler. */
33050 static tree
33051 ix86_handle_struct_attribute (tree *node, tree name,
33052 tree args ATTRIBUTE_UNUSED,
33053 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33054 {
33055 tree *type = NULL;
33056 if (DECL_P (*node))
33057 {
33058 if (TREE_CODE (*node) == TYPE_DECL)
33059 type = &TREE_TYPE (*node);
33060 }
33061 else
33062 type = node;
33063
33064 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
33065 {
33066 warning (OPT_Wattributes, "%qE attribute ignored",
33067 name);
33068 *no_add_attrs = true;
33069 }
33070
33071 else if ((is_attribute_p ("ms_struct", name)
33072 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
33073 || ((is_attribute_p ("gcc_struct", name)
33074 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
33075 {
33076 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
33077 name);
33078 *no_add_attrs = true;
33079 }
33080
33081 return NULL_TREE;
33082 }
33083
33084 static tree
33085 ix86_handle_fndecl_attribute (tree *node, tree name,
33086 tree args ATTRIBUTE_UNUSED,
33087 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33088 {
33089 if (TREE_CODE (*node) != FUNCTION_DECL)
33090 {
33091 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33092 name);
33093 *no_add_attrs = true;
33094 }
33095 return NULL_TREE;
33096 }
33097
33098 static bool
33099 ix86_ms_bitfield_layout_p (const_tree record_type)
33100 {
33101 return ((TARGET_MS_BITFIELD_LAYOUT
33102 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
33103 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
33104 }
33105
33106 /* Returns an expression indicating where the this parameter is
33107 located on entry to the FUNCTION. */
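/* For example (illustrative, 32-bit cases only): with the fastcall
   convention `this' arrives in %ecx, or in %edx when a hidden
   aggregate-return pointer is also passed; with no register parameters it
   is the first stack word above the return address (the second when an
   aggregate-return pointer is present).  */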
33108
33109 static rtx
33110 x86_this_parameter (tree function)
33111 {
33112 tree type = TREE_TYPE (function);
33113 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
33114 int nregs;
33115
33116 if (TARGET_64BIT)
33117 {
33118 const int *parm_regs;
33119
33120 if (ix86_function_type_abi (type) == MS_ABI)
33121 parm_regs = x86_64_ms_abi_int_parameter_registers;
33122 else
33123 parm_regs = x86_64_int_parameter_registers;
33124 return gen_rtx_REG (Pmode, parm_regs[aggr]);
33125 }
33126
33127 nregs = ix86_function_regparm (type, function);
33128
33129 if (nregs > 0 && !stdarg_p (type))
33130 {
33131 int regno;
33132 unsigned int ccvt = ix86_get_callcvt (type);
33133
33134 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
33135 regno = aggr ? DX_REG : CX_REG;
33136 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
33137 {
33138 regno = CX_REG;
33139 if (aggr)
33140 return gen_rtx_MEM (SImode,
33141 plus_constant (Pmode, stack_pointer_rtx, 4));
33142 }
33143 else
33144 {
33145 regno = AX_REG;
33146 if (aggr)
33147 {
33148 regno = DX_REG;
33149 if (nregs == 1)
33150 return gen_rtx_MEM (SImode,
33151 plus_constant (Pmode,
33152 stack_pointer_rtx, 4));
33153 }
33154 }
33155 return gen_rtx_REG (SImode, regno);
33156 }
33157
33158 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
33159 aggr ? 8 : 4));
33160 }
33161
33162 /* Determine whether x86_output_mi_thunk can succeed. */
33163
33164 static bool
33165 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
33166 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
33167 HOST_WIDE_INT vcall_offset, const_tree function)
33168 {
33169 /* 64-bit can handle anything. */
33170 if (TARGET_64BIT)
33171 return true;
33172
33173 /* For 32-bit, everything's fine if we have one free register. */
33174 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
33175 return true;
33176
33177 /* Need a free register for vcall_offset. */
33178 if (vcall_offset)
33179 return false;
33180
33181 /* Need a free register for GOT references. */
33182 if (flag_pic && !targetm.binds_local_p (function))
33183 return false;
33184
33185 /* Otherwise ok. */
33186 return true;
33187 }
33188
33189 /* Output the assembler code for a thunk function. THUNK_DECL is the
33190 declaration for the thunk function itself, FUNCTION is the decl for
33191 the target function. DELTA is an immediate constant offset to be
33192 added to THIS. If VCALL_OFFSET is nonzero, the word at
33193 *(*this + vcall_offset) should be added to THIS. */
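/* In rough pseudo-code (an illustrative sketch, not the emitted RTL),
   the thunk generated below is:

       this += DELTA;
       if (VCALL_OFFSET)
         this += *(*this + VCALL_OFFSET);
       goto FUNCTION;          <- tail call, no stack frame of its own
*/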
33194
33195 static void
33196 x86_output_mi_thunk (FILE *file,
33197 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
33198 HOST_WIDE_INT vcall_offset, tree function)
33199 {
33200 rtx this_param = x86_this_parameter (function);
33201 rtx this_reg, tmp, fnaddr;
33202 unsigned int tmp_regno;
33203
33204 if (TARGET_64BIT)
33205 tmp_regno = R10_REG;
33206 else
33207 {
33208 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
33209 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
33210 tmp_regno = AX_REG;
33211 else
33212 tmp_regno = CX_REG;
33213 }
33214
33215 emit_note (NOTE_INSN_PROLOGUE_END);
33216
33217 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
33218 pull it in now and let DELTA benefit. */
33219 if (REG_P (this_param))
33220 this_reg = this_param;
33221 else if (vcall_offset)
33222 {
33223 /* Put the this parameter into %eax. */
33224 this_reg = gen_rtx_REG (Pmode, AX_REG);
33225 emit_move_insn (this_reg, this_param);
33226 }
33227 else
33228 this_reg = NULL_RTX;
33229
33230 /* Adjust the this parameter by a fixed constant. */
33231 if (delta)
33232 {
33233 rtx delta_rtx = GEN_INT (delta);
33234 rtx delta_dst = this_reg ? this_reg : this_param;
33235
33236 if (TARGET_64BIT)
33237 {
33238 if (!x86_64_general_operand (delta_rtx, Pmode))
33239 {
33240 tmp = gen_rtx_REG (Pmode, tmp_regno);
33241 emit_move_insn (tmp, delta_rtx);
33242 delta_rtx = tmp;
33243 }
33244 }
33245
33246 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
33247 }
33248
33249 /* Adjust the this parameter by a value stored in the vtable. */
33250 if (vcall_offset)
33251 {
33252 rtx vcall_addr, vcall_mem, this_mem;
33253
33254 tmp = gen_rtx_REG (Pmode, tmp_regno);
33255
33256 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
33257 if (Pmode != ptr_mode)
33258 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
33259 emit_move_insn (tmp, this_mem);
33260
33261 /* Adjust the this parameter. */
33262 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
33263 if (TARGET_64BIT
33264 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
33265 {
33266 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
33267 emit_move_insn (tmp2, GEN_INT (vcall_offset));
33268 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
33269 }
33270
33271 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
33272 if (Pmode != ptr_mode)
33273 emit_insn (gen_addsi_1_zext (this_reg,
33274 gen_rtx_REG (ptr_mode,
33275 REGNO (this_reg)),
33276 vcall_mem));
33277 else
33278 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
33279 }
33280
33281 /* If necessary, drop THIS back to its stack slot. */
33282 if (this_reg && this_reg != this_param)
33283 emit_move_insn (this_param, this_reg);
33284
33285 fnaddr = XEXP (DECL_RTL (function), 0);
33286 if (TARGET_64BIT)
33287 {
33288 if (!flag_pic || targetm.binds_local_p (function)
33289 || cfun->machine->call_abi == MS_ABI)
33290 ;
33291 else
33292 {
33293 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
33294 tmp = gen_rtx_CONST (Pmode, tmp);
33295 fnaddr = gen_rtx_MEM (Pmode, tmp);
33296 }
33297 }
33298 else
33299 {
33300 if (!flag_pic || targetm.binds_local_p (function))
33301 ;
33302 #if TARGET_MACHO
33303 else if (TARGET_MACHO)
33304 {
33305 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
33306 fnaddr = XEXP (fnaddr, 0);
33307 }
33308 #endif /* TARGET_MACHO */
33309 else
33310 {
33311 tmp = gen_rtx_REG (Pmode, CX_REG);
33312 output_set_got (tmp, NULL_RTX);
33313
33314 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
33315 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
33316 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
33317 }
33318 }
33319
33320 /* Our sibling call patterns do not allow memories, because we have no
33321 predicate that can distinguish between frame and non-frame memory.
33322 For our purposes here, we can get away with (ab)using a jump pattern,
33323 because we're going to do no optimization. */
33324 if (MEM_P (fnaddr))
33325 emit_jump_insn (gen_indirect_jump (fnaddr));
33326 else
33327 {
33328 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
33329 fnaddr = legitimize_pic_address (fnaddr,
33330 gen_rtx_REG (Pmode, tmp_regno));
33331
33332 if (!sibcall_insn_operand (fnaddr, word_mode))
33333 {
33334 tmp = gen_rtx_REG (word_mode, tmp_regno);
33335 if (GET_MODE (fnaddr) != word_mode)
33336 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
33337 emit_move_insn (tmp, fnaddr);
33338 fnaddr = tmp;
33339 }
33340
33341 tmp = gen_rtx_MEM (QImode, fnaddr);
33342 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
33343 tmp = emit_call_insn (tmp);
33344 SIBLING_CALL_P (tmp) = 1;
33345 }
33346 emit_barrier ();
33347
33348 /* Emit just enough of rest_of_compilation to get the insns emitted.
33349 Note that use_thunk calls assemble_start_function et al. */
33350 tmp = get_insns ();
33351 insn_locators_alloc ();
33352 shorten_branches (tmp);
33353 final_start_function (tmp, file, 1);
33354 final (tmp, file, 1);
33355 final_end_function ();
33356 }
33357
33358 static void
33359 x86_file_start (void)
33360 {
33361 default_file_start ();
33362 #if TARGET_MACHO
33363 darwin_file_start ();
33364 #endif
33365 if (X86_FILE_START_VERSION_DIRECTIVE)
33366 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
33367 if (X86_FILE_START_FLTUSED)
33368 fputs ("\t.global\t__fltused\n", asm_out_file);
33369 if (ix86_asm_dialect == ASM_INTEL)
33370 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
33371 }
33372
33373 int
33374 x86_field_alignment (tree field, int computed)
33375 {
33376 enum machine_mode mode;
33377 tree type = TREE_TYPE (field);
33378
33379 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
33380 return computed;
33381 mode = TYPE_MODE (strip_array_types (type));
33382 if (mode == DFmode || mode == DCmode
33383 || GET_MODE_CLASS (mode) == MODE_INT
33384 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
33385 return MIN (32, computed);
33386 return computed;
33387 }
33388
33389 /* Output assembler code to FILE to increment profiler label # LABELNO
33390 for profiling a function entry. */
33391 void
33392 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
33393 {
33394 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
33395 : MCOUNT_NAME);
33396
33397 if (TARGET_64BIT)
33398 {
33399 #ifndef NO_PROFILE_COUNTERS
33400 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
33401 #endif
33402
33403 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
33404 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
33405 else
33406 fprintf (file, "\tcall\t%s\n", mcount_name);
33407 }
33408 else if (flag_pic)
33409 {
33410 #ifndef NO_PROFILE_COUNTERS
33411 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
33412 LPREFIX, labelno);
33413 #endif
33414 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
33415 }
33416 else
33417 {
33418 #ifndef NO_PROFILE_COUNTERS
33419 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
33420 LPREFIX, labelno);
33421 #endif
33422 fprintf (file, "\tcall\t%s\n", mcount_name);
33423 }
33424 }
33425
33426 /* We don't have exact information about the insn sizes, but we may assume
33427 quite safely that we are informed about all 1-byte insns and memory
33428 address sizes. This is enough to eliminate unnecessary padding in
33429 99% of cases. */
33430
33431 static int
33432 min_insn_size (rtx insn)
33433 {
33434 int l = 0, len;
33435
33436 if (!INSN_P (insn) || !active_insn_p (insn))
33437 return 0;
33438
33439 /* Discard alignments we've emitted and jump table data. */
33440 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
33441 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
33442 return 0;
33443 if (JUMP_TABLE_DATA_P (insn))
33444 return 0;
33445
33446 /* Important case - calls are always 5 bytes.
33447 It is common to have many calls in a row. */
33448 if (CALL_P (insn)
33449 && symbolic_reference_mentioned_p (PATTERN (insn))
33450 && !SIBLING_CALL_P (insn))
33451 return 5;
33452 len = get_attr_length (insn);
33453 if (len <= 1)
33454 return 1;
33455
33456 /* For normal instructions we rely on get_attr_length being exact,
33457 with a few exceptions. */
33458 if (!JUMP_P (insn))
33459 {
33460 enum attr_type type = get_attr_type (insn);
33461
33462 switch (type)
33463 {
33464 case TYPE_MULTI:
33465 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
33466 || asm_noperands (PATTERN (insn)) >= 0)
33467 return 0;
33468 break;
33469 case TYPE_OTHER:
33470 case TYPE_FCMP:
33471 break;
33472 default:
33473 /* Otherwise trust get_attr_length. */
33474 return len;
33475 }
33476
33477 l = get_attr_length_address (insn);
33478 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
33479 l = 4;
33480 }
33481 if (l)
33482 return 1+l;
33483 else
33484 return 2;
33485 }
33486
33487 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33488
33489 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
33490 16-byte window. */
33491
33492 static void
33493 ix86_avoid_jump_mispredicts (void)
33494 {
33495 rtx insn, start = get_insns ();
33496 int nbytes = 0, njumps = 0;
33497 int isjump = 0;
33498
33499 /* Look for all minimal intervals of instructions containing 4 jumps.
33500 The intervals are bounded by START and INSN. NBYTES is the total
33501 size of the instructions in the interval, including INSN and not
33502 including START. When NBYTES is smaller than 16 bytes, it is possible
33503 that the ends of START and INSN fall into the same 16-byte window.
33504
33505 The smallest offset at which INSN can start is the case where START
33506 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
33507 We emit a p2align for the 16-byte window with maxskip
33508 15 - NBYTES + sizeof (INSN). */
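/* A worked example with made-up numbers: if NBYTES is 12 and INSN is a
   2-byte jump, the pad emitted before INSN may skip up to
   15 - 12 + 2 = 5 bytes, per the maxskip formula described above.  */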
33509 for (insn = start; insn; insn = NEXT_INSN (insn))
33510 {
33511 int min_size;
33512
33513 if (LABEL_P (insn))
33514 {
33515 int align = label_to_alignment (insn);
33516 int max_skip = label_to_max_skip (insn);
33517
33518 if (max_skip > 15)
33519 max_skip = 15;
33520 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
33521 already in the current 16 byte page, because otherwise
33522 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
33523 bytes to reach 16 byte boundary. */
33524 if (align <= 0
33525 || (align <= 3 && max_skip != (1 << align) - 1))
33526 max_skip = 0;
33527 if (dump_file)
33528 fprintf (dump_file, "Label %i with max_skip %i\n",
33529 INSN_UID (insn), max_skip);
33530 if (max_skip)
33531 {
33532 while (nbytes + max_skip >= 16)
33533 {
33534 start = NEXT_INSN (start);
33535 if ((JUMP_P (start)
33536 && GET_CODE (PATTERN (start)) != ADDR_VEC
33537 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33538 || CALL_P (start))
33539 njumps--, isjump = 1;
33540 else
33541 isjump = 0;
33542 nbytes -= min_insn_size (start);
33543 }
33544 }
33545 continue;
33546 }
33547
33548 min_size = min_insn_size (insn);
33549 nbytes += min_size;
33550 if (dump_file)
33551 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
33552 INSN_UID (insn), min_size);
33553 if ((JUMP_P (insn)
33554 && GET_CODE (PATTERN (insn)) != ADDR_VEC
33555 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
33556 || CALL_P (insn))
33557 njumps++;
33558 else
33559 continue;
33560
33561 while (njumps > 3)
33562 {
33563 start = NEXT_INSN (start);
33564 if ((JUMP_P (start)
33565 && GET_CODE (PATTERN (start)) != ADDR_VEC
33566 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33567 || CALL_P (start))
33568 njumps--, isjump = 1;
33569 else
33570 isjump = 0;
33571 nbytes -= min_insn_size (start);
33572 }
33573 gcc_assert (njumps >= 0);
33574 if (dump_file)
33575 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33576 INSN_UID (start), INSN_UID (insn), nbytes);
33577
33578 if (njumps == 3 && isjump && nbytes < 16)
33579 {
33580 int padsize = 15 - nbytes + min_insn_size (insn);
33581
33582 if (dump_file)
33583 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33584 INSN_UID (insn), padsize);
33585 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33586 }
33587 }
33588 }
33589 #endif
33590
33591 /* AMD Athlon works faster
33592 when RET is not the destination of a conditional jump or directly preceded
33593 by another jump instruction. We avoid the penalty by inserting a NOP just
33594 before the RET instruction in such cases. */
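/* Example of the pattern this pass rewrites (labels are illustrative):

       jne   .L2
       ret

   Here the return directly follows a conditional jump, so it is replaced
   by the longer simple_return_internal_long pattern to avoid the
   mispredict penalty.  */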
33595 static void
33596 ix86_pad_returns (void)
33597 {
33598 edge e;
33599 edge_iterator ei;
33600
33601 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33602 {
33603 basic_block bb = e->src;
33604 rtx ret = BB_END (bb);
33605 rtx prev;
33606 bool replace = false;
33607
33608 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33609 || optimize_bb_for_size_p (bb))
33610 continue;
33611 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33612 if (active_insn_p (prev) || LABEL_P (prev))
33613 break;
33614 if (prev && LABEL_P (prev))
33615 {
33616 edge e;
33617 edge_iterator ei;
33618
33619 FOR_EACH_EDGE (e, ei, bb->preds)
33620 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33621 && !(e->flags & EDGE_FALLTHRU))
33622 replace = true;
33623 }
33624 if (!replace)
33625 {
33626 prev = prev_active_insn (ret);
33627 if (prev
33628 && ((JUMP_P (prev) && any_condjump_p (prev))
33629 || CALL_P (prev)))
33630 replace = true;
33631 /* Empty functions get a branch mispredict even when
33632 the jump destination is not visible to us. */
33633 if (!prev && !optimize_function_for_size_p (cfun))
33634 replace = true;
33635 }
33636 if (replace)
33637 {
33638 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33639 delete_insn (ret);
33640 }
33641 }
33642 }
33643
33644 /* Count the minimum number of instructions in BB. Return 4 if the
33645 number of instructions >= 4. */
33646
33647 static int
33648 ix86_count_insn_bb (basic_block bb)
33649 {
33650 rtx insn;
33651 int insn_count = 0;
33652
33653 /* Count number of instructions in this block. Return 4 if the number
33654 of instructions >= 4. */
33655 FOR_BB_INSNS (bb, insn)
33656 {
33657 /* This only happens in exit blocks. */
33658 if (JUMP_P (insn)
33659 && ANY_RETURN_P (PATTERN (insn)))
33660 break;
33661
33662 if (NONDEBUG_INSN_P (insn)
33663 && GET_CODE (PATTERN (insn)) != USE
33664 && GET_CODE (PATTERN (insn)) != CLOBBER)
33665 {
33666 insn_count++;
33667 if (insn_count >= 4)
33668 return insn_count;
33669 }
33670 }
33671
33672 return insn_count;
33673 }
33674
33675
33676 /* Count the minimum number of instructions in a code path through BB.
33677 Return 4 if the number of instructions >= 4. */
33678
33679 static int
33680 ix86_count_insn (basic_block bb)
33681 {
33682 edge e;
33683 edge_iterator ei;
33684 int min_prev_count;
33685
33686 /* Only bother counting instructions along paths with no
33687 more than 2 basic blocks between entry and exit. Given
33688 that BB has an edge to exit, determine if a predecessor
33689 of BB has an edge from entry. If so, compute the number
33690 of instructions in the predecessor block. If there
33691 happen to be multiple such blocks, compute the minimum. */
33692 min_prev_count = 4;
33693 FOR_EACH_EDGE (e, ei, bb->preds)
33694 {
33695 edge prev_e;
33696 edge_iterator prev_ei;
33697
33698 if (e->src == ENTRY_BLOCK_PTR)
33699 {
33700 min_prev_count = 0;
33701 break;
33702 }
33703 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33704 {
33705 if (prev_e->src == ENTRY_BLOCK_PTR)
33706 {
33707 int count = ix86_count_insn_bb (e->src);
33708 if (count < min_prev_count)
33709 min_prev_count = count;
33710 break;
33711 }
33712 }
33713 }
33714
33715 if (min_prev_count < 4)
33716 min_prev_count += ix86_count_insn_bb (bb);
33717
33718 return min_prev_count;
33719 }
33720
33721 /* Pad short function to 4 instructions. */
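/* For example (an illustrative calculation): a path counted as 2
   instructions gets 2 * (4 - 2) = 4 NOPs emitted just before the
   epilogue; since two NOPs are counted as one instruction, this brings
   the path up to the required 4-instruction minimum.  */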
33722
33723 static void
33724 ix86_pad_short_function (void)
33725 {
33726 edge e;
33727 edge_iterator ei;
33728
33729 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33730 {
33731 rtx ret = BB_END (e->src);
33732 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33733 {
33734 int insn_count = ix86_count_insn (e->src);
33735
33736 /* Pad short function. */
33737 if (insn_count < 4)
33738 {
33739 rtx insn = ret;
33740
33741 /* Find epilogue. */
33742 while (insn
33743 && (!NOTE_P (insn)
33744 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33745 insn = PREV_INSN (insn);
33746
33747 if (!insn)
33748 insn = ret;
33749
33750 /* Two NOPs count as one instruction. */
33751 insn_count = 2 * (4 - insn_count);
33752 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33753 }
33754 }
33755 }
33756 }
33757
33758 /* Implement machine specific optimizations. We implement padding of returns
33759 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
33760 static void
33761 ix86_reorg (void)
33762 {
33763 /* We are freeing block_for_insn in the toplev to keep compatibility
33764 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33765 compute_bb_for_insn ();
33766
33767 /* Run the vzeroupper optimization if needed. */
33768 if (TARGET_VZEROUPPER)
33769 move_or_delete_vzeroupper ();
33770
33771 if (optimize && optimize_function_for_speed_p (cfun))
33772 {
33773 if (TARGET_PAD_SHORT_FUNCTION)
33774 ix86_pad_short_function ();
33775 else if (TARGET_PAD_RETURNS)
33776 ix86_pad_returns ();
33777 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33778 if (TARGET_FOUR_JUMP_LIMIT)
33779 ix86_avoid_jump_mispredicts ();
33780 #endif
33781 }
33782 }
33783
33784 /* Return nonzero when a QImode register that must be represented via a REX
33785 prefix is used. */
33786 bool
33787 x86_extended_QIreg_mentioned_p (rtx insn)
33788 {
33789 int i;
33790 extract_insn_cached (insn);
33791 for (i = 0; i < recog_data.n_operands; i++)
33792 if (GENERAL_REG_P (recog_data.operand[i])
33793 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
33794 return true;
33795 return false;
33796 }
33797
33798 /* Return nonzero when P points to a register encoded via a REX prefix.
33799 Called via for_each_rtx. */
33800 static int
33801 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33802 {
33803 unsigned int regno;
33804 if (!REG_P (*p))
33805 return 0;
33806 regno = REGNO (*p);
33807 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33808 }
33809
33810 /* Return true when INSN mentions a register that must be encoded using a REX
33811 prefix. */
33812 bool
33813 x86_extended_reg_mentioned_p (rtx insn)
33814 {
33815 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33816 extended_reg_mentioned_1, NULL);
33817 }
33818
33819 /* If profitable, negate (without causing overflow) integer constant
33820 of mode MODE at location LOC. Return true in this case. */
33821 bool
33822 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33823 {
33824 HOST_WIDE_INT val;
33825
33826 if (!CONST_INT_P (*loc))
33827 return false;
33828
33829 switch (mode)
33830 {
33831 case DImode:
33832 /* DImode x86_64 constants must fit in 32 bits. */
33833 gcc_assert (x86_64_immediate_operand (*loc, mode));
33834
33835 mode = SImode;
33836 break;
33837
33838 case SImode:
33839 case HImode:
33840 case QImode:
33841 break;
33842
33843 default:
33844 gcc_unreachable ();
33845 }
33846
33847 /* Avoid overflows. */
33848 if (mode_signbit_p (mode, *loc))
33849 return false;
33850
33851 val = INTVAL (*loc);
33852
33853 /* Make things pretty by using `subl $4,%eax' rather than `addl $-4,%eax'.
33854 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
33855 if ((val < 0 && val != -128)
33856 || val == 128)
33857 {
33858 *loc = GEN_INT (-val);
33859 return true;
33860 }
33861
33862 return false;
33863 }
33864
33865 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33866 optabs would emit if we didn't have TFmode patterns. */
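/* The expansion below is roughly equivalent to this C sketch (assuming a
   DImode input and a DFmode result; the names are illustrative).  For a
   negative (i.e. large unsigned) input we halve the value, fold the lost
   low bit back in so rounding is unaffected, convert, and then double:

       if ((int64_t) in >= 0)
         out = (double) (int64_t) in;
       else
         {
           uint64_t half = (in >> 1) | (in & 1);
           double f = (double) (int64_t) half;
           out = f + f;
         }
*/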
33867
33868 void
33869 x86_emit_floatuns (rtx operands[2])
33870 {
33871 rtx neglab, donelab, i0, i1, f0, in, out;
33872 enum machine_mode mode, inmode;
33873
33874 inmode = GET_MODE (operands[1]);
33875 gcc_assert (inmode == SImode || inmode == DImode);
33876
33877 out = operands[0];
33878 in = force_reg (inmode, operands[1]);
33879 mode = GET_MODE (out);
33880 neglab = gen_label_rtx ();
33881 donelab = gen_label_rtx ();
33882 f0 = gen_reg_rtx (mode);
33883
33884 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33885
33886 expand_float (out, in, 0);
33887
33888 emit_jump_insn (gen_jump (donelab));
33889 emit_barrier ();
33890
33891 emit_label (neglab);
33892
33893 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33894 1, OPTAB_DIRECT);
33895 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33896 1, OPTAB_DIRECT);
33897 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33898
33899 expand_float (f0, i0, 0);
33900
33901 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33902
33903 emit_label (donelab);
33904 }
33905 \f
33906 /* AVX2 does support 32-byte integer vector operations,
33907 thus the longest vector we are faced with is V32QImode. */
33908 #define MAX_VECT_LEN 32
33909
33910 struct expand_vec_perm_d
33911 {
33912 rtx target, op0, op1;
33913 unsigned char perm[MAX_VECT_LEN];
33914 enum machine_mode vmode;
33915 unsigned char nelt;
33916 bool one_operand_p;
33917 bool testing_p;
33918 };
33919
33920 static bool canonicalize_perm (struct expand_vec_perm_d *d);
33921 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33922 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33923
33924 /* Get a vector mode of the same size as the original but with elements
33925 twice as wide. This is only guaranteed to apply to integral vectors. */
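/* For example, V16QImode widens to V8HImode and V8HImode to V4SImode:
   the byte size stays the same while the element count halves.  */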
33926
33927 static inline enum machine_mode
33928 get_mode_wider_vector (enum machine_mode o)
33929 {
33930 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33931 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33932 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33933 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33934 return n;
33935 }
33936
33937 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33938 with all elements equal to VAR. Return true if successful. */
33939
33940 static bool
33941 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33942 rtx target, rtx val)
33943 {
33944 bool ok;
33945
33946 switch (mode)
33947 {
33948 case V2SImode:
33949 case V2SFmode:
33950 if (!mmx_ok)
33951 return false;
33952 /* FALLTHRU */
33953
33954 case V4DFmode:
33955 case V4DImode:
33956 case V8SFmode:
33957 case V8SImode:
33958 case V2DFmode:
33959 case V2DImode:
33960 case V4SFmode:
33961 case V4SImode:
33962 {
33963 rtx insn, dup;
33964
33965 /* First attempt to recognize VAL as-is. */
33966 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33967 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33968 if (recog_memoized (insn) < 0)
33969 {
33970 rtx seq;
33971 /* If that fails, force VAL into a register. */
33972
33973 start_sequence ();
33974 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33975 seq = get_insns ();
33976 end_sequence ();
33977 if (seq)
33978 emit_insn_before (seq, insn);
33979
33980 ok = recog_memoized (insn) >= 0;
33981 gcc_assert (ok);
33982 }
33983 }
33984 return true;
33985
33986 case V4HImode:
33987 if (!mmx_ok)
33988 return false;
33989 if (TARGET_SSE || TARGET_3DNOW_A)
33990 {
33991 rtx x;
33992
33993 val = gen_lowpart (SImode, val);
33994 x = gen_rtx_TRUNCATE (HImode, val);
33995 x = gen_rtx_VEC_DUPLICATE (mode, x);
33996 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33997 return true;
33998 }
33999 goto widen;
34000
34001 case V8QImode:
34002 if (!mmx_ok)
34003 return false;
34004 goto widen;
34005
34006 case V8HImode:
34007 if (TARGET_SSE2)
34008 {
34009 struct expand_vec_perm_d dperm;
34010 rtx tmp1, tmp2;
34011
34012 permute:
34013 memset (&dperm, 0, sizeof (dperm));
34014 dperm.target = target;
34015 dperm.vmode = mode;
34016 dperm.nelt = GET_MODE_NUNITS (mode);
34017 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
34018 dperm.one_operand_p = true;
34019
34020 /* Extend to SImode using a paradoxical SUBREG. */
34021 tmp1 = gen_reg_rtx (SImode);
34022 emit_move_insn (tmp1, gen_lowpart (SImode, val));
34023
34024 /* Insert the SImode value as low element of a V4SImode vector. */
34025 tmp2 = gen_lowpart (V4SImode, dperm.op0);
34026 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
34027
34028 ok = (expand_vec_perm_1 (&dperm)
34029 || expand_vec_perm_broadcast_1 (&dperm));
34030 gcc_assert (ok);
34031 return ok;
34032 }
34033 goto widen;
34034
34035 case V16QImode:
34036 if (TARGET_SSE2)
34037 goto permute;
34038 goto widen;
34039
34040 widen:
34041 /* Replicate the value once into the next wider mode and recurse. */
34042 {
34043 enum machine_mode smode, wsmode, wvmode;
34044 rtx x;
34045
34046 smode = GET_MODE_INNER (mode);
34047 wvmode = get_mode_wider_vector (mode);
34048 wsmode = GET_MODE_INNER (wvmode);
34049
34050 val = convert_modes (wsmode, smode, val, true);
34051 x = expand_simple_binop (wsmode, ASHIFT, val,
34052 GEN_INT (GET_MODE_BITSIZE (smode)),
34053 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34054 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
34055
34056 x = gen_lowpart (wvmode, target);
34057 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
34058 gcc_assert (ok);
34059 return ok;
34060 }
34061
34062 case V16HImode:
34063 case V32QImode:
34064 {
34065 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
34066 rtx x = gen_reg_rtx (hvmode);
34067
34068 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
34069 gcc_assert (ok);
34070
34071 x = gen_rtx_VEC_CONCAT (mode, x, x);
34072 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34073 }
34074 return true;
34075
34076 default:
34077 return false;
34078 }
34079 }
34080
34081 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34082 whose ONE_VAR element is VAR, and other elements are zero. Return true
34083 if successful. */
34084
34085 static bool
34086 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
34087 rtx target, rtx var, int one_var)
34088 {
34089 enum machine_mode vsimode;
34090 rtx new_target;
34091 rtx x, tmp;
34092 bool use_vector_set = false;
34093
34094 switch (mode)
34095 {
34096 case V2DImode:
34097 /* For SSE4.1, we normally use vector set. But if the second
34098 element is zero and inter-unit moves are OK, we use movq
34099 instead. */
34100 use_vector_set = (TARGET_64BIT
34101 && TARGET_SSE4_1
34102 && !(TARGET_INTER_UNIT_MOVES
34103 && one_var == 0));
34104 break;
34105 case V16QImode:
34106 case V4SImode:
34107 case V4SFmode:
34108 use_vector_set = TARGET_SSE4_1;
34109 break;
34110 case V8HImode:
34111 use_vector_set = TARGET_SSE2;
34112 break;
34113 case V4HImode:
34114 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
34115 break;
34116 case V32QImode:
34117 case V16HImode:
34118 case V8SImode:
34119 case V8SFmode:
34120 case V4DFmode:
34121 use_vector_set = TARGET_AVX;
34122 break;
34123 case V4DImode:
34124 /* Use ix86_expand_vector_set in 64bit mode only. */
34125 use_vector_set = TARGET_AVX && TARGET_64BIT;
34126 break;
34127 default:
34128 break;
34129 }
34130
34131 if (use_vector_set)
34132 {
34133 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
34134 var = force_reg (GET_MODE_INNER (mode), var);
34135 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34136 return true;
34137 }
34138
34139 switch (mode)
34140 {
34141 case V2SFmode:
34142 case V2SImode:
34143 if (!mmx_ok)
34144 return false;
34145 /* FALLTHRU */
34146
34147 case V2DFmode:
34148 case V2DImode:
34149 if (one_var != 0)
34150 return false;
34151 var = force_reg (GET_MODE_INNER (mode), var);
34152 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
34153 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34154 return true;
34155
34156 case V4SFmode:
34157 case V4SImode:
34158 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
34159 new_target = gen_reg_rtx (mode);
34160 else
34161 new_target = target;
34162 var = force_reg (GET_MODE_INNER (mode), var);
34163 x = gen_rtx_VEC_DUPLICATE (mode, var);
34164 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
34165 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
34166 if (one_var != 0)
34167 {
34168 /* We need to shuffle the value to the correct position, so
34169 create a new pseudo to store the intermediate result. */
34170
34171 /* With SSE2, we can use the integer shuffle insns. */
34172 if (mode != V4SFmode && TARGET_SSE2)
34173 {
34174 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
34175 const1_rtx,
34176 GEN_INT (one_var == 1 ? 0 : 1),
34177 GEN_INT (one_var == 2 ? 0 : 1),
34178 GEN_INT (one_var == 3 ? 0 : 1)));
34179 if (target != new_target)
34180 emit_move_insn (target, new_target);
34181 return true;
34182 }
34183
34184 /* Otherwise convert the intermediate result to V4SFmode and
34185 use the SSE1 shuffle instructions. */
34186 if (mode != V4SFmode)
34187 {
34188 tmp = gen_reg_rtx (V4SFmode);
34189 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
34190 }
34191 else
34192 tmp = new_target;
34193
34194 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
34195 const1_rtx,
34196 GEN_INT (one_var == 1 ? 0 : 1),
34197 GEN_INT (one_var == 2 ? 0+4 : 1+4),
34198 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
34199
34200 if (mode != V4SFmode)
34201 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
34202 else if (tmp != target)
34203 emit_move_insn (target, tmp);
34204 }
34205 else if (target != new_target)
34206 emit_move_insn (target, new_target);
34207 return true;
34208
34209 case V8HImode:
34210 case V16QImode:
34211 vsimode = V4SImode;
34212 goto widen;
34213 case V4HImode:
34214 case V8QImode:
34215 if (!mmx_ok)
34216 return false;
34217 vsimode = V2SImode;
34218 goto widen;
34219 widen:
34220 if (one_var != 0)
34221 return false;
34222
34223 /* Zero extend the variable element to SImode and recurse. */
34224 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
34225
34226 x = gen_reg_rtx (vsimode);
34227 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
34228 var, one_var))
34229 gcc_unreachable ();
34230
34231 emit_move_insn (target, gen_lowpart (mode, x));
34232 return true;
34233
34234 default:
34235 return false;
34236 }
34237 }
34238
34239 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34240 consisting of the values in VALS. It is known that all elements
34241 except ONE_VAR are constants. Return true if successful. */
34242
34243 static bool
34244 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
34245 rtx target, rtx vals, int one_var)
34246 {
34247 rtx var = XVECEXP (vals, 0, one_var);
34248 enum machine_mode wmode;
34249 rtx const_vec, x;
34250
34251 const_vec = copy_rtx (vals);
34252 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
34253 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
34254
34255 switch (mode)
34256 {
34257 case V2DFmode:
34258 case V2DImode:
34259 case V2SFmode:
34260 case V2SImode:
34261 /* For the two element vectors, it's just as easy to use
34262 the general case. */
34263 return false;
34264
34265 case V4DImode:
34266 /* Use ix86_expand_vector_set in 64bit mode only. */
34267 if (!TARGET_64BIT)
34268 return false;
34269 case V4DFmode:
34270 case V8SFmode:
34271 case V8SImode:
34272 case V16HImode:
34273 case V32QImode:
34274 case V4SFmode:
34275 case V4SImode:
34276 case V8HImode:
34277 case V4HImode:
34278 break;
34279
34280 case V16QImode:
34281 if (TARGET_SSE4_1)
34282 break;
34283 wmode = V8HImode;
34284 goto widen;
34285 case V8QImode:
34286 wmode = V4HImode;
34287 goto widen;
34288 widen:
34289 /* There's no way to set one QImode entry easily. Combine
34290 the variable value with its adjacent constant value, and
34291 promote to an HImode set. */
34292 x = XVECEXP (vals, 0, one_var ^ 1);
34293 if (one_var & 1)
34294 {
34295 var = convert_modes (HImode, QImode, var, true);
34296 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
34297 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34298 x = GEN_INT (INTVAL (x) & 0xff);
34299 }
34300 else
34301 {
34302 var = convert_modes (HImode, QImode, var, true);
34303 x = gen_int_mode (INTVAL (x) << 8, HImode);
34304 }
34305 if (x != const0_rtx)
34306 var = expand_simple_binop (HImode, IOR, var, x, var,
34307 1, OPTAB_LIB_WIDEN);
34308
34309 x = gen_reg_rtx (wmode);
34310 emit_move_insn (x, gen_lowpart (wmode, const_vec));
34311 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
34312
34313 emit_move_insn (target, gen_lowpart (mode, x));
34314 return true;
34315
34316 default:
34317 return false;
34318 }
34319
34320 emit_move_insn (target, const_vec);
34321 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34322 return true;
34323 }
34324
34325 /* A subroutine of ix86_expand_vector_init_general. Use vector
34326 concatenate to handle the most general case: all values variable,
34327 and none identical. */
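/* For instance (illustrative): building a V8SFmode vector from eight
   scalar operands proceeds pairwise, 8 scalars -> 4 x V2SFmode
   -> 2 x V4SFmode -> 1 x V8SFmode, each step a VEC_CONCAT of the
   previous level's results.  */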
34328
34329 static void
34330 ix86_expand_vector_init_concat (enum machine_mode mode,
34331 rtx target, rtx *ops, int n)
34332 {
34333 enum machine_mode cmode, hmode = VOIDmode;
34334 rtx first[8], second[4];
34335 rtvec v;
34336 int i, j;
34337
34338 switch (n)
34339 {
34340 case 2:
34341 switch (mode)
34342 {
34343 case V8SImode:
34344 cmode = V4SImode;
34345 break;
34346 case V8SFmode:
34347 cmode = V4SFmode;
34348 break;
34349 case V4DImode:
34350 cmode = V2DImode;
34351 break;
34352 case V4DFmode:
34353 cmode = V2DFmode;
34354 break;
34355 case V4SImode:
34356 cmode = V2SImode;
34357 break;
34358 case V4SFmode:
34359 cmode = V2SFmode;
34360 break;
34361 case V2DImode:
34362 cmode = DImode;
34363 break;
34364 case V2SImode:
34365 cmode = SImode;
34366 break;
34367 case V2DFmode:
34368 cmode = DFmode;
34369 break;
34370 case V2SFmode:
34371 cmode = SFmode;
34372 break;
34373 default:
34374 gcc_unreachable ();
34375 }
34376
34377 if (!register_operand (ops[1], cmode))
34378 ops[1] = force_reg (cmode, ops[1]);
34379 if (!register_operand (ops[0], cmode))
34380 ops[0] = force_reg (cmode, ops[0]);
34381 emit_insn (gen_rtx_SET (VOIDmode, target,
34382 gen_rtx_VEC_CONCAT (mode, ops[0],
34383 ops[1])));
34384 break;
34385
34386 case 4:
34387 switch (mode)
34388 {
34389 case V4DImode:
34390 cmode = V2DImode;
34391 break;
34392 case V4DFmode:
34393 cmode = V2DFmode;
34394 break;
34395 case V4SImode:
34396 cmode = V2SImode;
34397 break;
34398 case V4SFmode:
34399 cmode = V2SFmode;
34400 break;
34401 default:
34402 gcc_unreachable ();
34403 }
34404 goto half;
34405
34406 case 8:
34407 switch (mode)
34408 {
34409 case V8SImode:
34410 cmode = V2SImode;
34411 hmode = V4SImode;
34412 break;
34413 case V8SFmode:
34414 cmode = V2SFmode;
34415 hmode = V4SFmode;
34416 break;
34417 default:
34418 gcc_unreachable ();
34419 }
34420 goto half;
34421
34422 half:
34423 /* FIXME: We process inputs backward to help RA. PR 36222. */
34424 i = n - 1;
34425 j = (n >> 1) - 1;
34426 for (; i > 0; i -= 2, j--)
34427 {
34428 first[j] = gen_reg_rtx (cmode);
34429 v = gen_rtvec (2, ops[i - 1], ops[i]);
34430 ix86_expand_vector_init (false, first[j],
34431 gen_rtx_PARALLEL (cmode, v));
34432 }
34433
34434 n >>= 1;
34435 if (n > 2)
34436 {
34437 gcc_assert (hmode != VOIDmode);
34438 for (i = j = 0; i < n; i += 2, j++)
34439 {
34440 second[j] = gen_reg_rtx (hmode);
34441 ix86_expand_vector_init_concat (hmode, second [j],
34442 &first [i], 2);
34443 }
34444 n >>= 1;
34445 ix86_expand_vector_init_concat (mode, target, second, n);
34446 }
34447 else
34448 ix86_expand_vector_init_concat (mode, target, first, n);
34449 break;
34450
34451 default:
34452 gcc_unreachable ();
34453 }
34454 }
34455
34456 /* A subroutine of ix86_expand_vector_init_general. Use vector
34457 interleave to handle the most general case: all values variable,
34458 and none identical. */
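/* For V8HImode, for example (illustrative): pairs of elements are first
   packed into four vectors, which are then combined by successive
   "interleave low" steps, first as V4SImode and then as V2DImode, until
   a single full-width vector remains.  */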
34459
34460 static void
34461 ix86_expand_vector_init_interleave (enum machine_mode mode,
34462 rtx target, rtx *ops, int n)
34463 {
34464 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
34465 int i, j;
34466 rtx op0, op1;
34467 rtx (*gen_load_even) (rtx, rtx, rtx);
34468 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
34469 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
34470
34471 switch (mode)
34472 {
34473 case V8HImode:
34474 gen_load_even = gen_vec_setv8hi;
34475 gen_interleave_first_low = gen_vec_interleave_lowv4si;
34476 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34477 inner_mode = HImode;
34478 first_imode = V4SImode;
34479 second_imode = V2DImode;
34480 third_imode = VOIDmode;
34481 break;
34482 case V16QImode:
34483 gen_load_even = gen_vec_setv16qi;
34484 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
34485 gen_interleave_second_low = gen_vec_interleave_lowv4si;
34486 inner_mode = QImode;
34487 first_imode = V8HImode;
34488 second_imode = V4SImode;
34489 third_imode = V2DImode;
34490 break;
34491 default:
34492 gcc_unreachable ();
34493 }
34494
34495 for (i = 0; i < n; i++)
34496 {
34497 /* Extend the odd element to SImode using a paradoxical SUBREG. */
34498 op0 = gen_reg_rtx (SImode);
34499 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
34500
34501 /* Insert the SImode value as low element of V4SImode vector. */
34502 op1 = gen_reg_rtx (V4SImode);
34503 op0 = gen_rtx_VEC_MERGE (V4SImode,
34504 gen_rtx_VEC_DUPLICATE (V4SImode,
34505 op0),
34506 CONST0_RTX (V4SImode),
34507 const1_rtx);
34508 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
34509
34510 /* Cast the V4SImode vector back to a vector in the original mode. */
34511 op0 = gen_reg_rtx (mode);
34512 emit_move_insn (op0, gen_lowpart (mode, op1));
34513
34514 /* Load even elements into the second position. */
34515 emit_insn (gen_load_even (op0,
34516 force_reg (inner_mode,
34517 ops [i + i + 1]),
34518 const1_rtx));
34519
34520 /* Cast vector to FIRST_IMODE vector. */
34521 ops[i] = gen_reg_rtx (first_imode);
34522 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
34523 }
34524
34525 /* Interleave low FIRST_IMODE vectors. */
34526 for (i = j = 0; i < n; i += 2, j++)
34527 {
34528 op0 = gen_reg_rtx (first_imode);
34529 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
34530
34531 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
34532 ops[j] = gen_reg_rtx (second_imode);
34533 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
34534 }
34535
34536 /* Interleave low SECOND_IMODE vectors. */
34537 switch (second_imode)
34538 {
34539 case V4SImode:
34540 for (i = j = 0; i < n / 2; i += 2, j++)
34541 {
34542 op0 = gen_reg_rtx (second_imode);
34543 emit_insn (gen_interleave_second_low (op0, ops[i],
34544 ops[i + 1]));
34545
34546 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
34547 vector. */
34548 ops[j] = gen_reg_rtx (third_imode);
34549 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
34550 }
34551 second_imode = V2DImode;
34552 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34553 /* FALLTHRU */
34554
34555 case V2DImode:
34556 op0 = gen_reg_rtx (second_imode);
34557 emit_insn (gen_interleave_second_low (op0, ops[0],
34558 ops[1]));
34559
34560 /* Cast the SECOND_IMODE vector back to a vector in the original
34561 mode. */
34562 emit_insn (gen_rtx_SET (VOIDmode, target,
34563 gen_lowpart (mode, op0)));
34564 break;
34565
34566 default:
34567 gcc_unreachable ();
34568 }
34569 }
34570
34571 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
34572 all values variable, and none identical. */
34573
34574 static void
34575 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34576 rtx target, rtx vals)
34577 {
34578 rtx ops[32], op0, op1;
34579 enum machine_mode half_mode = VOIDmode;
34580 int n, i;
34581
34582 switch (mode)
34583 {
34584 case V2SFmode:
34585 case V2SImode:
34586 if (!mmx_ok && !TARGET_SSE)
34587 break;
34588 /* FALLTHRU */
34589
34590 case V8SFmode:
34591 case V8SImode:
34592 case V4DFmode:
34593 case V4DImode:
34594 case V4SFmode:
34595 case V4SImode:
34596 case V2DFmode:
34597 case V2DImode:
34598 n = GET_MODE_NUNITS (mode);
34599 for (i = 0; i < n; i++)
34600 ops[i] = XVECEXP (vals, 0, i);
34601 ix86_expand_vector_init_concat (mode, target, ops, n);
34602 return;
34603
34604 case V32QImode:
34605 half_mode = V16QImode;
34606 goto half;
34607
34608 case V16HImode:
34609 half_mode = V8HImode;
34610 goto half;
34611
34612 half:
34613 n = GET_MODE_NUNITS (mode);
34614 for (i = 0; i < n; i++)
34615 ops[i] = XVECEXP (vals, 0, i);
34616 op0 = gen_reg_rtx (half_mode);
34617 op1 = gen_reg_rtx (half_mode);
34618 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34619 n >> 2);
34620 ix86_expand_vector_init_interleave (half_mode, op1,
34621 &ops [n >> 1], n >> 2);
34622 emit_insn (gen_rtx_SET (VOIDmode, target,
34623 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34624 return;
34625
34626 case V16QImode:
34627 if (!TARGET_SSE4_1)
34628 break;
34629 /* FALLTHRU */
34630
34631 case V8HImode:
34632 if (!TARGET_SSE2)
34633 break;
34634
34635 /* Don't use ix86_expand_vector_init_interleave if we can't
34636 move from GPR to SSE register directly. */
34637 if (!TARGET_INTER_UNIT_MOVES)
34638 break;
34639
34640 n = GET_MODE_NUNITS (mode);
34641 for (i = 0; i < n; i++)
34642 ops[i] = XVECEXP (vals, 0, i);
34643 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34644 return;
34645
34646 case V4HImode:
34647 case V8QImode:
34648 break;
34649
34650 default:
34651 gcc_unreachable ();
34652 }
34653
34654 {
34655 int i, j, n_elts, n_words, n_elt_per_word;
34656 enum machine_mode inner_mode;
34657 rtx words[4], shift;
34658
34659 inner_mode = GET_MODE_INNER (mode);
34660 n_elts = GET_MODE_NUNITS (mode);
34661 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34662 n_elt_per_word = n_elts / n_words;
34663 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34664
34665 for (i = 0; i < n_words; ++i)
34666 {
34667 rtx word = NULL_RTX;
34668
34669 for (j = 0; j < n_elt_per_word; ++j)
34670 {
34671 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34672 elt = convert_modes (word_mode, inner_mode, elt, true);
34673
34674 if (j == 0)
34675 word = elt;
34676 else
34677 {
34678 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34679 word, 1, OPTAB_LIB_WIDEN);
34680 word = expand_simple_binop (word_mode, IOR, word, elt,
34681 word, 1, OPTAB_LIB_WIDEN);
34682 }
34683 }
34684
34685 words[i] = word;
34686 }
34687
34688 if (n_words == 1)
34689 emit_move_insn (target, gen_lowpart (mode, words[0]));
34690 else if (n_words == 2)
34691 {
34692 rtx tmp = gen_reg_rtx (mode);
34693 emit_clobber (tmp);
34694 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34695 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34696 emit_move_insn (target, tmp);
34697 }
34698 else if (n_words == 4)
34699 {
34700 rtx tmp = gen_reg_rtx (V4SImode);
34701 gcc_assert (word_mode == SImode);
34702 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34703 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34704 emit_move_insn (target, gen_lowpart (mode, tmp));
34705 }
34706 else
34707 gcc_unreachable ();
34708 }
34709 }
34710
34711 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34712 instructions unless MMX_OK is true. */
34713
34714 void
34715 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34716 {
34717 enum machine_mode mode = GET_MODE (target);
34718 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34719 int n_elts = GET_MODE_NUNITS (mode);
34720 int n_var = 0, one_var = -1;
34721 bool all_same = true, all_const_zero = true;
34722 int i;
34723 rtx x;
34724
34725 for (i = 0; i < n_elts; ++i)
34726 {
34727 x = XVECEXP (vals, 0, i);
34728 if (!(CONST_INT_P (x)
34729 || GET_CODE (x) == CONST_DOUBLE
34730 || GET_CODE (x) == CONST_FIXED))
34731 n_var++, one_var = i;
34732 else if (x != CONST0_RTX (inner_mode))
34733 all_const_zero = false;
34734 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34735 all_same = false;
34736 }
34737
34738 /* Constants are best loaded from the constant pool. */
34739 if (n_var == 0)
34740 {
34741 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34742 return;
34743 }
34744
34745 /* If all values are identical, broadcast the value. */
34746 if (all_same
34747 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34748 XVECEXP (vals, 0, 0)))
34749 return;
34750
34751 /* Values where only one field is non-constant are best loaded from
34752 the pool and overwritten via move later. */
34753 if (n_var == 1)
34754 {
34755 if (all_const_zero
34756 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34757 XVECEXP (vals, 0, one_var),
34758 one_var))
34759 return;
34760
34761 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34762 return;
34763 }
34764
34765 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34766 }
34767
34768 void
34769 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34770 {
34771 enum machine_mode mode = GET_MODE (target);
34772 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34773 enum machine_mode half_mode;
34774 bool use_vec_merge = false;
34775 rtx tmp;
34776 static rtx (*gen_extract[6][2]) (rtx, rtx)
34777 = {
34778 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34779 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34780 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34781 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34782 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34783 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34784 };
34785 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34786 = {
34787 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34788 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34789 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34790 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34791 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34792 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34793 };
34794 int i, j, n;
34795
34796 switch (mode)
34797 {
34798 case V2SFmode:
34799 case V2SImode:
34800 if (mmx_ok)
34801 {
34802 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34803 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34804 if (elt == 0)
34805 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34806 else
34807 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34808 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34809 return;
34810 }
34811 break;
34812
34813 case V2DImode:
34814 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34815 if (use_vec_merge)
34816 break;
34817
34818 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34819 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34820 if (elt == 0)
34821 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34822 else
34823 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34824 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34825 return;
34826
34827 case V2DFmode:
34828 {
34829 rtx op0, op1;
34830
34831 /* For the two element vectors, we implement a VEC_CONCAT with
34832 the extraction of the other element. */
34833
34834 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34835 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34836
34837 if (elt == 0)
34838 op0 = val, op1 = tmp;
34839 else
34840 op0 = tmp, op1 = val;
34841
34842 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34843 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34844 }
34845 return;
34846
34847 case V4SFmode:
34848 use_vec_merge = TARGET_SSE4_1;
34849 if (use_vec_merge)
34850 break;
34851
34852 switch (elt)
34853 {
34854 case 0:
34855 use_vec_merge = true;
34856 break;
34857
34858 case 1:
34859 /* tmp = target = A B C D */
34860 tmp = copy_to_reg (target);
34861 /* target = A A B B */
34862 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34863 /* target = X A B B */
34864 ix86_expand_vector_set (false, target, val, 0);
34865 /* target = A X C D */
34866 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34867 const1_rtx, const0_rtx,
34868 GEN_INT (2+4), GEN_INT (3+4)));
34869 return;
34870
34871 case 2:
34872 /* tmp = target = A B C D */
34873 tmp = copy_to_reg (target);
34874 /* tmp = X B C D */
34875 ix86_expand_vector_set (false, tmp, val, 0);
34876 /* target = A B X D */
34877 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34878 const0_rtx, const1_rtx,
34879 GEN_INT (0+4), GEN_INT (3+4)));
34880 return;
34881
34882 case 3:
34883 /* tmp = target = A B C D */
34884 tmp = copy_to_reg (target);
34885 /* tmp = X B C D */
34886 ix86_expand_vector_set (false, tmp, val, 0);
34887 /* target = A B C X */
34888 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34889 const0_rtx, const1_rtx,
34890 GEN_INT (2+4), GEN_INT (0+4)));
34891 return;
34892
34893 default:
34894 gcc_unreachable ();
34895 }
34896 break;
34897
34898 case V4SImode:
34899 use_vec_merge = TARGET_SSE4_1;
34900 if (use_vec_merge)
34901 break;
34902
34903 /* Element 0 handled by vec_merge below. */
34904 if (elt == 0)
34905 {
34906 use_vec_merge = true;
34907 break;
34908 }
34909
34910 if (TARGET_SSE2)
34911 {
34912 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34913 store into element 0, then shuffle them back. */
34914
34915 rtx order[4];
34916
34917 order[0] = GEN_INT (elt);
34918 order[1] = const1_rtx;
34919 order[2] = const2_rtx;
34920 order[3] = GEN_INT (3);
34921 order[elt] = const0_rtx;
34922
34923 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34924 order[1], order[2], order[3]));
34925
34926 ix86_expand_vector_set (false, target, val, 0);
34927
34928 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34929 order[1], order[2], order[3]));
34930 }
34931 else
34932 {
34933 /* For SSE1, we have to reuse the V4SF code. */
34934 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34935 gen_lowpart (SFmode, val), elt);
34936 }
34937 return;
34938
34939 case V8HImode:
34940 use_vec_merge = TARGET_SSE2;
34941 break;
34942 case V4HImode:
34943 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34944 break;
34945
34946 case V16QImode:
34947 use_vec_merge = TARGET_SSE4_1;
34948 break;
34949
34950 case V8QImode:
34951 break;
34952
34953 case V32QImode:
34954 half_mode = V16QImode;
34955 j = 0;
34956 n = 16;
34957 goto half;
34958
34959 case V16HImode:
34960 half_mode = V8HImode;
34961 j = 1;
34962 n = 8;
34963 goto half;
34964
34965 case V8SImode:
34966 half_mode = V4SImode;
34967 j = 2;
34968 n = 4;
34969 goto half;
34970
34971 case V4DImode:
34972 half_mode = V2DImode;
34973 j = 3;
34974 n = 2;
34975 goto half;
34976
34977 case V8SFmode:
34978 half_mode = V4SFmode;
34979 j = 4;
34980 n = 4;
34981 goto half;
34982
34983 case V4DFmode:
34984 half_mode = V2DFmode;
34985 j = 5;
34986 n = 2;
34987 goto half;
34988
34989 half:
34990 /* Compute offset. */
34991 i = elt / n;
34992 elt %= n;
34993
34994 gcc_assert (i <= 1);
34995
34996 /* Extract the half. */
34997 tmp = gen_reg_rtx (half_mode);
34998 emit_insn (gen_extract[j][i] (tmp, target));
34999
35000 /* Put val in tmp at elt. */
35001 ix86_expand_vector_set (false, tmp, val, elt);
35002
35003 /* Put it back. */
35004 emit_insn (gen_insert[j][i] (target, target, tmp));
35005 return;
35006
35007 default:
35008 break;
35009 }
35010
35011 if (use_vec_merge)
35012 {
35013 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
35014 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
35015 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35016 }
35017 else
35018 {
35019 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35020
35021 emit_move_insn (mem, target);
35022
35023 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35024 emit_move_insn (tmp, val);
35025
35026 emit_move_insn (target, mem);
35027 }
35028 }
35029
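/* Illustrative sketch (not part of the backend): the generic fallback at
   the end of ix86_expand_vector_set is just a store / scalar overwrite /
   reload round trip through a stack temporary.  In plain C terms:

     void set_elt_sketch (float v[4], float val, int elt)
     {
       float tmp[4];
       __builtin_memcpy (tmp, v, sizeof tmp);   // emit_move_insn (mem, target)
       tmp[elt] = val;                          // emit_move_insn (tmp, val)
       __builtin_memcpy (v, tmp, sizeof tmp);   // emit_move_insn (target, mem)
     }

   The function and parameter names here are made up for illustration.  */
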
35030 void
35031 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
35032 {
35033 enum machine_mode mode = GET_MODE (vec);
35034 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35035 bool use_vec_extr = false;
35036 rtx tmp;
35037
35038 switch (mode)
35039 {
35040 case V2SImode:
35041 case V2SFmode:
35042 if (!mmx_ok)
35043 break;
35044 /* FALLTHRU */
35045
35046 case V2DFmode:
35047 case V2DImode:
35048 use_vec_extr = true;
35049 break;
35050
35051 case V4SFmode:
35052 use_vec_extr = TARGET_SSE4_1;
35053 if (use_vec_extr)
35054 break;
35055
35056 switch (elt)
35057 {
35058 case 0:
35059 tmp = vec;
35060 break;
35061
35062 case 1:
35063 case 3:
35064 tmp = gen_reg_rtx (mode);
35065 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
35066 GEN_INT (elt), GEN_INT (elt),
35067 GEN_INT (elt+4), GEN_INT (elt+4)));
35068 break;
35069
35070 case 2:
35071 tmp = gen_reg_rtx (mode);
35072 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
35073 break;
35074
35075 default:
35076 gcc_unreachable ();
35077 }
35078 vec = tmp;
35079 use_vec_extr = true;
35080 elt = 0;
35081 break;
35082
35083 case V4SImode:
35084 use_vec_extr = TARGET_SSE4_1;
35085 if (use_vec_extr)
35086 break;
35087
35088 if (TARGET_SSE2)
35089 {
35090 switch (elt)
35091 {
35092 case 0:
35093 tmp = vec;
35094 break;
35095
35096 case 1:
35097 case 3:
35098 tmp = gen_reg_rtx (mode);
35099 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
35100 GEN_INT (elt), GEN_INT (elt),
35101 GEN_INT (elt), GEN_INT (elt)));
35102 break;
35103
35104 case 2:
35105 tmp = gen_reg_rtx (mode);
35106 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
35107 break;
35108
35109 default:
35110 gcc_unreachable ();
35111 }
35112 vec = tmp;
35113 use_vec_extr = true;
35114 elt = 0;
35115 }
35116 else
35117 {
35118 /* For SSE1, we have to reuse the V4SF code. */
35119 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
35120 gen_lowpart (V4SFmode, vec), elt);
35121 return;
35122 }
35123 break;
35124
35125 case V8HImode:
35126 use_vec_extr = TARGET_SSE2;
35127 break;
35128 case V4HImode:
35129 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35130 break;
35131
35132 case V16QImode:
35133 use_vec_extr = TARGET_SSE4_1;
35134 break;
35135
35136 case V8SFmode:
35137 if (TARGET_AVX)
35138 {
35139 tmp = gen_reg_rtx (V4SFmode);
35140 if (elt < 4)
35141 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
35142 else
35143 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
35144 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35145 return;
35146 }
35147 break;
35148
35149 case V4DFmode:
35150 if (TARGET_AVX)
35151 {
35152 tmp = gen_reg_rtx (V2DFmode);
35153 if (elt < 2)
35154 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
35155 else
35156 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
35157 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35158 return;
35159 }
35160 break;
35161
35162 case V32QImode:
35163 if (TARGET_AVX)
35164 {
35165 tmp = gen_reg_rtx (V16QImode);
35166 if (elt < 16)
35167 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
35168 else
35169 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
35170 ix86_expand_vector_extract (false, target, tmp, elt & 15);
35171 return;
35172 }
35173 break;
35174
35175 case V16HImode:
35176 if (TARGET_AVX)
35177 {
35178 tmp = gen_reg_rtx (V8HImode);
35179 if (elt < 8)
35180 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
35181 else
35182 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
35183 ix86_expand_vector_extract (false, target, tmp, elt & 7);
35184 return;
35185 }
35186 break;
35187
35188 case V8SImode:
35189 if (TARGET_AVX)
35190 {
35191 tmp = gen_reg_rtx (V4SImode);
35192 if (elt < 4)
35193 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
35194 else
35195 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
35196 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35197 return;
35198 }
35199 break;
35200
35201 case V4DImode:
35202 if (TARGET_AVX)
35203 {
35204 tmp = gen_reg_rtx (V2DImode);
35205 if (elt < 2)
35206 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
35207 else
35208 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
35209 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35210 return;
35211 }
35212 break;
35213
35214 case V8QImode:
35215 /* ??? Could extract the appropriate HImode element and shift. */
35216 default:
35217 break;
35218 }
35219
35220 if (use_vec_extr)
35221 {
35222 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
35223 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
35224
35225 /* Let the rtl optimizers know about the zero extension performed. */
35226 if (inner_mode == QImode || inner_mode == HImode)
35227 {
35228 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
35229 target = gen_lowpart (SImode, target);
35230 }
35231
35232 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35233 }
35234 else
35235 {
35236 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35237
35238 emit_move_insn (mem, vec);
35239
35240 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35241 emit_move_insn (target, tmp);
35242 }
35243 }
35244
35245 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
35246 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
35247 The upper bits of DEST are undefined, though they shouldn't cause
35248 exceptions (some bits from src or all zeros are ok). */
35249
35250 static void
35251 emit_reduc_half (rtx dest, rtx src, int i)
35252 {
35253 rtx tem;
35254 switch (GET_MODE (src))
35255 {
35256 case V4SFmode:
35257 if (i == 128)
35258 tem = gen_sse_movhlps (dest, src, src);
35259 else
35260 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
35261 GEN_INT (1 + 4), GEN_INT (1 + 4));
35262 break;
35263 case V2DFmode:
35264 tem = gen_vec_interleave_highv2df (dest, src, src);
35265 break;
35266 case V16QImode:
35267 case V8HImode:
35268 case V4SImode:
35269 case V2DImode:
35270 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
35271 gen_lowpart (V1TImode, src),
35272 GEN_INT (i / 2));
35273 break;
35274 case V8SFmode:
35275 if (i == 256)
35276 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
35277 else
35278 tem = gen_avx_shufps256 (dest, src, src,
35279 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
35280 break;
35281 case V4DFmode:
35282 if (i == 256)
35283 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
35284 else
35285 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
35286 break;
35287 case V32QImode:
35288 case V16HImode:
35289 case V8SImode:
35290 case V4DImode:
35291 if (i == 256)
35292 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
35293 gen_lowpart (V4DImode, src),
35294 gen_lowpart (V4DImode, src),
35295 const1_rtx);
35296 else
35297 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
35298 gen_lowpart (V2TImode, src),
35299 GEN_INT (i / 2));
35300 break;
35301 default:
35302 gcc_unreachable ();
35303 }
35304 emit_insn (tem);
35305 }
35306
35307 /* Expand a vector reduction. FN is the binary pattern to reduce;
35308 DEST is the destination; IN is the input vector. */
35309
35310 void
35311 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
35312 {
35313 rtx half, dst, vec = in;
35314 enum machine_mode mode = GET_MODE (in);
35315 int i;
35316
35317 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
35318 if (TARGET_SSE4_1
35319 && mode == V8HImode
35320 && fn == gen_uminv8hi3)
35321 {
35322 emit_insn (gen_sse4_1_phminposuw (dest, in));
35323 return;
35324 }
35325
35326 for (i = GET_MODE_BITSIZE (mode);
35327 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
35328 i >>= 1)
35329 {
35330 half = gen_reg_rtx (mode);
35331 emit_reduc_half (half, vec, i);
35332 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
35333 dst = dest;
35334 else
35335 dst = gen_reg_rtx (mode);
35336 emit_insn (fn (dst, half, vec));
35337 vec = dst;
35338 }
35339 }
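
/* Illustrative sketch (not part of the backend): ix86_expand_reduc is the
   usual log2(N) tree reduction -- each emit_reduc_half shifts the upper
   half of the partial vector down and FN combines it with the original.
   For a 4-element max it behaves like:

     int reduce_max4_sketch (const int v[4])
     {
       int a = v[0] > v[2] ? v[0] : v[2];   // combine halves: {0,1} with {2,3}
       int b = v[1] > v[3] ? v[1] : v[3];
       return a > b ? a : b;                // combine the remaining pair
     }

   The final combine writes into DEST, so the reduction value ends up in
   the low element of DEST.  */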
35340 \f
35341 /* Target hook for scalar_mode_supported_p. */
35342 static bool
35343 ix86_scalar_mode_supported_p (enum machine_mode mode)
35344 {
35345 if (DECIMAL_FLOAT_MODE_P (mode))
35346 return default_decimal_float_supported_p ();
35347 else if (mode == TFmode)
35348 return true;
35349 else
35350 return default_scalar_mode_supported_p (mode);
35351 }
35352
35353 /* Implements target hook vector_mode_supported_p. */
35354 static bool
35355 ix86_vector_mode_supported_p (enum machine_mode mode)
35356 {
35357 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35358 return true;
35359 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35360 return true;
35361 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35362 return true;
35363 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
35364 return true;
35365 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
35366 return true;
35367 return false;
35368 }
35369
35370 /* Target hook for c_mode_for_suffix. */
35371 static enum machine_mode
35372 ix86_c_mode_for_suffix (char suffix)
35373 {
35374 if (suffix == 'q')
35375 return TFmode;
35376 if (suffix == 'w')
35377 return XFmode;
35378
35379 return VOIDmode;
35380 }
35381
35382 /* Worker function for TARGET_MD_ASM_CLOBBERS.
35383
35384 We do this in the new i386 backend to maintain source compatibility
35385 with the old cc0-based compiler. */
35386
35387 static tree
35388 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
35389 tree inputs ATTRIBUTE_UNUSED,
35390 tree clobbers)
35391 {
35392 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
35393 clobbers);
35394 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
35395 clobbers);
35396 return clobbers;
35397 }
35398
35399 /* Implements the target hook targetm.encode_section_info. */
35400
35401 static void ATTRIBUTE_UNUSED
35402 ix86_encode_section_info (tree decl, rtx rtl, int first)
35403 {
35404 default_encode_section_info (decl, rtl, first);
35405
35406 if (TREE_CODE (decl) == VAR_DECL
35407 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
35408 && ix86_in_large_data_p (decl))
35409 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
35410 }
35411
35412 /* Worker function for REVERSE_CONDITION. */
35413
35414 enum rtx_code
35415 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
35416 {
35417 return (mode != CCFPmode && mode != CCFPUmode
35418 ? reverse_condition (code)
35419 : reverse_condition_maybe_unordered (code));
35420 }
35421
35422 /* Output code to perform an x87 FP register move, from OPERANDS[1]
35423 to OPERANDS[0]. */
35424
35425 const char *
35426 output_387_reg_move (rtx insn, rtx *operands)
35427 {
35428 if (REG_P (operands[0]))
35429 {
35430 if (REG_P (operands[1])
35431 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35432 {
35433 if (REGNO (operands[0]) == FIRST_STACK_REG)
35434 return output_387_ffreep (operands, 0);
35435 return "fstp\t%y0";
35436 }
35437 if (STACK_TOP_P (operands[0]))
35438 return "fld%Z1\t%y1";
35439 return "fst\t%y0";
35440 }
35441 else if (MEM_P (operands[0]))
35442 {
35443 gcc_assert (REG_P (operands[1]));
35444 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35445 return "fstp%Z0\t%y0";
35446 else
35447 {
35448 /* There is no non-popping store to memory for XFmode.
35449 So if we need one, follow the store with a load. */
35450 if (GET_MODE (operands[0]) == XFmode)
35451 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
35452 else
35453 return "fst%Z0\t%y0";
35454 }
35455 }
35456 else
35457 gcc_unreachable();
35458 }
35459
35460 /* Output code to perform a conditional jump to LABEL, if C2 flag in
35461 FP status register is set. */
35462
35463 void
35464 ix86_emit_fp_unordered_jump (rtx label)
35465 {
35466 rtx reg = gen_reg_rtx (HImode);
35467 rtx temp;
35468
35469 emit_insn (gen_x86_fnstsw_1 (reg));
35470
35471 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
35472 {
35473 emit_insn (gen_x86_sahf_1 (reg));
35474
35475 temp = gen_rtx_REG (CCmode, FLAGS_REG);
35476 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
35477 }
35478 else
35479 {
35480 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
35481
35482 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
35483 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
35484 }
35485
35486 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
35487 gen_rtx_LABEL_REF (VOIDmode, label),
35488 pc_rtx);
35489 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
35490
35491 emit_jump_insn (temp);
35492 predict_jump (REG_BR_PROB_BASE * 10 / 100);
35493 }
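
/* Illustrative note (not part of the backend): fnstsw copies the x87
   status word into %ax, so the condition codes C0/C1/C2/C3 (status word
   bits 8/9/10/14) land in bits 0/1/2/6 of %ah.  Testing %ah against 0x04
   therefore isolates C2, which the FP compare sets for unordered
   operands; with SAHF that same bit is loaded into PF, which is what the
   UNORDERED condition on FLAGS_REG tests.  */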
35494
35495 /* Output code to perform a log1p XFmode calculation. */
35496
35497 void ix86_emit_i387_log1p (rtx op0, rtx op1)
35498 {
35499 rtx label1 = gen_label_rtx ();
35500 rtx label2 = gen_label_rtx ();
35501
35502 rtx tmp = gen_reg_rtx (XFmode);
35503 rtx tmp2 = gen_reg_rtx (XFmode);
35504 rtx test;
35505
35506 emit_insn (gen_absxf2 (tmp, op1));
35507 test = gen_rtx_GE (VOIDmode, tmp,
35508 CONST_DOUBLE_FROM_REAL_VALUE (
35509 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
35510 XFmode));
35511 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
35512
35513 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35514 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
35515 emit_jump (label2);
35516
35517 emit_label (label1);
35518 emit_move_insn (tmp, CONST1_RTX (XFmode));
35519 emit_insn (gen_addxf3 (tmp, op1, tmp));
35520 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35521 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
35522
35523 emit_label (label2);
35524 }
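
/* Illustrative sketch (not part of the backend): fyl2xp1 computes
   y * log2 (x + 1) but is only usable for |x| < 1 - sqrt(2)/2, which is
   the 0.29289... threshold tested above.  With y = ln(2) (fldln2):

     long double log1p_sketch (long double x)
     {
       const long double ln2 = 0.6931471805599453094L;
       if (fabsl (x) < 0.29289321881345247561810596348408353L)
         return ln2 * log2p1 (x);          // fyl2xp1 path, accurate near 0
       else
         return ln2 * log2l (1.0L + x);    // fyl2x path on 1 + x
     }

   log2p1 stands in for the fyl2xp1 operation and is not meant as a real
   libm function name.  */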
35525
35526 /* Emit code for round calculation. */
35527 void ix86_emit_i387_round (rtx op0, rtx op1)
35528 {
35529 enum machine_mode inmode = GET_MODE (op1);
35530 enum machine_mode outmode = GET_MODE (op0);
35531 rtx e1, e2, res, tmp, tmp1, half;
35532 rtx scratch = gen_reg_rtx (HImode);
35533 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
35534 rtx jump_label = gen_label_rtx ();
35535 rtx insn;
35536 rtx (*gen_abs) (rtx, rtx);
35537 rtx (*gen_neg) (rtx, rtx);
35538
35539 switch (inmode)
35540 {
35541 case SFmode:
35542 gen_abs = gen_abssf2;
35543 break;
35544 case DFmode:
35545 gen_abs = gen_absdf2;
35546 break;
35547 case XFmode:
35548 gen_abs = gen_absxf2;
35549 break;
35550 default:
35551 gcc_unreachable ();
35552 }
35553
35554 switch (outmode)
35555 {
35556 case SFmode:
35557 gen_neg = gen_negsf2;
35558 break;
35559 case DFmode:
35560 gen_neg = gen_negdf2;
35561 break;
35562 case XFmode:
35563 gen_neg = gen_negxf2;
35564 break;
35565 case HImode:
35566 gen_neg = gen_neghi2;
35567 break;
35568 case SImode:
35569 gen_neg = gen_negsi2;
35570 break;
35571 case DImode:
35572 gen_neg = gen_negdi2;
35573 break;
35574 default:
35575 gcc_unreachable ();
35576 }
35577
35578 e1 = gen_reg_rtx (inmode);
35579 e2 = gen_reg_rtx (inmode);
35580 res = gen_reg_rtx (outmode);
35581
35582 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35583
35584 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
35585
35586 /* scratch = fxam(op1) */
35587 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35588 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35589 UNSPEC_FXAM)));
35590 /* e1 = fabs(op1) */
35591 emit_insn (gen_abs (e1, op1));
35592
35593 /* e2 = e1 + 0.5 */
35594 half = force_reg (inmode, half);
35595 emit_insn (gen_rtx_SET (VOIDmode, e2,
35596 gen_rtx_PLUS (inmode, e1, half)));
35597
35598 /* res = floor(e2) */
35599 if (inmode != XFmode)
35600 {
35601 tmp1 = gen_reg_rtx (XFmode);
35602
35603 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35604 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35605 }
35606 else
35607 tmp1 = e2;
35608
35609 switch (outmode)
35610 {
35611 case SFmode:
35612 case DFmode:
35613 {
35614 rtx tmp0 = gen_reg_rtx (XFmode);
35615
35616 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35617
35618 emit_insn (gen_rtx_SET (VOIDmode, res,
35619 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35620 UNSPEC_TRUNC_NOOP)));
35621 }
35622 break;
35623 case XFmode:
35624 emit_insn (gen_frndintxf2_floor (res, tmp1));
35625 break;
35626 case HImode:
35627 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35628 break;
35629 case SImode:
35630 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35631 break;
35632 case DImode:
35633 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35634 break;
35635 default:
35636 gcc_unreachable ();
35637 }
35638
35639 /* flags = signbit(a) */
35640 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35641
35642 /* if (flags) then res = -res */
35643 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35644 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35645 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35646 pc_rtx);
35647 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35648 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35649 JUMP_LABEL (insn) = jump_label;
35650
35651 emit_insn (gen_neg (res, res));
35652
35653 emit_label (jump_label);
35654 LABEL_NUSES (jump_label) = 1;
35655
35656 emit_move_insn (op0, res);
35657 }
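
/* Illustrative sketch (not part of the backend): the expansion above is
   the sign-magnitude identity round(a) = copysign (floor (fabs (a) + 0.5), a),
   with fxam supplying the sign bit and a conditional negate applying it:

     double round_sketch (double a)
     {
       double r = floor (fabs (a) + 0.5);
       return __builtin_signbit (a) ? -r : r;   // "if (flags) then res = -res"
     }
   */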
35658
35659 /* Output code to perform a Newton-Raphson approximation of a single precision
35660 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35661
35662 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35663 {
35664 rtx x0, x1, e0, e1;
35665
35666 x0 = gen_reg_rtx (mode);
35667 e0 = gen_reg_rtx (mode);
35668 e1 = gen_reg_rtx (mode);
35669 x1 = gen_reg_rtx (mode);
35670
35671 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
35672
35673 b = force_reg (mode, b);
35674
35675 /* x0 = rcp(b) estimate */
35676 emit_insn (gen_rtx_SET (VOIDmode, x0,
35677 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35678 UNSPEC_RCP)));
35679 /* e0 = x0 * b */
35680 emit_insn (gen_rtx_SET (VOIDmode, e0,
35681 gen_rtx_MULT (mode, x0, b)));
35682
35683 /* e0 = x0 * e0 */
35684 emit_insn (gen_rtx_SET (VOIDmode, e0,
35685 gen_rtx_MULT (mode, x0, e0)));
35686
35687 /* e1 = x0 + x0 */
35688 emit_insn (gen_rtx_SET (VOIDmode, e1,
35689 gen_rtx_PLUS (mode, x0, x0)));
35690
35691 /* x1 = e1 - e0 */
35692 emit_insn (gen_rtx_SET (VOIDmode, x1,
35693 gen_rtx_MINUS (mode, e1, e0)));
35694
35695 /* res = a * x1 */
35696 emit_insn (gen_rtx_SET (VOIDmode, res,
35697 gen_rtx_MULT (mode, a, x1)));
35698 }
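
/* Illustrative sketch (not part of the backend): one Newton-Raphson step
   for the reciprocal is x1 = x0 * (2 - b * x0), rewritten above to use
   only multiplies, adds and subtracts:

     float swdiv_sketch (float a, float b)
     {
       float x0 = rcp_estimate (b);     // UNSPEC_RCP: rcpss/rcpps estimate
       float e0 = x0 * b;
       e0 = x0 * e0;                    // b * x0 * x0
       float e1 = x0 + x0;              // 2 * x0
       float x1 = e1 - e0;              // x0 * (2 - b * x0)
       return a * x1;
     }

   rcp_estimate is a placeholder name; one refinement step roughly doubles
   the number of correct bits in the estimate.  */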
35699
35700 /* Output code to perform a Newton-Raphson approximation of a
35701 single precision floating point [reciprocal] square root. */
35702
35703 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35704 bool recip)
35705 {
35706 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35707 REAL_VALUE_TYPE r;
35708
35709 x0 = gen_reg_rtx (mode);
35710 e0 = gen_reg_rtx (mode);
35711 e1 = gen_reg_rtx (mode);
35712 e2 = gen_reg_rtx (mode);
35713 e3 = gen_reg_rtx (mode);
35714
35715 real_from_integer (&r, VOIDmode, -3, -1, 0);
35716 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35717
35718 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35719 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35720
35721 if (VECTOR_MODE_P (mode))
35722 {
35723 mthree = ix86_build_const_vector (mode, true, mthree);
35724 mhalf = ix86_build_const_vector (mode, true, mhalf);
35725 }
35726
35727 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35728 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
35729
35730 a = force_reg (mode, a);
35731
35732 /* x0 = rsqrt(a) estimate */
35733 emit_insn (gen_rtx_SET (VOIDmode, x0,
35734 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35735 UNSPEC_RSQRT)));
35736
35737 /* If a == 0.0, filter out the infinite rsqrt estimate to avoid NaN for sqrt (0.0). */
35738 if (!recip)
35739 {
35740 rtx zero, mask;
35741
35742 zero = gen_reg_rtx (mode);
35743 mask = gen_reg_rtx (mode);
35744
35745 zero = force_reg (mode, CONST0_RTX(mode));
35746 emit_insn (gen_rtx_SET (VOIDmode, mask,
35747 gen_rtx_NE (mode, zero, a)));
35748
35749 emit_insn (gen_rtx_SET (VOIDmode, x0,
35750 gen_rtx_AND (mode, x0, mask)));
35751 }
35752
35753 /* e0 = x0 * a */
35754 emit_insn (gen_rtx_SET (VOIDmode, e0,
35755 gen_rtx_MULT (mode, x0, a)));
35756 /* e1 = e0 * x0 */
35757 emit_insn (gen_rtx_SET (VOIDmode, e1,
35758 gen_rtx_MULT (mode, e0, x0)));
35759
35760 /* e2 = e1 - 3. */
35761 mthree = force_reg (mode, mthree);
35762 emit_insn (gen_rtx_SET (VOIDmode, e2,
35763 gen_rtx_PLUS (mode, e1, mthree)));
35764
35765 mhalf = force_reg (mode, mhalf);
35766 if (recip)
35767 /* e3 = -.5 * x0 */
35768 emit_insn (gen_rtx_SET (VOIDmode, e3,
35769 gen_rtx_MULT (mode, x0, mhalf)));
35770 else
35771 /* e3 = -.5 * e0 */
35772 emit_insn (gen_rtx_SET (VOIDmode, e3,
35773 gen_rtx_MULT (mode, e0, mhalf)));
35774 /* ret = e2 * e3 */
35775 emit_insn (gen_rtx_SET (VOIDmode, res,
35776 gen_rtx_MULT (mode, e2, e3)));
35777 }
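
/* Illustrative sketch (not part of the backend): one Newton-Raphson step
   for 1/sqrt(a) is x1 = -0.5 * x0 * (a * x0 * x0 - 3), matching the
   formula in the comment above:

     float swsqrt_sketch (float a, int recip)
     {
       float x0 = rsqrt_estimate (a);   // UNSPEC_RSQRT: rsqrtss/rsqrtps estimate
       float e0 = x0 * a;
       float e1 = e0 * x0;              // a * x0 * x0
       float e2 = e1 - 3.0f;
       float e3 = -0.5f * (recip ? x0 : e0);
       return e2 * e3;                  // rsqrt(a), or sqrt(a) = a * rsqrt(a)
     }

   rsqrt_estimate is a placeholder name; the real code additionally zeroes
   the estimate when a == 0.0 so sqrt (0.0) does not become 0 * inf.  */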
35778
35779 #ifdef TARGET_SOLARIS
35780 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35781
35782 static void
35783 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35784 tree decl)
35785 {
35786 /* With Binutils 2.15, the "@unwind" marker must be specified on
35787 every occurrence of the ".eh_frame" section, not just the first
35788 one. */
35789 if (TARGET_64BIT
35790 && strcmp (name, ".eh_frame") == 0)
35791 {
35792 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35793 flags & SECTION_WRITE ? "aw" : "a");
35794 return;
35795 }
35796
35797 #ifndef USE_GAS
35798 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35799 {
35800 solaris_elf_asm_comdat_section (name, flags, decl);
35801 return;
35802 }
35803 #endif
35804
35805 default_elf_asm_named_section (name, flags, decl);
35806 }
35807 #endif /* TARGET_SOLARIS */
35808
35809 /* Return the mangling of TYPE if it is an extended fundamental type. */
35810
35811 static const char *
35812 ix86_mangle_type (const_tree type)
35813 {
35814 type = TYPE_MAIN_VARIANT (type);
35815
35816 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35817 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35818 return NULL;
35819
35820 switch (TYPE_MODE (type))
35821 {
35822 case TFmode:
35823 /* __float128 is "g". */
35824 return "g";
35825 case XFmode:
35826 /* "long double" or __float80 is "e". */
35827 return "e";
35828 default:
35829 return NULL;
35830 }
35831 }
35832
35833 /* For 32-bit code we can save PIC register setup by using
35834 __stack_chk_fail_local hidden function instead of calling
35835 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
35836 register, so it is better to call __stack_chk_fail directly. */
35837
35838 static tree ATTRIBUTE_UNUSED
35839 ix86_stack_protect_fail (void)
35840 {
35841 return TARGET_64BIT
35842 ? default_external_stack_protect_fail ()
35843 : default_hidden_stack_protect_fail ();
35844 }
35845
35846 /* Select a format to encode pointers in exception handling data. CODE
35847 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35848 true if the symbol may be affected by dynamic relocations.
35849
35850 ??? All x86 object file formats are capable of representing this.
35851 After all, the relocation needed is the same as for the call insn.
35852 Whether or not a particular assembler allows us to enter such, I
35853 guess we'll have to see. */
35854 int
35855 asm_preferred_eh_data_format (int code, int global)
35856 {
35857 if (flag_pic)
35858 {
35859 int type = DW_EH_PE_sdata8;
35860 if (!TARGET_64BIT
35861 || ix86_cmodel == CM_SMALL_PIC
35862 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35863 type = DW_EH_PE_sdata4;
35864 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35865 }
35866 if (ix86_cmodel == CM_SMALL
35867 || (ix86_cmodel == CM_MEDIUM && code))
35868 return DW_EH_PE_udata4;
35869 return DW_EH_PE_absptr;
35870 }
35871 \f
35872 /* Expand copysign from SIGN to the positive value ABS_VALUE
35873 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
35874 the sign-bit. */
35875 static void
35876 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35877 {
35878 enum machine_mode mode = GET_MODE (sign);
35879 rtx sgn = gen_reg_rtx (mode);
35880 if (mask == NULL_RTX)
35881 {
35882 enum machine_mode vmode;
35883
35884 if (mode == SFmode)
35885 vmode = V4SFmode;
35886 else if (mode == DFmode)
35887 vmode = V2DFmode;
35888 else
35889 vmode = mode;
35890
35891 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35892 if (!VECTOR_MODE_P (mode))
35893 {
35894 /* We need to generate a scalar mode mask in this case. */
35895 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35896 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35897 mask = gen_reg_rtx (mode);
35898 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35899 }
35900 }
35901 else
35902 mask = gen_rtx_NOT (mode, mask);
35903 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35904 gen_rtx_AND (mode, mask, sign)));
35905 emit_insn (gen_rtx_SET (VOIDmode, result,
35906 gen_rtx_IOR (mode, abs_value, sgn)));
35907 }
35908
35909 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35910 mask for masking out the sign-bit is stored in *SMASK, if that is
35911 non-null. */
35912 static rtx
35913 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35914 {
35915 enum machine_mode vmode, mode = GET_MODE (op0);
35916 rtx xa, mask;
35917
35918 xa = gen_reg_rtx (mode);
35919 if (mode == SFmode)
35920 vmode = V4SFmode;
35921 else if (mode == DFmode)
35922 vmode = V2DFmode;
35923 else
35924 vmode = mode;
35925 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35926 if (!VECTOR_MODE_P (mode))
35927 {
35928 /* We need to generate a scalar mode mask in this case. */
35929 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35930 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35931 mask = gen_reg_rtx (mode);
35932 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35933 }
35934 emit_insn (gen_rtx_SET (VOIDmode, xa,
35935 gen_rtx_AND (mode, op0, mask)));
35936
35937 if (smask)
35938 *smask = mask;
35939
35940 return xa;
35941 }
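
/* Illustrative sketch (not part of the backend): both helpers above are
   plain sign-bit masking.  For a double, with SIGN = 0x8000000000000000:

     fabs (x)        ==  bits (x) & ~SIGN
     copysign (a, s) == (bits (a) & ~SIGN) | (bits (s) & SIGN)

   ix86_expand_sse_fabs builds the ~SIGN constant (returned through *SMASK
   when requested); ix86_sse_copysign_to_positive inverts it to pick the
   sign bit out of SIGN and ORs it into the already-positive value, so the
   first AND is not needed there.  */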
35942
35943 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35944 swapping the operands if SWAP_OPERANDS is true. The expanded
35945 code is a forward jump to a newly created label in case the
35946 comparison is true. The generated label rtx is returned. */
35947 static rtx
35948 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35949 bool swap_operands)
35950 {
35951 rtx label, tmp;
35952
35953 if (swap_operands)
35954 {
35955 tmp = op0;
35956 op0 = op1;
35957 op1 = tmp;
35958 }
35959
35960 label = gen_label_rtx ();
35961 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35962 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35963 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35964 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35965 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35966 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35967 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35968 JUMP_LABEL (tmp) = label;
35969
35970 return label;
35971 }
35972
35973 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35974 using comparison code CODE. Operands are swapped for the comparison if
35975 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35976 static rtx
35977 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35978 bool swap_operands)
35979 {
35980 rtx (*insn)(rtx, rtx, rtx, rtx);
35981 enum machine_mode mode = GET_MODE (op0);
35982 rtx mask = gen_reg_rtx (mode);
35983
35984 if (swap_operands)
35985 {
35986 rtx tmp = op0;
35987 op0 = op1;
35988 op1 = tmp;
35989 }
35990
35991 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35992
35993 emit_insn (insn (mask, op0, op1,
35994 gen_rtx_fmt_ee (code, mode, op0, op1)));
35995 return mask;
35996 }
35997
35998 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35999 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
36000 static rtx
36001 ix86_gen_TWO52 (enum machine_mode mode)
36002 {
36003 REAL_VALUE_TYPE TWO52r;
36004 rtx TWO52;
36005
36006 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
36007 TWO52 = const_double_from_real_value (TWO52r, mode);
36008 TWO52 = force_reg (mode, TWO52);
36009
36010 return TWO52;
36011 }
36012
36013 /* Expand SSE sequence for computing lround from OP1 storing
36014 into OP0. */
36015 void
36016 ix86_expand_lround (rtx op0, rtx op1)
36017 {
36018 /* C code for the stuff we're doing below:
36019 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
36020 return (long)tmp;
36021 */
36022 enum machine_mode mode = GET_MODE (op1);
36023 const struct real_format *fmt;
36024 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36025 rtx adj;
36026
36027 /* load nextafter (0.5, 0.0) */
36028 fmt = REAL_MODE_FORMAT (mode);
36029 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36030 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36031
36032 /* adj = copysign (0.5, op1) */
36033 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
36034 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
36035
36036 /* adj = op1 + adj */
36037 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
36038
36039 /* op0 = (imode)adj */
36040 expand_fix (op0, adj, 0);
36041 }
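
/* Illustrative sketch (not part of the backend): adding
   copysign (nextafter (0.5, 0.0), x) before the truncating conversion
   implements round-half-away-from-zero:

     long lround_sketch (double x)
     {
       double adj = copysign (nextafter (0.5, 0.0), x);
       return (long) (x + adj);
     }

   nextafter (0.5, 0.0) rather than plain 0.5 keeps values just below a
   half (e.g. the largest double < 0.5) from being rounded up to the next
   integer by the addition itself.  */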
36042
36043 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
36044 into OPERAND0. */
36045 void
36046 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
36047 {
36048 /* C code for the stuff we're doing below (for do_floor):
36049 xi = (long)op1;
36050 xi -= (double)xi > op1 ? 1 : 0;
36051 return xi;
36052 */
36053 enum machine_mode fmode = GET_MODE (op1);
36054 enum machine_mode imode = GET_MODE (op0);
36055 rtx ireg, freg, label, tmp;
36056
36057 /* reg = (long)op1 */
36058 ireg = gen_reg_rtx (imode);
36059 expand_fix (ireg, op1, 0);
36060
36061 /* freg = (double)reg */
36062 freg = gen_reg_rtx (fmode);
36063 expand_float (freg, ireg, 0);
36064
36065 /* ireg = (freg > op1) ? ireg - 1 : ireg */
36066 label = ix86_expand_sse_compare_and_jump (UNLE,
36067 freg, op1, !do_floor);
36068 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
36069 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
36070 emit_move_insn (ireg, tmp);
36071
36072 emit_label (label);
36073 LABEL_NUSES (label) = 1;
36074
36075 emit_move_insn (op0, ireg);
36076 }
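
/* Illustrative sketch (not part of the backend): truncate, then fix up
   when the truncation went the wrong way:

     long lfloor_sketch (double x)
     {
       long i = (long) x;              // truncates towards zero
       if ((double) i > x)             // only possible for negative x
         i -= 1;
       return i;
     }

   The lceil variant swaps the comparison operands and adds 1 instead.  */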
36077
36078 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
36079 result in OPERAND0. */
36080 void
36081 ix86_expand_rint (rtx operand0, rtx operand1)
36082 {
36083 /* C code for the stuff we're doing below:
36084 xa = fabs (operand1);
36085 if (!isless (xa, 2**52))
36086 return operand1;
36087 xa = xa + 2**52 - 2**52;
36088 return copysign (xa, operand1);
36089 */
36090 enum machine_mode mode = GET_MODE (operand0);
36091 rtx res, xa, label, TWO52, mask;
36092
36093 res = gen_reg_rtx (mode);
36094 emit_move_insn (res, operand1);
36095
36096 /* xa = abs (operand1) */
36097 xa = ix86_expand_sse_fabs (res, &mask);
36098
36099 /* if (!isless (xa, TWO52)) goto label; */
36100 TWO52 = ix86_gen_TWO52 (mode);
36101 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36102
36103 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36104 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36105
36106 ix86_sse_copysign_to_positive (res, xa, res, mask);
36107
36108 emit_label (label);
36109 LABEL_NUSES (label) = 1;
36110
36111 emit_move_insn (operand0, res);
36112 }
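
/* Illustrative sketch (not part of the backend): the "+ 2**52 - 2**52"
   trick works because doubles with magnitude >= 2**52 are already
   integers, so adding 2**52 to a smaller value forces the fraction bits
   to be rounded away and subtracting it back leaves the rounded integer:

     double rint_sketch (double x)      // assumes the default rounding mode
     {
       double xa = fabs (x);
       if (!(xa < 0x1p52))
         return x;                      // already integral, or NaN
       xa = (xa + 0x1p52) - 0x1p52;
       return copysign (xa, x);         // restore the sign, keeps -0.0
     }
   */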
36113
36114 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 into
36115 OPERAND0, without relying on DImode truncation (32-bit safe variant). */
36116 void
36117 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
36118 {
36119 /* C code for the stuff we expand below.
36120 double xa = fabs (x), x2;
36121 if (!isless (xa, TWO52))
36122 return x;
36123 xa = xa + TWO52 - TWO52;
36124 x2 = copysign (xa, x);
36125 Compensate. Floor:
36126 if (x2 > x)
36127 x2 -= 1;
36128 Compensate. Ceil:
36129 if (x2 < x)
36130 x2 -= -1;
36131 return x2;
36132 */
36133 enum machine_mode mode = GET_MODE (operand0);
36134 rtx xa, TWO52, tmp, label, one, res, mask;
36135
36136 TWO52 = ix86_gen_TWO52 (mode);
36137
36138 /* Temporary for holding the result, initialized to the input
36139 operand to ease control flow. */
36140 res = gen_reg_rtx (mode);
36141 emit_move_insn (res, operand1);
36142
36143 /* xa = abs (operand1) */
36144 xa = ix86_expand_sse_fabs (res, &mask);
36145
36146 /* if (!isless (xa, TWO52)) goto label; */
36147 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36148
36149 /* xa = xa + TWO52 - TWO52; */
36150 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36151 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36152
36153 /* xa = copysign (xa, operand1) */
36154 ix86_sse_copysign_to_positive (xa, xa, res, mask);
36155
36156 /* generate 1.0 or -1.0 */
36157 one = force_reg (mode,
36158 const_double_from_real_value (do_floor
36159 ? dconst1 : dconstm1, mode));
36160
36161 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36162 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36163 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36164 gen_rtx_AND (mode, one, tmp)));
36165 /* We always need to subtract here to preserve signed zero. */
36166 tmp = expand_simple_binop (mode, MINUS,
36167 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36168 emit_move_insn (res, tmp);
36169
36170 emit_label (label);
36171 LABEL_NUSES (label) = 1;
36172
36173 emit_move_insn (operand0, res);
36174 }
36175
36176 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36177 into OPERAND0. */
36178 void
36179 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
36180 {
36181 /* C code for the stuff we expand below.
36182 double xa = fabs (x), x2;
36183 if (!isless (xa, TWO52))
36184 return x;
36185 x2 = (double)(long)x;
36186 Compensate. Floor:
36187 if (x2 > x)
36188 x2 -= 1;
36189 Compensate. Ceil:
36190 if (x2 < x)
36191 x2 += 1;
36192 if (HONOR_SIGNED_ZEROS (mode))
36193 return copysign (x2, x);
36194 return x2;
36195 */
36196 enum machine_mode mode = GET_MODE (operand0);
36197 rtx xa, xi, TWO52, tmp, label, one, res, mask;
36198
36199 TWO52 = ix86_gen_TWO52 (mode);
36200
36201 /* Temporary for holding the result, initialized to the input
36202 operand to ease control flow. */
36203 res = gen_reg_rtx (mode);
36204 emit_move_insn (res, operand1);
36205
36206 /* xa = abs (operand1) */
36207 xa = ix86_expand_sse_fabs (res, &mask);
36208
36209 /* if (!isless (xa, TWO52)) goto label; */
36210 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36211
36212 /* xa = (double)(long)x */
36213 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36214 expand_fix (xi, res, 0);
36215 expand_float (xa, xi, 0);
36216
36217 /* generate 1.0 */
36218 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36219
36220 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36221 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36222 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36223 gen_rtx_AND (mode, one, tmp)));
36224 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
36225 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36226 emit_move_insn (res, tmp);
36227
36228 if (HONOR_SIGNED_ZEROS (mode))
36229 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36230
36231 emit_label (label);
36232 LABEL_NUSES (label) = 1;
36233
36234 emit_move_insn (operand0, res);
36235 }
36236
36237 /* Expand SSE sequence for computing round from OPERAND1 storing
36238 into OPERAND0. Sequence that works without relying on DImode truncation
36239 via cvttsd2siq that is only available on 64bit targets. */
36240 void
36241 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
36242 {
36243 /* C code for the stuff we expand below.
36244 double xa = fabs (x), xa2, x2;
36245 if (!isless (xa, TWO52))
36246 return x;
36247 Using the absolute value and copying back sign makes
36248 -0.0 -> -0.0 correct.
36249 xa2 = xa + TWO52 - TWO52;
36250 Compensate.
36251 dxa = xa2 - xa;
36252 if (dxa <= -0.5)
36253 xa2 += 1;
36254 else if (dxa > 0.5)
36255 xa2 -= 1;
36256 x2 = copysign (xa2, x);
36257 return x2;
36258 */
36259 enum machine_mode mode = GET_MODE (operand0);
36260 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
36261
36262 TWO52 = ix86_gen_TWO52 (mode);
36263
36264 /* Temporary for holding the result, initialized to the input
36265 operand to ease control flow. */
36266 res = gen_reg_rtx (mode);
36267 emit_move_insn (res, operand1);
36268
36269 /* xa = abs (operand1) */
36270 xa = ix86_expand_sse_fabs (res, &mask);
36271
36272 /* if (!isless (xa, TWO52)) goto label; */
36273 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36274
36275 /* xa2 = xa + TWO52 - TWO52; */
36276 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36277 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
36278
36279 /* dxa = xa2 - xa; */
36280 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
36281
36282 /* generate 0.5, 1.0 and -0.5 */
36283 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
36284 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
36285 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
36286 0, OPTAB_DIRECT);
36287
36288 /* Compensate. */
36289 tmp = gen_reg_rtx (mode);
36290 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
36291 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
36292 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36293 gen_rtx_AND (mode, one, tmp)));
36294 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36295 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
36296 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
36297 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36298 gen_rtx_AND (mode, one, tmp)));
36299 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36300
36301 /* res = copysign (xa2, operand1) */
36302 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
36303
36304 emit_label (label);
36305 LABEL_NUSES (label) = 1;
36306
36307 emit_move_insn (operand0, res);
36308 }
36309
36310 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36311 into OPERAND0. */
36312 void
36313 ix86_expand_trunc (rtx operand0, rtx operand1)
36314 {
36315 /* C code for SSE variant we expand below.
36316 double xa = fabs (x), x2;
36317 if (!isless (xa, TWO52))
36318 return x;
36319 x2 = (double)(long)x;
36320 if (HONOR_SIGNED_ZEROS (mode))
36321 return copysign (x2, x);
36322 return x2;
36323 */
36324 enum machine_mode mode = GET_MODE (operand0);
36325 rtx xa, xi, TWO52, label, res, mask;
36326
36327 TWO52 = ix86_gen_TWO52 (mode);
36328
36329 /* Temporary for holding the result, initialized to the input
36330 operand to ease control flow. */
36331 res = gen_reg_rtx (mode);
36332 emit_move_insn (res, operand1);
36333
36334 /* xa = abs (operand1) */
36335 xa = ix86_expand_sse_fabs (res, &mask);
36336
36337 /* if (!isless (xa, TWO52)) goto label; */
36338 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36339
36340 /* x = (double)(long)x */
36341 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36342 expand_fix (xi, res, 0);
36343 expand_float (res, xi, 0);
36344
36345 if (HONOR_SIGNED_ZEROS (mode))
36346 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36347
36348 emit_label (label);
36349 LABEL_NUSES (label) = 1;
36350
36351 emit_move_insn (operand0, res);
36352 }
36353
36354 /* Expand SSE sequence for computing trunc from OPERAND1 into OPERAND0,
36355 without relying on DImode truncation (32-bit safe variant). */
36356 void
36357 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
36358 {
36359 enum machine_mode mode = GET_MODE (operand0);
36360 rtx xa, mask, TWO52, label, one, res, smask, tmp;
36361
36362 /* C code for SSE variant we expand below.
36363 double xa = fabs (x), x2;
36364 if (!isless (xa, TWO52))
36365 return x;
36366 xa2 = xa + TWO52 - TWO52;
36367 Compensate:
36368 if (xa2 > xa)
36369 xa2 -= 1.0;
36370 x2 = copysign (xa2, x);
36371 return x2;
36372 */
36373
36374 TWO52 = ix86_gen_TWO52 (mode);
36375
36376 /* Temporary for holding the result, initialized to the input
36377 operand to ease control flow. */
36378 res = gen_reg_rtx (mode);
36379 emit_move_insn (res, operand1);
36380
36381 /* xa = abs (operand1) */
36382 xa = ix86_expand_sse_fabs (res, &smask);
36383
36384 /* if (!isless (xa, TWO52)) goto label; */
36385 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36386
36387 /* res = xa + TWO52 - TWO52; */
36388 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36389 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
36390 emit_move_insn (res, tmp);
36391
36392 /* generate 1.0 */
36393 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36394
36395 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
36396 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
36397 emit_insn (gen_rtx_SET (VOIDmode, mask,
36398 gen_rtx_AND (mode, mask, one)));
36399 tmp = expand_simple_binop (mode, MINUS,
36400 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
36401 emit_move_insn (res, tmp);
36402
36403 /* res = copysign (res, operand1) */
36404 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
36405
36406 emit_label (label);
36407 LABEL_NUSES (label) = 1;
36408
36409 emit_move_insn (operand0, res);
36410 }
36411
36412 /* Expand SSE sequence for computing round from OPERAND1 storing
36413 into OPERAND0. */
36414 void
36415 ix86_expand_round (rtx operand0, rtx operand1)
36416 {
36417 /* C code for the stuff we're doing below:
36418 double xa = fabs (x);
36419 if (!isless (xa, TWO52))
36420 return x;
36421 xa = (double)(long)(xa + nextafter (0.5, 0.0));
36422 return copysign (xa, x);
36423 */
36424 enum machine_mode mode = GET_MODE (operand0);
36425 rtx res, TWO52, xa, label, xi, half, mask;
36426 const struct real_format *fmt;
36427 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36428
36429 /* Temporary for holding the result, initialized to the input
36430 operand to ease control flow. */
36431 res = gen_reg_rtx (mode);
36432 emit_move_insn (res, operand1);
36433
36434 TWO52 = ix86_gen_TWO52 (mode);
36435 xa = ix86_expand_sse_fabs (res, &mask);
36436 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36437
36438 /* load nextafter (0.5, 0.0) */
36439 fmt = REAL_MODE_FORMAT (mode);
36440 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36441 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36442
36443 /* xa = xa + 0.5 */
36444 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
36445 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
36446
36447 /* xa = (double)(int64_t)xa */
36448 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36449 expand_fix (xi, xa, 0);
36450 expand_float (xa, xi, 0);
36451
36452 /* res = copysign (xa, operand1) */
36453 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
36454
36455 emit_label (label);
36456 LABEL_NUSES (label) = 1;
36457
36458 emit_move_insn (operand0, res);
36459 }
36460
36461 /* Expand SSE sequence for computing round
36462 from OP1 storing into OP0 using sse4 round insn. */
36463 void
36464 ix86_expand_round_sse4 (rtx op0, rtx op1)
36465 {
36466 enum machine_mode mode = GET_MODE (op0);
36467 rtx e1, e2, res, half;
36468 const struct real_format *fmt;
36469 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36470 rtx (*gen_copysign) (rtx, rtx, rtx);
36471 rtx (*gen_round) (rtx, rtx, rtx);
36472
36473 switch (mode)
36474 {
36475 case SFmode:
36476 gen_copysign = gen_copysignsf3;
36477 gen_round = gen_sse4_1_roundsf2;
36478 break;
36479 case DFmode:
36480 gen_copysign = gen_copysigndf3;
36481 gen_round = gen_sse4_1_rounddf2;
36482 break;
36483 default:
36484 gcc_unreachable ();
36485 }
36486
36487 /* round (a) = trunc (a + copysign (0.5, a)) */
36488
36489 /* load nextafter (0.5, 0.0) */
36490 fmt = REAL_MODE_FORMAT (mode);
36491 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36492 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36493 half = const_double_from_real_value (pred_half, mode);
36494
36495 /* e1 = copysign (0.5, op1) */
36496 e1 = gen_reg_rtx (mode);
36497 emit_insn (gen_copysign (e1, half, op1));
36498
36499 /* e2 = op1 + e1 */
36500 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
36501
36502 /* res = trunc (e2) */
36503 res = gen_reg_rtx (mode);
36504 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
36505
36506 emit_move_insn (op0, res);
36507 }
36508 \f
36509
36510 /* Table of valid machine attributes. */
36511 static const struct attribute_spec ix86_attribute_table[] =
36512 {
36513 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
36514 affects_type_identity } */
36515 /* Stdcall attribute says callee is responsible for popping arguments
36516 if they are not variable. */
36517 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36518 true },
36519 /* Fastcall attribute says callee is responsible for popping arguments
36520 if they are not variable. */
36521 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36522 true },
36523 /* Thiscall attribute says callee is responsible for popping arguments
36524 if they are not variable. */
36525 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36526 true },
36527 /* Cdecl attribute says the callee is a normal C declaration */
36528 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36529 true },
36530 /* Regparm attribute specifies how many integer arguments are to be
36531 passed in registers. */
36532 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
36533 true },
36534 /* Sseregparm attribute says we are using x86_64 calling conventions
36535 for FP arguments. */
36536 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36537 true },
36538 /* The transactional memory builtins are implicitly regparm or fastcall
36539 depending on the ABI. Override the generic do-nothing attribute that
36540 these builtins were declared with. */
36541 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
36542 true },
36543 /* force_align_arg_pointer says this function realigns the stack at entry. */
36544 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
36545 false, true, true, ix86_handle_cconv_attribute, false },
36546 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
36547 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
36548 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
36549 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
36550 false },
36551 #endif
36552 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36553 false },
36554 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36555 false },
36556 #ifdef SUBTARGET_ATTRIBUTE_TABLE
36557 SUBTARGET_ATTRIBUTE_TABLE,
36558 #endif
36559 /* ms_abi and sysv_abi calling convention function attributes. */
36560 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36561 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36562 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
36563 false },
36564 { "callee_pop_aggregate_return", 1, 1, false, true, true,
36565 ix86_handle_callee_pop_aggregate_return, true },
36566 /* End element. */
36567 { NULL, 0, 0, false, false, false, NULL, false }
36568 };
36569
36570 /* Implement targetm.vectorize.builtin_vectorization_cost. */
36571 static int
36572 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
36573 tree vectype,
36574 int misalign ATTRIBUTE_UNUSED)
36575 {
36576 unsigned elements;
36577
36578 switch (type_of_cost)
36579 {
36580 case scalar_stmt:
36581 return ix86_cost->scalar_stmt_cost;
36582
36583 case scalar_load:
36584 return ix86_cost->scalar_load_cost;
36585
36586 case scalar_store:
36587 return ix86_cost->scalar_store_cost;
36588
36589 case vector_stmt:
36590 return ix86_cost->vec_stmt_cost;
36591
36592 case vector_load:
36593 return ix86_cost->vec_align_load_cost;
36594
36595 case vector_store:
36596 return ix86_cost->vec_store_cost;
36597
36598 case vec_to_scalar:
36599 return ix86_cost->vec_to_scalar_cost;
36600
36601 case scalar_to_vec:
36602 return ix86_cost->scalar_to_vec_cost;
36603
36604 case unaligned_load:
36605 case unaligned_store:
36606 return ix86_cost->vec_unalign_load_cost;
36607
36608 case cond_branch_taken:
36609 return ix86_cost->cond_taken_branch_cost;
36610
36611 case cond_branch_not_taken:
36612 return ix86_cost->cond_not_taken_branch_cost;
36613
36614 case vec_perm:
36615 case vec_promote_demote:
36616 return ix86_cost->vec_stmt_cost;
36617
36618 case vec_construct:
36619 elements = TYPE_VECTOR_SUBPARTS (vectype);
36620 return elements / 2 + 1;
36621
36622 default:
36623 gcc_unreachable ();
36624 }
36625 }
36626
36627 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36628 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36629 insn every time. */
36630
36631 static GTY(()) rtx vselect_insn;
36632
36633 /* Initialize vselect_insn. */
36634
36635 static void
36636 init_vselect_insn (void)
36637 {
36638 unsigned i;
36639 rtx x;
36640
36641 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36642 for (i = 0; i < MAX_VECT_LEN; ++i)
36643 XVECEXP (x, 0, i) = const0_rtx;
36644 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36645 const0_rtx), x);
36646 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36647 start_sequence ();
36648 vselect_insn = emit_insn (x);
36649 end_sequence ();
36650 }
36651
36652 /* Construct (set target (vec_select op0 (parallel perm))) and
36653 return true if that's a valid instruction in the active ISA. */
36654
36655 static bool
36656 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36657 unsigned nelt, bool testing_p)
36658 {
36659 unsigned int i;
36660 rtx x, save_vconcat;
36661 int icode;
36662
36663 if (vselect_insn == NULL_RTX)
36664 init_vselect_insn ();
36665
36666 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36667 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36668 for (i = 0; i < nelt; ++i)
36669 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36670 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36671 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36672 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36673 SET_DEST (PATTERN (vselect_insn)) = target;
36674 icode = recog_memoized (vselect_insn);
36675
36676 if (icode >= 0 && !testing_p)
36677 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36678
36679 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36680 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36681 INSN_CODE (vselect_insn) = -1;
36682
36683 return icode >= 0;
36684 }
36685
36686 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36687
36688 static bool
36689 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36690 const unsigned char *perm, unsigned nelt,
36691 bool testing_p)
36692 {
36693 enum machine_mode v2mode;
36694 rtx x;
36695 bool ok;
36696
36697 if (vselect_insn == NULL_RTX)
36698 init_vselect_insn ();
36699
36700 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36701 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36702 PUT_MODE (x, v2mode);
36703 XEXP (x, 0) = op0;
36704 XEXP (x, 1) = op1;
36705 ok = expand_vselect (target, x, perm, nelt, testing_p);
36706 XEXP (x, 0) = const0_rtx;
36707 XEXP (x, 1) = const0_rtx;
36708 return ok;
36709 }
36710
36711 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36712 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36713
36714 static bool
36715 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36716 {
36717 enum machine_mode vmode = d->vmode;
36718 unsigned i, mask, nelt = d->nelt;
36719 rtx target, op0, op1, x;
36720 rtx rperm[32], vperm;
36721
36722 if (d->one_operand_p)
36723 return false;
36724 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36725 ;
36726 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36727 ;
36728 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36729 ;
36730 else
36731 return false;
36732
36733 /* This is a blend, not a permute. Elements must stay in their
36734 respective lanes. */
36735 for (i = 0; i < nelt; ++i)
36736 {
36737 unsigned e = d->perm[i];
36738 if (!(e == i || e == i + nelt))
36739 return false;
36740 }
36741
36742 if (d->testing_p)
36743 return true;
36744
36745 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36746 decision should be extracted elsewhere, so that we only try that
36747 sequence once all budget==3 options have been tried. */
36748 target = d->target;
36749 op0 = d->op0;
36750 op1 = d->op1;
36751 mask = 0;
36752
36753 switch (vmode)
36754 {
36755 case V4DFmode:
36756 case V8SFmode:
36757 case V2DFmode:
36758 case V4SFmode:
36759 case V8HImode:
36760 case V8SImode:
36761 for (i = 0; i < nelt; ++i)
36762 mask |= (d->perm[i] >= nelt) << i;
36763 break;
36764
36765 case V2DImode:
36766 for (i = 0; i < 2; ++i)
36767 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36768 vmode = V8HImode;
36769 goto do_subreg;
36770
36771 case V4SImode:
36772 for (i = 0; i < 4; ++i)
36773 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36774 vmode = V8HImode;
36775 goto do_subreg;
36776
36777 case V16QImode:
36778 /* See if bytes move in pairs so we can use pblendw with
36779 an immediate argument, rather than pblendvb with a vector
36780 argument. */
36781 for (i = 0; i < 16; i += 2)
36782 if (d->perm[i] + 1 != d->perm[i + 1])
36783 {
36784 use_pblendvb:
36785 for (i = 0; i < nelt; ++i)
36786 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36787
36788 finish_pblendvb:
36789 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36790 vperm = force_reg (vmode, vperm);
36791
36792 if (GET_MODE_SIZE (vmode) == 16)
36793 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36794 else
36795 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36796 return true;
36797 }
36798
36799 for (i = 0; i < 8; ++i)
36800 mask |= (d->perm[i * 2] >= 16) << i;
36801 vmode = V8HImode;
36802 /* FALLTHRU */
36803
36804 do_subreg:
36805 target = gen_lowpart (vmode, target);
36806 op0 = gen_lowpart (vmode, op0);
36807 op1 = gen_lowpart (vmode, op1);
36808 break;
36809
36810 case V32QImode:
36811 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36812 for (i = 0; i < 32; i += 2)
36813 if (d->perm[i] + 1 != d->perm[i + 1])
36814 goto use_pblendvb;
36815 /* See if bytes move in quadruplets. If yes, vpblendd
36816 with immediate can be used. */
36817 for (i = 0; i < 32; i += 4)
36818 if (d->perm[i] + 2 != d->perm[i + 2])
36819 break;
36820 if (i < 32)
36821 {
36822 /* See if bytes move the same in both lanes. If yes,
36823 vpblendw with immediate can be used. */
36824 for (i = 0; i < 16; i += 2)
36825 if (d->perm[i] + 16 != d->perm[i + 16])
36826 goto use_pblendvb;
36827
36828 /* Use vpblendw. */
36829 for (i = 0; i < 16; ++i)
36830 mask |= (d->perm[i * 2] >= 32) << i;
36831 vmode = V16HImode;
36832 goto do_subreg;
36833 }
36834
36835 /* Use vpblendd. */
36836 for (i = 0; i < 8; ++i)
36837 mask |= (d->perm[i * 4] >= 32) << i;
36838 vmode = V8SImode;
36839 goto do_subreg;
36840
36841 case V16HImode:
36842 /* See if words move in pairs. If yes, vpblendd can be used. */
36843 for (i = 0; i < 16; i += 2)
36844 if (d->perm[i] + 1 != d->perm[i + 1])
36845 break;
36846 if (i < 16)
36847 {
36848 /* See if words move the same in both lanes. If not,
36849 vpblendvb must be used. */
36850 for (i = 0; i < 8; i++)
36851 if (d->perm[i] + 8 != d->perm[i + 8])
36852 {
36853 /* Use vpblendvb. */
36854 for (i = 0; i < 32; ++i)
36855 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36856
36857 vmode = V32QImode;
36858 nelt = 32;
36859 target = gen_lowpart (vmode, target);
36860 op0 = gen_lowpart (vmode, op0);
36861 op1 = gen_lowpart (vmode, op1);
36862 goto finish_pblendvb;
36863 }
36864
36865 /* Use vpblendw. */
36866 for (i = 0; i < 16; ++i)
36867 mask |= (d->perm[i] >= 16) << i;
36868 break;
36869 }
36870
36871 /* Use vpblendd. */
36872 for (i = 0; i < 8; ++i)
36873 mask |= (d->perm[i * 2] >= 16) << i;
36874 vmode = V8SImode;
36875 goto do_subreg;
36876
36877 case V4DImode:
36878 /* Use vpblendd. */
36879 for (i = 0; i < 4; ++i)
36880 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36881 vmode = V8SImode;
36882 goto do_subreg;
36883
36884 default:
36885 gcc_unreachable ();
36886 }
36887
36888 /* This matches five different patterns with the different modes. */
36889 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36890 x = gen_rtx_SET (VOIDmode, target, x);
36891 emit_insn (x);
36892
36893 return true;
36894 }
36895
36896 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36897 in terms of the variable form of vpermilps.
36898
36899 Note that we will have already failed the immediate input vpermilps,
36900 which requires that the high and low part shuffle be identical; the
36901 variable form doesn't require that. */
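/* For example, { 3, 2, 1, 0, 4, 5, 6, 7 } reverses the low lane and leaves
   the high lane alone; the two lane halves differ, so the immediate form was
   rejected, but the loop below turns it into the variable control vector
   { 3, 2, 1, 0, 0, 1, 2, 3 }.  */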
36902
36903 static bool
36904 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36905 {
36906 rtx rperm[8], vperm;
36907 unsigned i;
36908
36909 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36910 return false;
36911
36912 /* We can only permute within the 128-bit lane. */
36913 for (i = 0; i < 8; ++i)
36914 {
36915 unsigned e = d->perm[i];
36916 if (i < 4 ? e >= 4 : e < 4)
36917 return false;
36918 }
36919
36920 if (d->testing_p)
36921 return true;
36922
36923 for (i = 0; i < 8; ++i)
36924 {
36925 unsigned e = d->perm[i];
36926
36927 /* Within each 128-bit lane, the elements of op0 are numbered
36928 from 0 and the elements of op1 are numbered from 4. */
36929 if (e >= 8 + 4)
36930 e -= 8;
36931 else if (e >= 4)
36932 e -= 4;
36933
36934 rperm[i] = GEN_INT (e);
36935 }
36936
36937 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36938 vperm = force_reg (V8SImode, vperm);
36939 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36940
36941 return true;
36942 }
36943
36944 /* Return true if permutation D can be performed as VMODE permutation
36945 instead. */
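/* For example, the V8SImode permutation { 2, 3, 0, 1, 6, 7, 4, 5 } moves
   adjacent pairs together and keeps them aligned, so it is also valid as the
   V4DImode permutation { 1, 0, 3, 2 }.  */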
36946
36947 static bool
36948 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36949 {
36950 unsigned int i, j, chunk;
36951
36952 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36953 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36954 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36955 return false;
36956
36957 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36958 return true;
36959
36960 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36961 for (i = 0; i < d->nelt; i += chunk)
36962 if (d->perm[i] & (chunk - 1))
36963 return false;
36964 else
36965 for (j = 1; j < chunk; ++j)
36966 if (d->perm[i] + j != d->perm[i + j])
36967 return false;
36968
36969 return true;
36970 }
36971
36972 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36973 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36974
36975 static bool
36976 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36977 {
36978 unsigned i, nelt, eltsz, mask;
36979 unsigned char perm[32];
36980 enum machine_mode vmode = V16QImode;
36981 rtx rperm[32], vperm, target, op0, op1;
36982
36983 nelt = d->nelt;
36984
36985 if (!d->one_operand_p)
36986 {
36987 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36988 {
36989 if (TARGET_AVX2
36990 && valid_perm_using_mode_p (V2TImode, d))
36991 {
36992 if (d->testing_p)
36993 return true;
36994
36995 /* Use vperm2i128 insn. The pattern uses
36996 V4DImode instead of V2TImode. */
36997 target = gen_lowpart (V4DImode, d->target);
36998 op0 = gen_lowpart (V4DImode, d->op0);
36999 op1 = gen_lowpart (V4DImode, d->op1);
37000 	      rperm[0]
37001 		= GEN_INT ((d->perm[0] / (nelt / 2))
37002 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
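	      /* For example, with nelt == 8 and perm { 12, 13, 14, 15, 0, 1,
		 2, 3 } the low half of the result is the high lane of op1
		 and the high half is the low lane of op0, giving the
		 immediate 3 | (0 << 4) == 0x03.  */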
37003 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
37004 return true;
37005 }
37006 return false;
37007 }
37008 }
37009 else
37010 {
37011 if (GET_MODE_SIZE (d->vmode) == 16)
37012 {
37013 if (!TARGET_SSSE3)
37014 return false;
37015 }
37016 else if (GET_MODE_SIZE (d->vmode) == 32)
37017 {
37018 if (!TARGET_AVX2)
37019 return false;
37020
37021 /* V4DImode should be already handled through
37022 expand_vselect by vpermq instruction. */
37023 gcc_assert (d->vmode != V4DImode);
37024
37025 vmode = V32QImode;
37026 if (d->vmode == V8SImode
37027 || d->vmode == V16HImode
37028 || d->vmode == V32QImode)
37029 {
37030 /* First see if vpermq can be used for
37031 V8SImode/V16HImode/V32QImode. */
37032 if (valid_perm_using_mode_p (V4DImode, d))
37033 {
37034 for (i = 0; i < 4; i++)
37035 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
37036 if (d->testing_p)
37037 return true;
37038 return expand_vselect (gen_lowpart (V4DImode, d->target),
37039 gen_lowpart (V4DImode, d->op0),
37040 perm, 4, false);
37041 }
37042
37043 /* Next see if vpermd can be used. */
37044 if (valid_perm_using_mode_p (V8SImode, d))
37045 vmode = V8SImode;
37046 }
37047 /* Or if vpermps can be used. */
37048 else if (d->vmode == V8SFmode)
37049 vmode = V8SImode;
37050
37051 if (vmode == V32QImode)
37052 {
37053 		  /* vpshufb only works within 128-bit lanes; it cannot
37054 		     shuffle bytes across lanes.  */
37055 for (i = 0; i < nelt; ++i)
37056 if ((d->perm[i] ^ i) & (nelt / 2))
37057 return false;
37058 }
37059 }
37060 else
37061 return false;
37062 }
37063
37064 if (d->testing_p)
37065 return true;
37066
37067 if (vmode == V8SImode)
37068 for (i = 0; i < 8; ++i)
37069 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
37070 else
37071 {
37072 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37073 if (!d->one_operand_p)
37074 mask = 2 * nelt - 1;
37075 else if (vmode == V16QImode)
37076 mask = nelt - 1;
37077 else
37078 mask = nelt / 2 - 1;
37079
37080 for (i = 0; i < nelt; ++i)
37081 {
37082 unsigned j, e = d->perm[i] & mask;
37083 for (j = 0; j < eltsz; ++j)
37084 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
37085 }
37086 }
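  /* For instance, a one-operand V8HImode shuffle whose first element is
     d->perm[0] == 5 produces control bytes 10 and 11 in positions 0 and 1:
     each element index is expanded into its run of byte indexes.  */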
37087
37088 vperm = gen_rtx_CONST_VECTOR (vmode,
37089 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
37090 vperm = force_reg (vmode, vperm);
37091
37092 target = gen_lowpart (vmode, d->target);
37093 op0 = gen_lowpart (vmode, d->op0);
37094 if (d->one_operand_p)
37095 {
37096 if (vmode == V16QImode)
37097 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
37098 else if (vmode == V32QImode)
37099 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
37100 else if (vmode == V8SFmode)
37101 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
37102 else
37103 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
37104 }
37105 else
37106 {
37107 op1 = gen_lowpart (vmode, d->op1);
37108 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
37109 }
37110
37111 return true;
37112 }
37113
37114 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
37115 in a single instruction. */
37116
37117 static bool
37118 expand_vec_perm_1 (struct expand_vec_perm_d *d)
37119 {
37120 unsigned i, nelt = d->nelt;
37121 unsigned char perm2[MAX_VECT_LEN];
37122
37123 /* Check plain VEC_SELECT first, because AVX has instructions that could
37124 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
37125 input where SEL+CONCAT may not. */
37126 if (d->one_operand_p)
37127 {
37128 int mask = nelt - 1;
37129 bool identity_perm = true;
37130 bool broadcast_perm = true;
37131
37132 for (i = 0; i < nelt; i++)
37133 {
37134 perm2[i] = d->perm[i] & mask;
37135 if (perm2[i] != i)
37136 identity_perm = false;
37137 if (perm2[i])
37138 broadcast_perm = false;
37139 }
37140
37141 if (identity_perm)
37142 {
37143 if (!d->testing_p)
37144 emit_move_insn (d->target, d->op0);
37145 return true;
37146 }
37147 else if (broadcast_perm && TARGET_AVX2)
37148 {
37149 /* Use vpbroadcast{b,w,d}. */
37150 rtx (*gen) (rtx, rtx) = NULL;
37151 switch (d->vmode)
37152 {
37153 case V32QImode:
37154 gen = gen_avx2_pbroadcastv32qi_1;
37155 break;
37156 case V16HImode:
37157 gen = gen_avx2_pbroadcastv16hi_1;
37158 break;
37159 case V8SImode:
37160 gen = gen_avx2_pbroadcastv8si_1;
37161 break;
37162 case V16QImode:
37163 gen = gen_avx2_pbroadcastv16qi;
37164 break;
37165 case V8HImode:
37166 gen = gen_avx2_pbroadcastv8hi;
37167 break;
37168 case V8SFmode:
37169 gen = gen_avx2_vec_dupv8sf_1;
37170 break;
37171 /* For other modes prefer other shuffles this function creates. */
37172 default: break;
37173 }
37174 if (gen != NULL)
37175 {
37176 if (!d->testing_p)
37177 emit_insn (gen (d->target, d->op0));
37178 return true;
37179 }
37180 }
37181
37182 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
37183 return true;
37184
37185 /* There are plenty of patterns in sse.md that are written for
37186 SEL+CONCAT and are not replicated for a single op. Perhaps
37187 that should be changed, to avoid the nastiness here. */
37188
37189 /* Recognize interleave style patterns, which means incrementing
37190 every other permutation operand. */
37191 for (i = 0; i < nelt; i += 2)
37192 {
37193 perm2[i] = d->perm[i] & mask;
37194 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
37195 }
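      /* E.g. the V4SImode duplicate-pairs shuffle { 0, 0, 1, 1 } becomes
	 perm2 == { 0, 4, 1, 5 }, i.e. an interleave-low of op0 with itself
	 (punpckldq).  */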
37196 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37197 d->testing_p))
37198 return true;
37199
37200 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
37201 if (nelt >= 4)
37202 {
37203 for (i = 0; i < nelt; i += 4)
37204 {
37205 perm2[i + 0] = d->perm[i + 0] & mask;
37206 perm2[i + 1] = d->perm[i + 1] & mask;
37207 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
37208 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
37209 }
37210
37211 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37212 d->testing_p))
37213 return true;
37214 }
37215 }
37216
37217 /* Finally, try the fully general two operand permute. */
37218 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
37219 d->testing_p))
37220 return true;
37221
37222 /* Recognize interleave style patterns with reversed operands. */
37223 if (!d->one_operand_p)
37224 {
37225 for (i = 0; i < nelt; ++i)
37226 {
37227 unsigned e = d->perm[i];
37228 if (e >= nelt)
37229 e -= nelt;
37230 else
37231 e += nelt;
37232 perm2[i] = e;
37233 }
37234
37235 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
37236 d->testing_p))
37237 return true;
37238 }
37239
37240 /* Try the SSE4.1 blend variable merge instructions. */
37241 if (expand_vec_perm_blend (d))
37242 return true;
37243
37244 /* Try one of the AVX vpermil variable permutations. */
37245 if (expand_vec_perm_vpermil (d))
37246 return true;
37247
37248 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
37249 vpshufb, vpermd, vpermps or vpermq variable permutation. */
37250 if (expand_vec_perm_pshufb (d))
37251 return true;
37252
37253 return false;
37254 }
37255
37256 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37257 in terms of a pair of pshuflw + pshufhw instructions. */
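/* For example, { 2, 1, 3, 0, 7, 4, 6, 5 } keeps the low four words in the
   low quadword and the high four in the high quadword, so it is emitted as
   pshuflw { 2, 1, 3, 0, 4, 5, 6, 7 } followed by pshufhw
   { 0, 1, 2, 3, 7, 4, 6, 5 }.  */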
37258
37259 static bool
37260 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
37261 {
37262 unsigned char perm2[MAX_VECT_LEN];
37263 unsigned i;
37264 bool ok;
37265
37266 if (d->vmode != V8HImode || !d->one_operand_p)
37267 return false;
37268
37269 /* The two permutations only operate in 64-bit lanes. */
37270 for (i = 0; i < 4; ++i)
37271 if (d->perm[i] >= 4)
37272 return false;
37273 for (i = 4; i < 8; ++i)
37274 if (d->perm[i] < 4)
37275 return false;
37276
37277 if (d->testing_p)
37278 return true;
37279
37280 /* Emit the pshuflw. */
37281 memcpy (perm2, d->perm, 4);
37282 for (i = 4; i < 8; ++i)
37283 perm2[i] = i;
37284 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
37285 gcc_assert (ok);
37286
37287 /* Emit the pshufhw. */
37288 memcpy (perm2 + 4, d->perm + 4, 4);
37289 for (i = 0; i < 4; ++i)
37290 perm2[i] = i;
37291 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
37292 gcc_assert (ok);
37293
37294 return true;
37295 }
37296
37297 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37298 the permutation using the SSSE3 palignr instruction. This succeeds
37299 when all of the elements in PERM fit within one vector and we merely
37300 need to shift them down so that a single vector permutation has a
37301 chance to succeed. */
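/* For example, the two-operand V16QImode permutation { 3, 4, ..., 18 }
   selects 16 consecutive bytes starting at offset 3 of the op1:op0
   concatenation; min == 3, the palignr does the shift, and the residual
   permutation collapses to the identity (the in_order case below).  */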
37302
37303 static bool
37304 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
37305 {
37306 unsigned i, nelt = d->nelt;
37307 unsigned min, max;
37308 bool in_order, ok;
37309 rtx shift;
37310
37311 /* Even with AVX, palignr only operates on 128-bit vectors. */
37312 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37313 return false;
37314
37315 min = nelt, max = 0;
37316 for (i = 0; i < nelt; ++i)
37317 {
37318 unsigned e = d->perm[i];
37319 if (e < min)
37320 min = e;
37321 if (e > max)
37322 max = e;
37323 }
37324 if (min == 0 || max - min >= nelt)
37325 return false;
37326
37327 /* Given that we have SSSE3, we know we'll be able to implement the
37328 single operand permutation after the palignr with pshufb. */
37329 if (d->testing_p)
37330 return true;
37331
37332 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
37333 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
37334 gen_lowpart (TImode, d->op1),
37335 gen_lowpart (TImode, d->op0), shift));
37336
37337 d->op0 = d->op1 = d->target;
37338 d->one_operand_p = true;
37339
37340 in_order = true;
37341 for (i = 0; i < nelt; ++i)
37342 {
37343 unsigned e = d->perm[i] - min;
37344 if (e != i)
37345 in_order = false;
37346 d->perm[i] = e;
37347 }
37348
37349 /* Test for the degenerate case where the alignment by itself
37350 produces the desired permutation. */
37351 if (in_order)
37352 return true;
37353
37354 ok = expand_vec_perm_1 (d);
37355 gcc_assert (ok);
37356
37357 return ok;
37358 }
37359
37360 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
37361
37362 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37363 a two vector permutation into a single vector permutation by using
37364 an interleave operation to merge the vectors. */
37365
37366 static bool
37367 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
37368 {
37369 struct expand_vec_perm_d dremap, dfinal;
37370 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
37371 unsigned HOST_WIDE_INT contents;
37372 unsigned char remap[2 * MAX_VECT_LEN];
37373 rtx seq;
37374 bool ok, same_halves = false;
37375
37376 if (GET_MODE_SIZE (d->vmode) == 16)
37377 {
37378 if (d->one_operand_p)
37379 return false;
37380 }
37381 else if (GET_MODE_SIZE (d->vmode) == 32)
37382 {
37383 if (!TARGET_AVX)
37384 return false;
37385 /* For 32-byte modes allow even d->one_operand_p.
37386 The lack of cross-lane shuffling in some instructions
37387 might prevent a single insn shuffle. */
37388 dfinal = *d;
37389 dfinal.testing_p = true;
37390 	      /* If expand_vec_perm_interleave3 can expand this into
37391 		 a 3 insn sequence, give up and let it be expanded as
37392 		 a 3 insn sequence.  While that is one insn longer,
37393 		 it doesn't need a memory operand, and in the common
37394 		 case where the interleave low and interleave high
37395 		 permutations of the same operands are adjacent, the
37396 		 two together need only 4 insns after CSE.  */
37397 if (expand_vec_perm_interleave3 (&dfinal))
37398 return false;
37399 }
37400 else
37401 return false;
37402
37403 /* Examine from whence the elements come. */
37404 contents = 0;
37405 for (i = 0; i < nelt; ++i)
37406 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
37407
37408 memset (remap, 0xff, sizeof (remap));
37409 dremap = *d;
37410
37411 if (GET_MODE_SIZE (d->vmode) == 16)
37412 {
37413 unsigned HOST_WIDE_INT h1, h2, h3, h4;
37414
37415 /* Split the two input vectors into 4 halves. */
37416 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
37417 h2 = h1 << nelt2;
37418 h3 = h2 << nelt2;
37419 h4 = h3 << nelt2;
37420
37421 	  /* If all elements come from the low halves, use interleave low;
37422 	     similarly use interleave high for the high halves.  If the elements
37423 	     come from mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
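	  /* For example, the V8HImode permutation { 0, 8, 1, 9, 3, 11, 2, 10 }
	     only references the low halves of both operands, so dremap becomes
	     the interleave-low { 0, 8, 1, 9, 2, 10, 3, 11 } and the remaining
	     dfinal shuffle is { 0, 1, 2, 3, 6, 7, 4, 5 }, a single pshufhw.  */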
37424 if ((contents & (h1 | h3)) == contents)
37425 {
37426 /* punpckl* */
37427 for (i = 0; i < nelt2; ++i)
37428 {
37429 remap[i] = i * 2;
37430 remap[i + nelt] = i * 2 + 1;
37431 dremap.perm[i * 2] = i;
37432 dremap.perm[i * 2 + 1] = i + nelt;
37433 }
37434 if (!TARGET_SSE2 && d->vmode == V4SImode)
37435 dremap.vmode = V4SFmode;
37436 }
37437 else if ((contents & (h2 | h4)) == contents)
37438 {
37439 /* punpckh* */
37440 for (i = 0; i < nelt2; ++i)
37441 {
37442 remap[i + nelt2] = i * 2;
37443 remap[i + nelt + nelt2] = i * 2 + 1;
37444 dremap.perm[i * 2] = i + nelt2;
37445 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
37446 }
37447 if (!TARGET_SSE2 && d->vmode == V4SImode)
37448 dremap.vmode = V4SFmode;
37449 }
37450 else if ((contents & (h1 | h4)) == contents)
37451 {
37452 /* shufps */
37453 for (i = 0; i < nelt2; ++i)
37454 {
37455 remap[i] = i;
37456 remap[i + nelt + nelt2] = i + nelt2;
37457 dremap.perm[i] = i;
37458 dremap.perm[i + nelt2] = i + nelt + nelt2;
37459 }
37460 if (nelt != 4)
37461 {
37462 /* shufpd */
37463 dremap.vmode = V2DImode;
37464 dremap.nelt = 2;
37465 dremap.perm[0] = 0;
37466 dremap.perm[1] = 3;
37467 }
37468 }
37469 else if ((contents & (h2 | h3)) == contents)
37470 {
37471 /* shufps */
37472 for (i = 0; i < nelt2; ++i)
37473 {
37474 remap[i + nelt2] = i;
37475 remap[i + nelt] = i + nelt2;
37476 dremap.perm[i] = i + nelt2;
37477 dremap.perm[i + nelt2] = i + nelt;
37478 }
37479 if (nelt != 4)
37480 {
37481 /* shufpd */
37482 dremap.vmode = V2DImode;
37483 dremap.nelt = 2;
37484 dremap.perm[0] = 1;
37485 dremap.perm[1] = 2;
37486 }
37487 }
37488 else
37489 return false;
37490 }
37491 else
37492 {
37493 unsigned int nelt4 = nelt / 4, nzcnt = 0;
37494 unsigned HOST_WIDE_INT q[8];
37495 unsigned int nonzero_halves[4];
37496
37497 /* Split the two input vectors into 8 quarters. */
37498 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
37499 for (i = 1; i < 8; ++i)
37500 q[i] = q[0] << (nelt4 * i);
37501 for (i = 0; i < 4; ++i)
37502 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
37503 {
37504 nonzero_halves[nzcnt] = i;
37505 ++nzcnt;
37506 }
37507
37508 if (nzcnt == 1)
37509 {
37510 gcc_assert (d->one_operand_p);
37511 nonzero_halves[1] = nonzero_halves[0];
37512 same_halves = true;
37513 }
37514 else if (d->one_operand_p)
37515 {
37516 gcc_assert (nonzero_halves[0] == 0);
37517 gcc_assert (nonzero_halves[1] == 1);
37518 }
37519
37520 if (nzcnt <= 2)
37521 {
37522 if (d->perm[0] / nelt2 == nonzero_halves[1])
37523 {
37524 /* Attempt to increase the likelihood that dfinal
37525 shuffle will be intra-lane. */
37526 char tmph = nonzero_halves[0];
37527 nonzero_halves[0] = nonzero_halves[1];
37528 nonzero_halves[1] = tmph;
37529 }
37530
37531 /* vperm2f128 or vperm2i128. */
37532 for (i = 0; i < nelt2; ++i)
37533 {
37534 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
37535 remap[i + nonzero_halves[0] * nelt2] = i;
37536 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
37537 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
37538 }
37539
37540 if (d->vmode != V8SFmode
37541 && d->vmode != V4DFmode
37542 && d->vmode != V8SImode)
37543 {
37544 dremap.vmode = V8SImode;
37545 dremap.nelt = 8;
37546 for (i = 0; i < 4; ++i)
37547 {
37548 dremap.perm[i] = i + nonzero_halves[0] * 4;
37549 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
37550 }
37551 }
37552 }
37553 else if (d->one_operand_p)
37554 return false;
37555 else if (TARGET_AVX2
37556 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
37557 {
37558 /* vpunpckl* */
37559 for (i = 0; i < nelt4; ++i)
37560 {
37561 remap[i] = i * 2;
37562 remap[i + nelt] = i * 2 + 1;
37563 remap[i + nelt2] = i * 2 + nelt2;
37564 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
37565 dremap.perm[i * 2] = i;
37566 dremap.perm[i * 2 + 1] = i + nelt;
37567 dremap.perm[i * 2 + nelt2] = i + nelt2;
37568 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
37569 }
37570 }
37571 else if (TARGET_AVX2
37572 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
37573 {
37574 /* vpunpckh* */
37575 for (i = 0; i < nelt4; ++i)
37576 {
37577 remap[i + nelt4] = i * 2;
37578 remap[i + nelt + nelt4] = i * 2 + 1;
37579 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37580 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37581 dremap.perm[i * 2] = i + nelt4;
37582 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37583 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37584 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37585 }
37586 }
37587 else
37588 return false;
37589 }
37590
37591 /* Use the remapping array set up above to move the elements from their
37592 swizzled locations into their final destinations. */
37593 dfinal = *d;
37594 for (i = 0; i < nelt; ++i)
37595 {
37596 unsigned e = remap[d->perm[i]];
37597 gcc_assert (e < nelt);
37598 /* If same_halves is true, both halves of the remapped vector are the
37599 same. Avoid cross-lane accesses if possible. */
37600 if (same_halves && i >= nelt2)
37601 {
37602 gcc_assert (e < nelt2);
37603 dfinal.perm[i] = e + nelt2;
37604 }
37605 else
37606 dfinal.perm[i] = e;
37607 }
37608 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37609 dfinal.op1 = dfinal.op0;
37610 dfinal.one_operand_p = true;
37611 dremap.target = dfinal.op0;
37612
37613 /* Test if the final remap can be done with a single insn. For V4SFmode or
37614 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37615 start_sequence ();
37616 ok = expand_vec_perm_1 (&dfinal);
37617 seq = get_insns ();
37618 end_sequence ();
37619
37620 if (!ok)
37621 return false;
37622
37623 if (d->testing_p)
37624 return true;
37625
37626 if (dremap.vmode != dfinal.vmode)
37627 {
37628 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37629 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37630 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37631 }
37632
37633 ok = expand_vec_perm_1 (&dremap);
37634 gcc_assert (ok);
37635
37636 emit_insn (seq);
37637 return true;
37638 }
37639
37640 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37641 a single vector cross-lane permutation into vpermq followed
37642 by any of the single insn permutations. */
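/* For example, if the low half of a V16HImode result reads only from
   quadwords 0 and 3 of the operand and the high half only from quadwords 1
   and 2, the vpermq below uses the permutation { 0, 3, 1, 2 } and dfinal
   then picks words within each gathered half.  */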
37643
37644 static bool
37645 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37646 {
37647 struct expand_vec_perm_d dremap, dfinal;
37648 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37649 unsigned contents[2];
37650 bool ok;
37651
37652 if (!(TARGET_AVX2
37653 && (d->vmode == V32QImode || d->vmode == V16HImode)
37654 && d->one_operand_p))
37655 return false;
37656
37657 contents[0] = 0;
37658 contents[1] = 0;
37659 for (i = 0; i < nelt2; ++i)
37660 {
37661 contents[0] |= 1u << (d->perm[i] / nelt4);
37662 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37663 }
37664
37665 for (i = 0; i < 2; ++i)
37666 {
37667 unsigned int cnt = 0;
37668 for (j = 0; j < 4; ++j)
37669 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37670 return false;
37671 }
37672
37673 if (d->testing_p)
37674 return true;
37675
37676 dremap = *d;
37677 dremap.vmode = V4DImode;
37678 dremap.nelt = 4;
37679 dremap.target = gen_reg_rtx (V4DImode);
37680 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37681 dremap.op1 = dremap.op0;
37682 dremap.one_operand_p = true;
37683 for (i = 0; i < 2; ++i)
37684 {
37685 unsigned int cnt = 0;
37686 for (j = 0; j < 4; ++j)
37687 if ((contents[i] & (1u << j)) != 0)
37688 dremap.perm[2 * i + cnt++] = j;
37689 for (; cnt < 2; ++cnt)
37690 dremap.perm[2 * i + cnt] = 0;
37691 }
37692
37693 dfinal = *d;
37694 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37695 dfinal.op1 = dfinal.op0;
37696 dfinal.one_operand_p = true;
37697 for (i = 0, j = 0; i < nelt; ++i)
37698 {
37699 if (i == nelt2)
37700 j = 2;
37701 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37702 if ((d->perm[i] / nelt4) == dremap.perm[j])
37703 ;
37704 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37705 dfinal.perm[i] |= nelt4;
37706 else
37707 gcc_unreachable ();
37708 }
37709
37710 ok = expand_vec_perm_1 (&dremap);
37711 gcc_assert (ok);
37712
37713 ok = expand_vec_perm_1 (&dfinal);
37714 gcc_assert (ok);
37715
37716 return true;
37717 }
37718
37719 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
37720 a vector permutation using two instructions, vperm2f128 resp.
37721 vperm2i128 followed by any single in-lane permutation. */
37722
37723 static bool
37724 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37725 {
37726 struct expand_vec_perm_d dfirst, dsecond;
37727 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37728 bool ok;
37729
37730 if (!TARGET_AVX
37731 || GET_MODE_SIZE (d->vmode) != 32
37732 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37733 return false;
37734
37735 dsecond = *d;
37736 dsecond.one_operand_p = false;
37737 dsecond.testing_p = true;
37738
37739 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37740 immediate. For perm < 16 the second permutation uses
37741 d->op0 as first operand, for perm >= 16 it uses d->op1
37742 as first operand. The second operand is the result of
37743 vperm2[fi]128. */
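	  /* E.g. perm == 6 encodes lane 2 for the low half and lane 1 for the
	     high half; ((6 << 2) | 6) & 0x33 == 0x12, which as a vperm2[fi]128
	     immediate selects the low lane of the second operand and the high
	     lane of the first.  */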
37744 for (perm = 0; perm < 32; perm++)
37745 {
37746 /* Ignore permutations which do not move anything cross-lane. */
37747 if (perm < 16)
37748 {
37749 /* The second shuffle for e.g. V4DFmode has
37750 0123 and ABCD operands.
37751 Ignore AB23, as 23 is already in the second lane
37752 of the first operand. */
37753 if ((perm & 0xc) == (1 << 2)) continue;
37754 /* And 01CD, as 01 is in the first lane of the first
37755 operand. */
37756 if ((perm & 3) == 0) continue;
37757 /* And 4567, as then the vperm2[fi]128 doesn't change
37758 anything on the original 4567 second operand. */
37759 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37760 }
37761 else
37762 {
37763 /* The second shuffle for e.g. V4DFmode has
37764 4567 and ABCD operands.
37765 Ignore AB67, as 67 is already in the second lane
37766 of the first operand. */
37767 if ((perm & 0xc) == (3 << 2)) continue;
37768 /* And 45CD, as 45 is in the first lane of the first
37769 operand. */
37770 if ((perm & 3) == 2) continue;
37771 /* And 0123, as then the vperm2[fi]128 doesn't change
37772 anything on the original 0123 first operand. */
37773 if ((perm & 0xf) == (1 << 2)) continue;
37774 }
37775
37776 for (i = 0; i < nelt; i++)
37777 {
37778 j = d->perm[i] / nelt2;
37779 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37780 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37781 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37782 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37783 else
37784 break;
37785 }
37786
37787 if (i == nelt)
37788 {
37789 start_sequence ();
37790 ok = expand_vec_perm_1 (&dsecond);
37791 end_sequence ();
37792 }
37793 else
37794 ok = false;
37795
37796 if (ok)
37797 {
37798 if (d->testing_p)
37799 return true;
37800
37801 /* Found a usable second shuffle. dfirst will be
37802 vperm2f128 on d->op0 and d->op1. */
37803 dsecond.testing_p = false;
37804 dfirst = *d;
37805 dfirst.target = gen_reg_rtx (d->vmode);
37806 for (i = 0; i < nelt; i++)
37807 dfirst.perm[i] = (i & (nelt2 - 1))
37808 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37809
37810 ok = expand_vec_perm_1 (&dfirst);
37811 gcc_assert (ok);
37812
37813 /* And dsecond is some single insn shuffle, taking
37814 d->op0 and result of vperm2f128 (if perm < 16) or
37815 d->op1 and result of vperm2f128 (otherwise). */
37816 dsecond.op1 = dfirst.target;
37817 if (perm >= 16)
37818 dsecond.op0 = dfirst.op1;
37819
37820 ok = expand_vec_perm_1 (&dsecond);
37821 gcc_assert (ok);
37822
37823 return true;
37824 }
37825
37826 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
37827 if (d->one_operand_p)
37828 return false;
37829 }
37830
37831 return false;
37832 }
37833
37834 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37835 a two vector permutation using 2 intra-lane interleave insns
37836 and cross-lane shuffle for 32-byte vectors. */
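/* For example, the V8SImode permutation { 4, 12, 5, 13, 6, 14, 7, 15 } has
   d->perm[0] == nelt / 2 and interleaves the high halves of the two
   operands, so the switch below emits it via gen_vec_interleave_highv8si.  */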
37837
37838 static bool
37839 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37840 {
37841 unsigned i, nelt;
37842 rtx (*gen) (rtx, rtx, rtx);
37843
37844 if (d->one_operand_p)
37845 return false;
37846 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37847 ;
37848 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37849 ;
37850 else
37851 return false;
37852
37853 nelt = d->nelt;
37854 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37855 return false;
37856 for (i = 0; i < nelt; i += 2)
37857 if (d->perm[i] != d->perm[0] + i / 2
37858 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37859 return false;
37860
37861 if (d->testing_p)
37862 return true;
37863
37864 switch (d->vmode)
37865 {
37866 case V32QImode:
37867 if (d->perm[0])
37868 gen = gen_vec_interleave_highv32qi;
37869 else
37870 gen = gen_vec_interleave_lowv32qi;
37871 break;
37872 case V16HImode:
37873 if (d->perm[0])
37874 gen = gen_vec_interleave_highv16hi;
37875 else
37876 gen = gen_vec_interleave_lowv16hi;
37877 break;
37878 case V8SImode:
37879 if (d->perm[0])
37880 gen = gen_vec_interleave_highv8si;
37881 else
37882 gen = gen_vec_interleave_lowv8si;
37883 break;
37884 case V4DImode:
37885 if (d->perm[0])
37886 gen = gen_vec_interleave_highv4di;
37887 else
37888 gen = gen_vec_interleave_lowv4di;
37889 break;
37890 case V8SFmode:
37891 if (d->perm[0])
37892 gen = gen_vec_interleave_highv8sf;
37893 else
37894 gen = gen_vec_interleave_lowv8sf;
37895 break;
37896 case V4DFmode:
37897 if (d->perm[0])
37898 gen = gen_vec_interleave_highv4df;
37899 else
37900 gen = gen_vec_interleave_lowv4df;
37901 break;
37902 default:
37903 gcc_unreachable ();
37904 }
37905
37906 emit_insn (gen (d->target, d->op0, d->op1));
37907 return true;
37908 }
37909
37910 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
37911 a single vector permutation using a single intra-lane vector
37912 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37913 the non-swapped and swapped vectors together. */
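/* For example, the one-operand V4DFmode permutation { 2, 1, 0, 3 } needs
   elements 0 and 2 from the opposite lane: dfirst stays the identity,
   dsecond is the lane swap { 2, 3, 0, 1 }, and vblendpd with mask 0b0101
   merges them.  */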
37914
37915 static bool
37916 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37917 {
37918 struct expand_vec_perm_d dfirst, dsecond;
37919 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37920 rtx seq;
37921 bool ok;
37922 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37923
37924 if (!TARGET_AVX
37925 || TARGET_AVX2
37926 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37927 || !d->one_operand_p)
37928 return false;
37929
37930 dfirst = *d;
37931 for (i = 0; i < nelt; i++)
37932 dfirst.perm[i] = 0xff;
37933 for (i = 0, msk = 0; i < nelt; i++)
37934 {
37935 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37936 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37937 return false;
37938 dfirst.perm[j] = d->perm[i];
37939 if (j != i)
37940 msk |= (1 << i);
37941 }
37942 for (i = 0; i < nelt; i++)
37943 if (dfirst.perm[i] == 0xff)
37944 dfirst.perm[i] = i;
37945
37946 if (!d->testing_p)
37947 dfirst.target = gen_reg_rtx (dfirst.vmode);
37948
37949 start_sequence ();
37950 ok = expand_vec_perm_1 (&dfirst);
37951 seq = get_insns ();
37952 end_sequence ();
37953
37954 if (!ok)
37955 return false;
37956
37957 if (d->testing_p)
37958 return true;
37959
37960 emit_insn (seq);
37961
37962 dsecond = *d;
37963 dsecond.op0 = dfirst.target;
37964 dsecond.op1 = dfirst.target;
37965 dsecond.one_operand_p = true;
37966 dsecond.target = gen_reg_rtx (dsecond.vmode);
37967 for (i = 0; i < nelt; i++)
37968 dsecond.perm[i] = i ^ nelt2;
37969
37970 ok = expand_vec_perm_1 (&dsecond);
37971 gcc_assert (ok);
37972
37973 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37974 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37975 return true;
37976 }
37977
37978 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
37979 permutation using two vperm2f128, followed by a vshufpd insn blending
37980 the two vectors together. */
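/* For example, for { 2, 5, 1, 6 } the first vperm2f128 builds { 2, 3, 0, 1 },
   the second reduces to op1 itself, and the final vshufpd perm { 0, 5, 3, 6 }
   picks one double from each 128-bit pair to recover { 2, 5, 1, 6 }.  */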
37981
37982 static bool
37983 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
37984 {
37985 struct expand_vec_perm_d dfirst, dsecond, dthird;
37986 bool ok;
37987
37988 if (!TARGET_AVX || (d->vmode != V4DFmode))
37989 return false;
37990
37991 if (d->testing_p)
37992 return true;
37993
37994 dfirst = *d;
37995 dsecond = *d;
37996 dthird = *d;
37997
37998 dfirst.perm[0] = (d->perm[0] & ~1);
37999 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
38000 dfirst.perm[2] = (d->perm[2] & ~1);
38001 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
38002 dsecond.perm[0] = (d->perm[1] & ~1);
38003 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
38004 dsecond.perm[2] = (d->perm[3] & ~1);
38005 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
38006 dthird.perm[0] = (d->perm[0] % 2);
38007 dthird.perm[1] = (d->perm[1] % 2) + 4;
38008 dthird.perm[2] = (d->perm[2] % 2) + 2;
38009 dthird.perm[3] = (d->perm[3] % 2) + 6;
38010
38011 dfirst.target = gen_reg_rtx (dfirst.vmode);
38012 dsecond.target = gen_reg_rtx (dsecond.vmode);
38013 dthird.op0 = dfirst.target;
38014 dthird.op1 = dsecond.target;
38015 dthird.one_operand_p = false;
38016
38017 canonicalize_perm (&dfirst);
38018 canonicalize_perm (&dsecond);
38019
38020 ok = expand_vec_perm_1 (&dfirst)
38021 && expand_vec_perm_1 (&dsecond)
38022 && expand_vec_perm_1 (&dthird);
38023
38024 gcc_assert (ok);
38025
38026 return true;
38027 }
38028
38029 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
38030 permutation with two pshufb insns and an ior. We should have already
38031 failed all two instruction sequences. */
38032
38033 static bool
38034 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
38035 {
38036 rtx rperm[2][16], vperm, l, h, op, m128;
38037 unsigned int i, nelt, eltsz;
38038
38039 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38040 return false;
38041 gcc_assert (!d->one_operand_p);
38042
38043 nelt = d->nelt;
38044 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38045
38046 /* Generate two permutation masks. If the required element is within
38047 the given vector it is shuffled into the proper lane. If the required
38048 element is in the other vector, force a zero into the lane by setting
38049 bit 7 in the permutation mask. */
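  /* For example, V8HImode extract-even ({ 0, 2, 4, 6, 8, 10, 12, 14 }) gives
     the op0 mask { 0, 1, 4, 5, 8, 9, 12, 13, -128 x 8 } and the op1 mask
     { -128 x 8, 0, 1, 4, 5, 8, 9, 12, 13 }; the final por combines the two
     pshufb results.  */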
38050 m128 = GEN_INT (-128);
38051 for (i = 0; i < nelt; ++i)
38052 {
38053 unsigned j, e = d->perm[i];
38054 unsigned which = (e >= nelt);
38055 if (e >= nelt)
38056 e -= nelt;
38057
38058 for (j = 0; j < eltsz; ++j)
38059 {
38060 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
38061 rperm[1-which][i*eltsz + j] = m128;
38062 }
38063 }
38064
38065 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
38066 vperm = force_reg (V16QImode, vperm);
38067
38068 l = gen_reg_rtx (V16QImode);
38069 op = gen_lowpart (V16QImode, d->op0);
38070 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
38071
38072 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
38073 vperm = force_reg (V16QImode, vperm);
38074
38075 h = gen_reg_rtx (V16QImode);
38076 op = gen_lowpart (V16QImode, d->op1);
38077 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
38078
38079 op = gen_lowpart (V16QImode, d->target);
38080 emit_insn (gen_iorv16qi3 (op, l, h));
38081
38082 return true;
38083 }
38084
38085 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
38086 with two vpshufb insns, vpermq and vpor. We should have already failed
38087 all two or three instruction sequences. */
38088
38089 static bool
38090 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
38091 {
38092 rtx rperm[2][32], vperm, l, h, hp, op, m128;
38093 unsigned int i, nelt, eltsz;
38094
38095 if (!TARGET_AVX2
38096 || !d->one_operand_p
38097 || (d->vmode != V32QImode && d->vmode != V16HImode))
38098 return false;
38099
38100 if (d->testing_p)
38101 return true;
38102
38103 nelt = d->nelt;
38104 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38105
38106 	  /* Generate two permutation masks.  If the required element is within
38107 	     the same lane, it is shuffled in.  If the required element is in the
38108 	     other lane, force a zero by setting bit 7 in the permutation mask.
38109 	     The other mask has a non-negative entry wherever an element is
38110 	     requested from the other lane, but the entry is also moved to the
38111 	     other lane, so that the result of vpshufb can have its two V2TImode
38112 	     halves swapped.  */
38113 m128 = GEN_INT (-128);
38114 for (i = 0; i < nelt; ++i)
38115 {
38116 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38117 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38118
38119 for (j = 0; j < eltsz; ++j)
38120 {
38121 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
38122 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
38123 }
38124 }
38125
38126 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38127 vperm = force_reg (V32QImode, vperm);
38128
38129 h = gen_reg_rtx (V32QImode);
38130 op = gen_lowpart (V32QImode, d->op0);
38131 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38132
38133 	  /* Swap the 128-bit lanes of h into hp.  */
38134 hp = gen_reg_rtx (V4DImode);
38135 op = gen_lowpart (V4DImode, h);
38136 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
38137 const1_rtx));
38138
38139 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38140 vperm = force_reg (V32QImode, vperm);
38141
38142 l = gen_reg_rtx (V32QImode);
38143 op = gen_lowpart (V32QImode, d->op0);
38144 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38145
38146 op = gen_lowpart (V32QImode, d->target);
38147 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
38148
38149 return true;
38150 }
38151
38152 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
38153    and extract-odd permutations of two V32QImode or V16HImode operands
38154 with two vpshufb insns, vpor and vpermq. We should have already
38155 failed all two or three instruction sequences. */
38156
38157 static bool
38158 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
38159 {
38160 rtx rperm[2][32], vperm, l, h, ior, op, m128;
38161 unsigned int i, nelt, eltsz;
38162
38163 if (!TARGET_AVX2
38164 || d->one_operand_p
38165 || (d->vmode != V32QImode && d->vmode != V16HImode))
38166 return false;
38167
38168 for (i = 0; i < d->nelt; ++i)
38169 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
38170 return false;
38171
38172 if (d->testing_p)
38173 return true;
38174
38175 nelt = d->nelt;
38176 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38177
38178 	  /* Generate two permutation masks.  In the first permutation mask
38179 	     the first quarter contains indexes for the first half of op0,
38180 	     the second quarter has bit 7 set, the third quarter contains
38181 	     indexes for the second half of op0, and the last quarter has
38182 	     bit 7 set.  In the second permutation mask the first quarter
38183 	     has bit 7 set, the second quarter contains indexes for the
38184 	     first half of op1, the third quarter has bit 7 set, and the
38185 	     last quarter contains indexes for the second half of op1.
38186 I.e. the first mask e.g. for V32QImode extract even will be:
38187 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
38188 (all values masked with 0xf except for -128) and second mask
38189 for extract even will be
38190 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
38191 m128 = GEN_INT (-128);
38192 for (i = 0; i < nelt; ++i)
38193 {
38194 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38195 unsigned which = d->perm[i] >= nelt;
38196 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
38197
38198 for (j = 0; j < eltsz; ++j)
38199 {
38200 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
38201 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
38202 }
38203 }
38204
38205 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38206 vperm = force_reg (V32QImode, vperm);
38207
38208 l = gen_reg_rtx (V32QImode);
38209 op = gen_lowpart (V32QImode, d->op0);
38210 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38211
38212 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38213 vperm = force_reg (V32QImode, vperm);
38214
38215 h = gen_reg_rtx (V32QImode);
38216 op = gen_lowpart (V32QImode, d->op1);
38217 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38218
38219 ior = gen_reg_rtx (V32QImode);
38220 emit_insn (gen_iorv32qi3 (ior, l, h));
38221
38222 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
38223 op = gen_lowpart (V4DImode, d->target);
38224 ior = gen_lowpart (V4DImode, ior);
38225 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
38226 const1_rtx, GEN_INT (3)));
38227
38228 return true;
38229 }
38230
38231 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
38232 and extract-odd permutations. */
38233
38234 static bool
38235 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
38236 {
38237 rtx t1, t2, t3;
38238
38239 switch (d->vmode)
38240 {
38241 case V4DFmode:
38242 t1 = gen_reg_rtx (V4DFmode);
38243 t2 = gen_reg_rtx (V4DFmode);
38244
38245 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38246 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
38247 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
38248
38249 /* Now an unpck[lh]pd will produce the result required. */
38250 if (odd)
38251 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
38252 else
38253 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
38254 emit_insn (t3);
38255 break;
38256
38257 case V8SFmode:
38258 {
38259 int mask = odd ? 0xdd : 0x88;
38260
38261 t1 = gen_reg_rtx (V8SFmode);
38262 t2 = gen_reg_rtx (V8SFmode);
38263 t3 = gen_reg_rtx (V8SFmode);
38264
38265 /* Shuffle within the 128-bit lanes to produce:
38266 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
38267 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
38268 GEN_INT (mask)));
38269
38270 /* Shuffle the lanes around to produce:
38271 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
38272 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
38273 GEN_INT (0x3)));
38274
38275 /* Shuffle within the 128-bit lanes to produce:
38276 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
38277 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
38278
38279 /* Shuffle within the 128-bit lanes to produce:
38280 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
38281 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
38282
38283 /* Shuffle the lanes around to produce:
38284 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
38285 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
38286 GEN_INT (0x20)));
38287 }
38288 break;
38289
38290 case V2DFmode:
38291 case V4SFmode:
38292 case V2DImode:
38293 case V4SImode:
38294 /* These are always directly implementable by expand_vec_perm_1. */
38295 gcc_unreachable ();
38296
38297 case V8HImode:
38298 if (TARGET_SSSE3)
38299 return expand_vec_perm_pshufb2 (d);
38300 else
38301 {
38302 /* We need 2*log2(N)-1 operations to achieve odd/even
38303 with interleave. */
38304 t1 = gen_reg_rtx (V8HImode);
38305 t2 = gen_reg_rtx (V8HImode);
38306 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
38307 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
38308 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
38309 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
38310 if (odd)
38311 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
38312 else
38313 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
38314 emit_insn (t3);
38315 }
38316 break;
38317
38318 case V16QImode:
38319 if (TARGET_SSSE3)
38320 return expand_vec_perm_pshufb2 (d);
38321 else
38322 {
38323 t1 = gen_reg_rtx (V16QImode);
38324 t2 = gen_reg_rtx (V16QImode);
38325 t3 = gen_reg_rtx (V16QImode);
38326 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
38327 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
38328 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
38329 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
38330 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
38331 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
38332 if (odd)
38333 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
38334 else
38335 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
38336 emit_insn (t3);
38337 }
38338 break;
38339
38340 case V16HImode:
38341 case V32QImode:
38342 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
38343
38344 case V4DImode:
38345 if (!TARGET_AVX2)
38346 {
38347 struct expand_vec_perm_d d_copy = *d;
38348 d_copy.vmode = V4DFmode;
38349 d_copy.target = gen_lowpart (V4DFmode, d->target);
38350 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
38351 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
38352 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38353 }
38354
38355 t1 = gen_reg_rtx (V4DImode);
38356 t2 = gen_reg_rtx (V4DImode);
38357
38358 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38359 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
38360 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
38361
38362 /* Now an vpunpck[lh]qdq will produce the result required. */
38363 if (odd)
38364 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
38365 else
38366 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
38367 emit_insn (t3);
38368 break;
38369
38370 case V8SImode:
38371 if (!TARGET_AVX2)
38372 {
38373 struct expand_vec_perm_d d_copy = *d;
38374 d_copy.vmode = V8SFmode;
38375 d_copy.target = gen_lowpart (V8SFmode, d->target);
38376 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
38377 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
38378 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38379 }
38380
38381 t1 = gen_reg_rtx (V8SImode);
38382 t2 = gen_reg_rtx (V8SImode);
38383
38384 /* Shuffle the lanes around into
38385 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
38386 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
38387 gen_lowpart (V4DImode, d->op0),
38388 gen_lowpart (V4DImode, d->op1),
38389 GEN_INT (0x20)));
38390 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
38391 gen_lowpart (V4DImode, d->op0),
38392 gen_lowpart (V4DImode, d->op1),
38393 GEN_INT (0x31)));
38394
38395 /* Swap the 2nd and 3rd position in each lane into
38396 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
38397 emit_insn (gen_avx2_pshufdv3 (t1, t1,
38398 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38399 emit_insn (gen_avx2_pshufdv3 (t2, t2,
38400 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38401
38402 /* Now an vpunpck[lh]qdq will produce
38403 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
38404 if (odd)
38405 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
38406 gen_lowpart (V4DImode, t1),
38407 gen_lowpart (V4DImode, t2));
38408 else
38409 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
38410 gen_lowpart (V4DImode, t1),
38411 gen_lowpart (V4DImode, t2));
38412 emit_insn (t3);
38413 break;
38414
38415 default:
38416 gcc_unreachable ();
38417 }
38418
38419 return true;
38420 }
38421
38422 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38423 extract-even and extract-odd permutations. */
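/* For example, { 0, 2, 4, 6, 8, 10, 12, 14 } on two V8HImode operands
   matches with odd == 0 and { 1, 3, 5, 7, 9, 11, 13, 15 } with odd == 1.  */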
38424
38425 static bool
38426 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
38427 {
38428 unsigned i, odd, nelt = d->nelt;
38429
38430 odd = d->perm[0];
38431 if (odd != 0 && odd != 1)
38432 return false;
38433
38434 for (i = 1; i < nelt; ++i)
38435 if (d->perm[i] != 2 * i + odd)
38436 return false;
38437
38438 return expand_vec_perm_even_odd_1 (d, odd);
38439 }
38440
38441 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
38442 permutations. We assume that expand_vec_perm_1 has already failed. */
38443
38444 static bool
38445 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
38446 {
38447 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
38448 enum machine_mode vmode = d->vmode;
38449 unsigned char perm2[4];
38450 rtx op0 = d->op0;
38451 bool ok;
38452
38453 switch (vmode)
38454 {
38455 case V4DFmode:
38456 case V8SFmode:
38457 /* These are special-cased in sse.md so that we can optionally
38458 use the vbroadcast instruction. They expand to two insns
38459 if the input happens to be in a register. */
38460 gcc_unreachable ();
38461
38462 case V2DFmode:
38463 case V2DImode:
38464 case V4SFmode:
38465 case V4SImode:
38466 /* These are always implementable using standard shuffle patterns. */
38467 gcc_unreachable ();
38468
38469 case V8HImode:
38470 case V16QImode:
38471 /* These can be implemented via interleave. We save one insn by
38472 stopping once we have promoted to V4SImode and then use pshufd. */
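	/* E.g. to broadcast element 5 of a V8HImode vector: one interleave-high
	   of op0 with itself places it in 32-bit element 1 of the V4SImode
	   view, and a pshufd with { 1, 1, 1, 1 } finishes the job.  */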
38473 do
38474 {
38475 rtx dest;
38476 rtx (*gen) (rtx, rtx, rtx)
38477 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
38478 : gen_vec_interleave_lowv8hi;
38479
38480 if (elt >= nelt2)
38481 {
38482 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
38483 : gen_vec_interleave_highv8hi;
38484 elt -= nelt2;
38485 }
38486 nelt2 /= 2;
38487
38488 dest = gen_reg_rtx (vmode);
38489 emit_insn (gen (dest, op0, op0));
38490 vmode = get_mode_wider_vector (vmode);
38491 op0 = gen_lowpart (vmode, dest);
38492 }
38493 while (vmode != V4SImode);
38494
38495 memset (perm2, elt, 4);
38496 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
38497 d->testing_p);
38498 gcc_assert (ok);
38499 return true;
38500
38501 case V32QImode:
38502 case V16HImode:
38503 case V8SImode:
38504 case V4DImode:
38505 /* For AVX2 broadcasts of the first element vpbroadcast* or
38506 vpermq should be used by expand_vec_perm_1. */
38507 gcc_assert (!TARGET_AVX2 || d->perm[0]);
38508 return false;
38509
38510 default:
38511 gcc_unreachable ();
38512 }
38513 }
38514
38515 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38516 broadcast permutations. */
38517
38518 static bool
38519 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
38520 {
38521 unsigned i, elt, nelt = d->nelt;
38522
38523 if (!d->one_operand_p)
38524 return false;
38525
38526 elt = d->perm[0];
38527 for (i = 1; i < nelt; ++i)
38528 if (d->perm[i] != elt)
38529 return false;
38530
38531 return expand_vec_perm_broadcast_1 (d);
38532 }
38533
38534 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
38535 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
38536 all the shorter instruction sequences. */
38537
38538 static bool
38539 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
38540 {
38541 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
38542 unsigned int i, nelt, eltsz;
38543 bool used[4];
38544
38545 if (!TARGET_AVX2
38546 || d->one_operand_p
38547 || (d->vmode != V32QImode && d->vmode != V16HImode))
38548 return false;
38549
38550 if (d->testing_p)
38551 return true;
38552
38553 nelt = d->nelt;
38554 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38555
38556 	  /* Generate 4 permutation masks.  If the required element is within
38557 	     the same lane, it is shuffled in.  If the required element is in the
38558 	     other lane, force a zero by setting bit 7 in the permutation mask.
38559 	     In the other pair of masks an entry is non-negative wherever an
38560 	     element is requested from the other lane, but the entry is also
38561 	     moved to the other lane, so that the result of vpshufb can have its
38562 	     two V2TImode halves swapped.  */
38563 m128 = GEN_INT (-128);
38564 for (i = 0; i < 32; ++i)
38565 {
38566 rperm[0][i] = m128;
38567 rperm[1][i] = m128;
38568 rperm[2][i] = m128;
38569 rperm[3][i] = m128;
38570 }
38571 used[0] = false;
38572 used[1] = false;
38573 used[2] = false;
38574 used[3] = false;
38575 for (i = 0; i < nelt; ++i)
38576 {
38577 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38578 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38579 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
38580
38581 for (j = 0; j < eltsz; ++j)
38582 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
38583 used[which] = true;
38584 }
38585
38586 for (i = 0; i < 2; ++i)
38587 {
38588 if (!used[2 * i + 1])
38589 {
38590 h[i] = NULL_RTX;
38591 continue;
38592 }
38593 vperm = gen_rtx_CONST_VECTOR (V32QImode,
38594 gen_rtvec_v (32, rperm[2 * i + 1]));
38595 vperm = force_reg (V32QImode, vperm);
38596 h[i] = gen_reg_rtx (V32QImode);
38597 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38598 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
38599 }
38600
38601 	  /* Swap the 128-bit lanes of h[X].  */
38602 for (i = 0; i < 2; ++i)
38603 {
38604 if (h[i] == NULL_RTX)
38605 continue;
38606 op = gen_reg_rtx (V4DImode);
38607 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
38608 const2_rtx, GEN_INT (3), const0_rtx,
38609 const1_rtx));
38610 h[i] = gen_lowpart (V32QImode, op);
38611 }
38612
38613 for (i = 0; i < 2; ++i)
38614 {
38615 if (!used[2 * i])
38616 {
38617 l[i] = NULL_RTX;
38618 continue;
38619 }
38620 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
38621 vperm = force_reg (V32QImode, vperm);
38622 l[i] = gen_reg_rtx (V32QImode);
38623 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38624 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
38625 }
38626
38627 for (i = 0; i < 2; ++i)
38628 {
38629 if (h[i] && l[i])
38630 {
38631 op = gen_reg_rtx (V32QImode);
38632 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38633 l[i] = op;
38634 }
38635 else if (h[i])
38636 l[i] = h[i];
38637 }
38638
38639 gcc_assert (l[0] && l[1]);
38640 op = gen_lowpart (V32QImode, d->target);
38641 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38642 return true;
38643 }
38644
38645 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38646 With all of the interface bits taken care of, perform the expansion
38647 in D and return true on success. */
38648
38649 static bool
38650 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38651 {
38652 /* Try a single instruction expansion. */
38653 if (expand_vec_perm_1 (d))
38654 return true;
38655
38656 /* Try sequences of two instructions. */
38657
38658 if (expand_vec_perm_pshuflw_pshufhw (d))
38659 return true;
38660
38661 if (expand_vec_perm_palignr (d))
38662 return true;
38663
38664 if (expand_vec_perm_interleave2 (d))
38665 return true;
38666
38667 if (expand_vec_perm_broadcast (d))
38668 return true;
38669
38670 if (expand_vec_perm_vpermq_perm_1 (d))
38671 return true;
38672
38673 if (expand_vec_perm_vperm2f128 (d))
38674 return true;
38675
38676 /* Try sequences of three instructions. */
38677
38678 if (expand_vec_perm_2vperm2f128_vshuf (d))
38679 return true;
38680
38681 if (expand_vec_perm_pshufb2 (d))
38682 return true;
38683
38684 if (expand_vec_perm_interleave3 (d))
38685 return true;
38686
38687 if (expand_vec_perm_vperm2f128_vblend (d))
38688 return true;
38689
38690 /* Try sequences of four instructions. */
38691
38692 if (expand_vec_perm_vpshufb2_vpermq (d))
38693 return true;
38694
38695 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38696 return true;
38697
38698 /* ??? Look for narrow permutations whose element orderings would
38699 allow the promotion to a wider mode. */
38700
38701 /* ??? Look for sequences of interleave or a wider permute that place
38702 the data into the correct lanes for a half-vector shuffle like
38703 pshuf[lh]w or vpermilps. */
38704
38705 /* ??? Look for sequences of interleave that produce the desired results.
38706 The combinatorics of punpck[lh] get pretty ugly... */
38707
38708 if (expand_vec_perm_even_odd (d))
38709 return true;
38710
38711 /* Even longer sequences. */
38712 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38713 return true;
38714
38715 return false;
38716 }
38717
38718 /* If a permutation only uses one operand, make it clear. Returns true
38719 if the permutation references both operands. */
38720
38721 static bool
38722 canonicalize_perm (struct expand_vec_perm_d *d)
38723 {
38724 int i, which, nelt = d->nelt;
38725
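  /* Bit 0 of WHICH is set if any element is taken from OP0, bit 1 if any
     element is taken from OP1.  */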
38726 for (i = which = 0; i < nelt; ++i)
38727 which |= (d->perm[i] < nelt ? 1 : 2);
38728
38729 d->one_operand_p = true;
38730 switch (which)
38731 {
38732 default:
38733 gcc_unreachable ();
38734
38735 case 3:
38736 if (!rtx_equal_p (d->op0, d->op1))
38737 {
38738 d->one_operand_p = false;
38739 break;
38740 }
38741 /* The elements of PERM do not suggest that only the first operand
38742 is used, but both operands are identical. Allow easier matching
38743 of the permutation by folding the permutation into the single
38744 input vector. */
38745 /* FALLTHRU */
38746
38747 case 2:
38748 for (i = 0; i < nelt; ++i)
38749 d->perm[i] &= nelt - 1;
38750 d->op0 = d->op1;
38751 break;
38752
38753 case 1:
38754 d->op1 = d->op0;
38755 break;
38756 }
38757
38758 return (which == 3);
38759 }
38760
38761 bool
38762 ix86_expand_vec_perm_const (rtx operands[4])
38763 {
38764 struct expand_vec_perm_d d;
38765 unsigned char perm[MAX_VECT_LEN];
38766 int i, nelt;
38767 bool two_args;
38768 rtx sel;
38769
38770 d.target = operands[0];
38771 d.op0 = operands[1];
38772 d.op1 = operands[2];
38773 sel = operands[3];
38774
38775 d.vmode = GET_MODE (d.target);
38776 gcc_assert (VECTOR_MODE_P (d.vmode));
38777 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38778 d.testing_p = false;
38779
38780 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38781 gcc_assert (XVECLEN (sel, 0) == nelt);
38782 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38783
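  /* Selector elements are reduced modulo 2*NELT; PERM keeps an unmodified
     copy for the possible retry without one_operand_p below.  */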
38784 for (i = 0; i < nelt; ++i)
38785 {
38786 rtx e = XVECEXP (sel, 0, i);
38787 int ei = INTVAL (e) & (2 * nelt - 1);
38788 d.perm[i] = ei;
38789 perm[i] = ei;
38790 }
38791
38792 two_args = canonicalize_perm (&d);
38793
38794 if (ix86_expand_vec_perm_const_1 (&d))
38795 return true;
38796
38797 /* If the selector says both arguments are needed, but the operands are the
38798 same, the above tried to expand with one_operand_p and flattened selector.
38799 If that didn't work, retry without one_operand_p; we succeeded with that
38800 during testing. */
38801 if (two_args && d.one_operand_p)
38802 {
38803 d.one_operand_p = false;
38804 memcpy (d.perm, perm, sizeof (perm));
38805 return ix86_expand_vec_perm_const_1 (&d);
38806 }
38807
38808 return false;
38809 }
38810
38811 /* Implement targetm.vectorize.vec_perm_const_ok. */
38812
38813 static bool
38814 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38815 const unsigned char *sel)
38816 {
38817 struct expand_vec_perm_d d;
38818 unsigned int i, nelt, which;
38819 bool ret;
38820
38821 d.vmode = vmode;
38822 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38823 d.testing_p = true;
38824
38825 /* Given sufficient ISA support we can just return true here
38826 for selected vector modes. */
38827 if (GET_MODE_SIZE (d.vmode) == 16)
38828 {
38829 /* All implementable with a single vpperm insn. */
38830 if (TARGET_XOP)
38831 return true;
38832 /* All implementable with 2 pshufb + 1 ior. */
38833 if (TARGET_SSSE3)
38834 return true;
38835 /* All implementable with shufpd or unpck[lh]pd. */
38836 if (d.nelt == 2)
38837 return true;
38838 }
38839
38840 /* Extract the values from the vector CST into the permutation
38841 array in D. */
38842 memcpy (d.perm, sel, nelt);
38843 for (i = which = 0; i < nelt; ++i)
38844 {
38845 unsigned char e = d.perm[i];
38846 gcc_assert (e < 2 * nelt);
38847 which |= (e < nelt ? 1 : 2);
38848 }
38849
38850 /* For all elements from the second vector, fold the elements to the first.  */
38851 if (which == 2)
38852 for (i = 0; i < nelt; ++i)
38853 d.perm[i] -= nelt;
38854
38855 /* Check whether the mask can be applied to the vector type. */
38856 d.one_operand_p = (which != 3);
38857
38858 /* Implementable with shufps or pshufd. */
38859 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38860 return true;
38861
38862 /* Otherwise we have to go through the motions and see if we can
38863 figure out how to generate the requested permutation. */
38864 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38865 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38866 if (!d.one_operand_p)
38867 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38868
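  /* Expand into a throwaway sequence; only the return value of the
     expander matters here.  */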
38869 start_sequence ();
38870 ret = ix86_expand_vec_perm_const_1 (&d);
38871 end_sequence ();
38872
38873 return ret;
38874 }
38875
38876 void
38877 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38878 {
38879 struct expand_vec_perm_d d;
38880 unsigned i, nelt;
38881
38882 d.target = targ;
38883 d.op0 = op0;
38884 d.op1 = op1;
38885 d.vmode = GET_MODE (targ);
38886 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38887 d.one_operand_p = false;
38888 d.testing_p = false;
38889
38890 for (i = 0; i < nelt; ++i)
38891 d.perm[i] = i * 2 + odd;
38892
38893 /* We'll either be able to implement the permutation directly... */
38894 if (expand_vec_perm_1 (&d))
38895 return;
38896
38897 /* ... or we use the special-case patterns. */
38898 expand_vec_perm_even_odd_1 (&d, odd);
38899 }
38900
38901 static void
38902 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
38903 {
38904 struct expand_vec_perm_d d;
38905 unsigned i, nelt, base;
38906 bool ok;
38907
38908 d.target = targ;
38909 d.op0 = op0;
38910 d.op1 = op1;
38911 d.vmode = GET_MODE (targ);
38912 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38913 d.one_operand_p = false;
38914 d.testing_p = false;
38915
38916 base = high_p ? nelt / 2 : 0;
38917 for (i = 0; i < nelt / 2; ++i)
38918 {
38919 d.perm[i * 2] = i + base;
38920 d.perm[i * 2 + 1] = i + base + nelt;
38921 }
38922
38923 /* Note that for AVX this isn't one instruction. */
38924 ok = ix86_expand_vec_perm_const_1 (&d);
38925 gcc_assert (ok);
38926 }
38927
38928
38929 /* Expand a vector operation CODE for a V*QImode in terms of the
38930 same operation on V*HImode. */
38931
38932 void
38933 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
38934 {
38935 enum machine_mode qimode = GET_MODE (dest);
38936 enum machine_mode himode;
38937 rtx (*gen_il) (rtx, rtx, rtx);
38938 rtx (*gen_ih) (rtx, rtx, rtx);
38939 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
38940 struct expand_vec_perm_d d;
38941 bool ok, full_interleave;
38942 bool uns_p = false;
38943 int i;
38944
38945 switch (qimode)
38946 {
38947 case V16QImode:
38948 himode = V8HImode;
38949 gen_il = gen_vec_interleave_lowv16qi;
38950 gen_ih = gen_vec_interleave_highv16qi;
38951 break;
38952 case V32QImode:
38953 himode = V16HImode;
38954 gen_il = gen_avx2_interleave_lowv32qi;
38955 gen_ih = gen_avx2_interleave_highv32qi;
38956 break;
38957 default:
38958 gcc_unreachable ();
38959 }
38960
38961 op2_l = op2_h = op2;
38962 switch (code)
38963 {
38964 case MULT:
38965 /* Unpack data such that we've got a source byte in each low byte of
38966 each word. We don't care what goes into the high byte of each word.
38967 Rather than trying to get zero in there, most convenient is to let
38968 it be a copy of the low byte. */
38969 op2_l = gen_reg_rtx (qimode);
38970 op2_h = gen_reg_rtx (qimode);
38971 emit_insn (gen_il (op2_l, op2, op2));
38972 emit_insn (gen_ih (op2_h, op2, op2));
38974
38975 op1_l = gen_reg_rtx (qimode);
38976 op1_h = gen_reg_rtx (qimode);
38977 emit_insn (gen_il (op1_l, op1, op1));
38978 emit_insn (gen_ih (op1_h, op1, op1));
38979 full_interleave = qimode == V16QImode;
38980 break;
38981
38982 case ASHIFT:
38983 case LSHIFTRT:
38984 uns_p = true;
38985 /* FALLTHRU */
38986 case ASHIFTRT:
38987 op1_l = gen_reg_rtx (himode);
38988 op1_h = gen_reg_rtx (himode);
38989 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
38990 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
38991 full_interleave = true;
38992 break;
38993 default:
38994 gcc_unreachable ();
38995 }
38996
38997 /* Perform the operation. */
38998 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
38999 1, OPTAB_DIRECT);
39000 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
39001 1, OPTAB_DIRECT);
39002 gcc_assert (res_l && res_h);
39003
39004 /* Merge the data back into the right place. */
39005 d.target = dest;
39006 d.op0 = gen_lowpart (qimode, res_l);
39007 d.op1 = gen_lowpart (qimode, res_h);
39008 d.vmode = qimode;
39009 d.nelt = GET_MODE_NUNITS (qimode);
39010 d.one_operand_p = false;
39011 d.testing_p = false;
39012
39013 if (full_interleave)
39014 {
39015 /* For SSE2, we used a full interleave, so the desired
39016 results are in the even elements. */
39017 for (i = 0; i < 32; ++i)
39018 d.perm[i] = i * 2;
39019 }
39020 else
39021 {
39022 /* For AVX, the interleave used above was not cross-lane.  So the
39023 extraction is of the even elements, but with the second and third
39024 quarters swapped.  Happily, that is even one insn shorter than even extraction.  */
39025 for (i = 0; i < 32; ++i)
39026 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
39027 }
39028
39029 ok = ix86_expand_vec_perm_const_1 (&d);
39030 gcc_assert (ok);
39031
39032 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39033 gen_rtx_fmt_ee (code, qimode, op1, op2));
39034 }
39035
39036 void
39037 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
39038 bool uns_p, bool odd_p)
39039 {
39040 enum machine_mode mode = GET_MODE (op1);
39041 enum machine_mode wmode = GET_MODE (dest);
39042 rtx x;
39043
39044 /* We only play even/odd games with vectors of SImode. */
39045 gcc_assert (mode == V4SImode || mode == V8SImode);
39046
39047 /* If we're looking for the odd results, shift those members down to
39048 the even slots.  For some CPUs this is faster than a PSHUFD.  */
39049 if (odd_p)
39050 {
39051 if (TARGET_XOP && mode == V4SImode)
39052 {
39053 x = force_reg (wmode, CONST0_RTX (wmode));
39054 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
39055 return;
39056 }
39057
39058 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
39059 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
39060 x, NULL, 1, OPTAB_DIRECT);
39061 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
39062 x, NULL, 1, OPTAB_DIRECT);
39063 op1 = gen_lowpart (mode, op1);
39064 op2 = gen_lowpart (mode, op2);
39065 }
39066
39067 if (mode == V8SImode)
39068 {
39069 if (uns_p)
39070 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
39071 else
39072 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
39073 }
39074 else if (uns_p)
39075 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
39076 else if (TARGET_SSE4_1)
39077 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
39078 else
39079 {
39080 rtx s1, s2, t0, t1, t2;
39081
39082 /* The easiest way to implement this without PMULDQ is to go through
39083 the motions as if we were performing a full 64-bit multiply, except
39084 that we need to do less shuffling of the elements.  */
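      /* Treating S1/S2 as the high 32 bits of the sign-extended operands,
	 the schoolbook lo*lo + ((lo*hi + hi*lo) << 32) expansion below
	 yields the signed even products.  */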
39085
39086 /* Compute the sign-extension, aka highparts, of the two operands. */
39087 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39088 op1, pc_rtx, pc_rtx);
39089 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39090 op2, pc_rtx, pc_rtx);
39091
39092 /* Multiply LO(A) * HI(B), and vice-versa. */
39093 t1 = gen_reg_rtx (wmode);
39094 t2 = gen_reg_rtx (wmode);
39095 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
39096 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
39097
39098 /* Multiply LO(A) * LO(B). */
39099 t0 = gen_reg_rtx (wmode);
39100 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
39101
39102 /* Combine and shift the highparts into place. */
39103 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
39104 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
39105 1, OPTAB_DIRECT);
39106
39107 /* Combine high and low parts. */
39108 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
39109 return;
39110 }
39111 emit_insn (x);
39112 }
39113
39114 void
39115 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
39116 bool uns_p, bool high_p)
39117 {
39118 enum machine_mode wmode = GET_MODE (dest);
39119 enum machine_mode mode = GET_MODE (op1);
39120 rtx t1, t2, t3, t4, mask;
39121
39122 switch (mode)
39123 {
39124 case V4SImode:
39125 t1 = gen_reg_rtx (mode);
39126 t2 = gen_reg_rtx (mode);
39127 if (TARGET_XOP && !uns_p)
39128 {
39129 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
39130 shuffle the elements once so that all elements are in the right
39131 place for immediate use: { A C B D }. */
39132 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
39133 const1_rtx, GEN_INT (3)));
39134 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
39135 const1_rtx, GEN_INT (3)));
39136 }
39137 else
39138 {
39139 /* Put the elements into place for the multiply. */
39140 ix86_expand_vec_interleave (t1, op1, op1, high_p);
39141 ix86_expand_vec_interleave (t2, op2, op2, high_p);
39142 high_p = false;
39143 }
39144 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
39145 break;
39146
39147 case V8SImode:
39148 /* Shuffle the elements between the lanes. After this we
39149 have { A B E F | C D G H } for each operand. */
39150 t1 = gen_reg_rtx (V4DImode);
39151 t2 = gen_reg_rtx (V4DImode);
39152 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
39153 const0_rtx, const2_rtx,
39154 const1_rtx, GEN_INT (3)));
39155 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
39156 const0_rtx, const2_rtx,
39157 const1_rtx, GEN_INT (3)));
39158
39159 /* Shuffle the elements within the lanes. After this we
39160 have { A A B B | C C D D } or { E E F F | G G H H }. */
39161 t3 = gen_reg_rtx (V8SImode);
39162 t4 = gen_reg_rtx (V8SImode);
39163 mask = GEN_INT (high_p
39164 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
39165 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
39166 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
39167 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
39168
39169 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
39170 break;
39171
39172 case V8HImode:
39173 case V16HImode:
39174 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
39175 uns_p, OPTAB_DIRECT);
39176 t2 = expand_binop (mode,
39177 uns_p ? umul_highpart_optab : smul_highpart_optab,
39178 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
39179 gcc_assert (t1 && t2);
39180
39181 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
39182 break;
39183
39184 case V16QImode:
39185 case V32QImode:
39186 t1 = gen_reg_rtx (wmode);
39187 t2 = gen_reg_rtx (wmode);
39188 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
39189 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
39190
39191 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
39192 break;
39193
39194 default:
39195 gcc_unreachable ();
39196 }
39197 }
39198
39199 void
39200 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
39201 {
39202 rtx res_1, res_2;
39203
39204 res_1 = gen_reg_rtx (V4SImode);
39205 res_2 = gen_reg_rtx (V4SImode);
39206 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
39207 op1, op2, true, false);
39208 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
39209 op1, op2, true, true);
39210
39211 /* Move the results in element 2 down to element 1; we don't care
39212 what goes in elements 2 and 3. Then we can merge the parts
39213 back together with an interleave.
39214
39215 Note that two other sequences were tried:
39216 (1) Use interleaves at the start instead of psrldq, which allows
39217 us to use a single shufps to merge things back at the end.
39218 (2) Use shufps here to combine the two vectors, then pshufd to
39219 put the elements in the correct order.
39220 In both cases the cost of the reformatting stall was too high
39221 and the overall sequence slower. */
39222
39223 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
39224 const0_rtx, const0_rtx));
39225 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
39226 const0_rtx, const0_rtx));
39227 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
39228
39229 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
39230 }
39231
39232 void
39233 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
39234 {
39235 enum machine_mode mode = GET_MODE (op0);
39236 rtx t1, t2, t3, t4, t5, t6;
39237
39238 if (TARGET_XOP && mode == V2DImode)
39239 {
39240 /* op1: A,B,C,D, op2: E,F,G,H */
39241 op1 = gen_lowpart (V4SImode, op1);
39242 op2 = gen_lowpart (V4SImode, op2);
39243
39244 t1 = gen_reg_rtx (V4SImode);
39245 t2 = gen_reg_rtx (V4SImode);
39246 t3 = gen_reg_rtx (V2DImode);
39247 t4 = gen_reg_rtx (V2DImode);
39248
39249 /* t1: B,A,D,C */
39250 emit_insn (gen_sse2_pshufd_1 (t1, op1,
39251 GEN_INT (1),
39252 GEN_INT (0),
39253 GEN_INT (3),
39254 GEN_INT (2)));
39255
39256 /* t2: (B*E),(A*F),(D*G),(C*H) */
39257 emit_insn (gen_mulv4si3 (t2, t1, op2));
39258
39259 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
39260 emit_insn (gen_xop_phadddq (t3, t2));
39261
39262 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
39263 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
39264
39265 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
39266 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
39267 }
39268 else
39269 {
39270 enum machine_mode nmode;
39271 rtx (*umul) (rtx, rtx, rtx);
39272
39273 if (mode == V2DImode)
39274 {
39275 umul = gen_vec_widen_umult_even_v4si;
39276 nmode = V4SImode;
39277 }
39278 else if (mode == V4DImode)
39279 {
39280 umul = gen_vec_widen_umult_even_v8si;
39281 nmode = V8SImode;
39282 }
39283 else
39284 gcc_unreachable ();
39285
39286
39287 /* Multiply low parts. */
39288 t1 = gen_reg_rtx (mode);
39289 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
39290
39291 /* Shift input vectors right 32 bits so we can multiply high parts. */
39292 t6 = GEN_INT (32);
39293 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
39294 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
39295
39296 /* Multiply high parts by low parts. */
39297 t4 = gen_reg_rtx (mode);
39298 t5 = gen_reg_rtx (mode);
39299 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
39300 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
39301
39302 /* Combine and shift the highparts back. */
39303 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
39304 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
39305
39306 /* Combine high and low parts. */
39307 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
39308 }
39309
39310 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39311 gen_rtx_MULT (mode, op1, op2));
39312 }
39313
39314 /* Expand an insert into a vector register through pinsr insn.
39315 Return true if successful. */
39316
39317 bool
39318 ix86_expand_pinsr (rtx *operands)
39319 {
39320 rtx dst = operands[0];
39321 rtx src = operands[3];
39322
39323 unsigned int size = INTVAL (operands[1]);
39324 unsigned int pos = INTVAL (operands[2]);
39325
39326 if (GET_CODE (dst) == SUBREG)
39327 {
39328 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
39329 dst = SUBREG_REG (dst);
39330 }
39331
39332 if (GET_CODE (src) == SUBREG)
39333 src = SUBREG_REG (src);
39334
39335 switch (GET_MODE (dst))
39336 {
39337 case V16QImode:
39338 case V8HImode:
39339 case V4SImode:
39340 case V2DImode:
39341 {
39342 enum machine_mode srcmode, dstmode;
39343 rtx (*pinsr)(rtx, rtx, rtx, rtx);
39344
39345 srcmode = mode_for_size (size, MODE_INT, 0);
39346
39347 switch (srcmode)
39348 {
39349 case QImode:
39350 if (!TARGET_SSE4_1)
39351 return false;
39352 dstmode = V16QImode;
39353 pinsr = gen_sse4_1_pinsrb;
39354 break;
39355
39356 case HImode:
39357 if (!TARGET_SSE2)
39358 return false;
39359 dstmode = V8HImode;
39360 pinsr = gen_sse2_pinsrw;
39361 break;
39362
39363 case SImode:
39364 if (!TARGET_SSE4_1)
39365 return false;
39366 dstmode = V4SImode;
39367 pinsr = gen_sse4_1_pinsrd;
39368 break;
39369
39370 case DImode:
39371 gcc_assert (TARGET_64BIT);
39372 if (!TARGET_SSE4_1)
39373 return false;
39374 dstmode = V2DImode;
39375 pinsr = gen_sse4_1_pinsrq;
39376 break;
39377
39378 default:
39379 return false;
39380 }
39381
39382 dst = gen_lowpart (dstmode, dst);
39383 src = gen_lowpart (srcmode, src);
39384
39385 pos /= size;
39386
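      /* The pinsr patterns select the element with a one-bit merge mask,
	 hence 1 << POS rather than POS itself.  */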
39387 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
39388 return true;
39389 }
39390
39391 default:
39392 return false;
39393 }
39394 }
39395 \f
39396 /* This function returns the calling-ABI-specific va_list type node.  It
39397 returns the FNDECL-specific va_list type.  */
39398
39399 static tree
39400 ix86_fn_abi_va_list (tree fndecl)
39401 {
39402 if (!TARGET_64BIT)
39403 return va_list_type_node;
39404 gcc_assert (fndecl != NULL_TREE);
39405
39406 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
39407 return ms_va_list_type_node;
39408 else
39409 return sysv_va_list_type_node;
39410 }
39411
39412 /* Returns the canonical va_list type specified by TYPE. If there
39413 is no valid TYPE provided, it returns NULL_TREE.  */
39414
39415 static tree
39416 ix86_canonical_va_list_type (tree type)
39417 {
39418 tree wtype, htype;
39419
39420 /* Resolve references and pointers to va_list type. */
39421 if (TREE_CODE (type) == MEM_REF)
39422 type = TREE_TYPE (type);
39423 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
39424 type = TREE_TYPE (type);
39425 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
39426 type = TREE_TYPE (type);
39427
39428 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
39429 {
39430 wtype = va_list_type_node;
39431 gcc_assert (wtype != NULL_TREE);
39432 htype = type;
39433 if (TREE_CODE (wtype) == ARRAY_TYPE)
39434 {
39435 /* If va_list is an array type, the argument may have decayed
39436 to a pointer type, e.g. by being passed to another function.
39437 In that case, unwrap both types so that we can compare the
39438 underlying records. */
39439 if (TREE_CODE (htype) == ARRAY_TYPE
39440 || POINTER_TYPE_P (htype))
39441 {
39442 wtype = TREE_TYPE (wtype);
39443 htype = TREE_TYPE (htype);
39444 }
39445 }
39446 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39447 return va_list_type_node;
39448 wtype = sysv_va_list_type_node;
39449 gcc_assert (wtype != NULL_TREE);
39450 htype = type;
39451 if (TREE_CODE (wtype) == ARRAY_TYPE)
39452 {
39453 /* If va_list is an array type, the argument may have decayed
39454 to a pointer type, e.g. by being passed to another function.
39455 In that case, unwrap both types so that we can compare the
39456 underlying records. */
39457 if (TREE_CODE (htype) == ARRAY_TYPE
39458 || POINTER_TYPE_P (htype))
39459 {
39460 wtype = TREE_TYPE (wtype);
39461 htype = TREE_TYPE (htype);
39462 }
39463 }
39464 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39465 return sysv_va_list_type_node;
39466 wtype = ms_va_list_type_node;
39467 gcc_assert (wtype != NULL_TREE);
39468 htype = type;
39469 if (TREE_CODE (wtype) == ARRAY_TYPE)
39470 {
39471 /* If va_list is an array type, the argument may have decayed
39472 to a pointer type, e.g. by being passed to another function.
39473 In that case, unwrap both types so that we can compare the
39474 underlying records. */
39475 if (TREE_CODE (htype) == ARRAY_TYPE
39476 || POINTER_TYPE_P (htype))
39477 {
39478 wtype = TREE_TYPE (wtype);
39479 htype = TREE_TYPE (htype);
39480 }
39481 }
39482 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39483 return ms_va_list_type_node;
39484 return NULL_TREE;
39485 }
39486 return std_canonical_va_list_type (type);
39487 }
39488
39489 /* Iterate through the target-specific builtin types for va_list.
39490 IDX denotes the iterator, *PTREE is set to the result type of
39491 the va_list builtin, and *PNAME to its internal name.
39492 Returns zero if there is no element for this index, otherwise
39493 IDX should be increased upon the next call.
39494 Note, do not iterate a base builtin's name like __builtin_va_list.
39495 Used from c_common_nodes_and_builtins. */
39496
39497 static int
39498 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
39499 {
39500 if (TARGET_64BIT)
39501 {
39502 switch (idx)
39503 {
39504 default:
39505 break;
39506
39507 case 0:
39508 *ptree = ms_va_list_type_node;
39509 *pname = "__builtin_ms_va_list";
39510 return 1;
39511
39512 case 1:
39513 *ptree = sysv_va_list_type_node;
39514 *pname = "__builtin_sysv_va_list";
39515 return 1;
39516 }
39517 }
39518
39519 return 0;
39520 }
39521
39522 #undef TARGET_SCHED_DISPATCH
39523 #define TARGET_SCHED_DISPATCH has_dispatch
39524 #undef TARGET_SCHED_DISPATCH_DO
39525 #define TARGET_SCHED_DISPATCH_DO do_dispatch
39526 #undef TARGET_SCHED_REASSOCIATION_WIDTH
39527 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
39528 #undef TARGET_SCHED_REORDER
39529 #define TARGET_SCHED_REORDER ix86_sched_reorder
39530
39531 /* The size of the dispatch window is the total number of bytes of
39532 object code allowed in a window. */
39533 #define DISPATCH_WINDOW_SIZE 16
39534
39535 /* Number of dispatch windows considered for scheduling. */
39536 #define MAX_DISPATCH_WINDOWS 3
39537
39538 /* Maximum number of instructions in a window. */
39539 #define MAX_INSN 4
39540
39541 /* Maximum number of immediate operands in a window. */
39542 #define MAX_IMM 4
39543
39544 /* Maximum number of immediate bits allowed in a window. */
39545 #define MAX_IMM_SIZE 128
39546
39547 /* Maximum number of 32 bit immediates allowed in a window. */
39548 #define MAX_IMM_32 4
39549
39550 /* Maximum number of 64 bit immediates allowed in a window. */
39551 #define MAX_IMM_64 2
39552
39553 /* Maximum total of loads or prefetches allowed in a window. */
39554 #define MAX_LOAD 2
39555
39556 /* Maximum total of stores allowed in a window. */
39557 #define MAX_STORE 1
39558
39559 #undef BIG
39560 #define BIG 100
39561
39562
39563 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
39564 enum dispatch_group {
39565 disp_no_group = 0,
39566 disp_load,
39567 disp_store,
39568 disp_load_store,
39569 disp_prefetch,
39570 disp_imm,
39571 disp_imm_32,
39572 disp_imm_64,
39573 disp_branch,
39574 disp_cmp,
39575 disp_jcc,
39576 disp_last
39577 };
39578
39579 /* Number of allowable groups in a dispatch window. It is an array
39580 indexed by dispatch_group enum. 100 is used as a big number,
39581 because the number of these kinds of operations does not have any
39582 effect on the dispatch window, but we need them for other reasons in
39583 the table. */
39584 static unsigned int num_allowable_groups[disp_last] = {
39585 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
39586 };
39587
39588 char group_name[disp_last + 1][16] = {
39589 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
39590 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
39591 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
39592 };
39593
39594 /* Instruction path. */
39595 enum insn_path {
39596 no_path = 0,
39597 path_single, /* Single micro op. */
39598 path_double, /* Double micro op. */
39599 path_multi, /* Instructions with more than 2 micro ops.  */
39600 last_path
39601 };
39602
39603 /* sched_insn_info describes one instruction scheduled into a dispatch
39604 window: the insn itself together with its dispatch group, decode path,
39605 byte length and immediate size.
39606
39607 Windows are allocated for each basic block and are linked
39608 together. */
39609 typedef struct sched_insn_info_s {
39610 rtx insn;
39611 enum dispatch_group group;
39612 enum insn_path path;
39613 int byte_len;
39614 int imm_bytes;
39615 } sched_insn_info;
39616
39617 /* Linked list of dispatch windows.  This is a two-way list of
39618 dispatch windows of a basic block. It contains information about
39619 the number of uops in the window and the total number of
39620 instructions and of bytes in the object code for this dispatch
39621 window. */
39622 typedef struct dispatch_windows_s {
39623 int num_insn; /* Number of insns in the window. */
39624 int num_uops; /* Number of uops in the window. */
39625 int window_size; /* Number of bytes in the window. */
39626 int window_num; /* Window number, 0 or 1. */
39627 int num_imm; /* Number of immediates in the window. */
39628 int num_imm_32; /* Number of 32 bit immediates in the window. */
39629 int num_imm_64; /* Number of 64 bit immediates in the window. */
39630 int imm_size; /* Total immediates in the window. */
39631 int num_loads; /* Total memory loads in the window. */
39632 int num_stores; /* Total memory stores in the window. */
39633 int violation; /* Violation exists in window. */
39634 sched_insn_info *window; /* Pointer to the window. */
39635 struct dispatch_windows_s *next;
39636 struct dispatch_windows_s *prev;
39637 } dispatch_windows;
39638
39639 /* Immediate values used in an insn. */
39640 typedef struct imm_info_s
39641 {
39642 int imm;
39643 int imm32;
39644 int imm64;
39645 } imm_info;
39646
39647 static dispatch_windows *dispatch_window_list;
39648 static dispatch_windows *dispatch_window_list1;
39649
39650 /* Get dispatch group of insn. */
39651
39652 static enum dispatch_group
39653 get_mem_group (rtx insn)
39654 {
39655 enum attr_memory memory;
39656
39657 if (INSN_CODE (insn) < 0)
39658 return disp_no_group;
39659 memory = get_attr_memory (insn);
39660 if (memory == MEMORY_STORE)
39661 return disp_store;
39662
39663 if (memory == MEMORY_LOAD)
39664 return disp_load;
39665
39666 if (memory == MEMORY_BOTH)
39667 return disp_load_store;
39668
39669 return disp_no_group;
39670 }
39671
39672 /* Return true if insn is a compare instruction. */
39673
39674 static bool
39675 is_cmp (rtx insn)
39676 {
39677 enum attr_type type;
39678
39679 type = get_attr_type (insn);
39680 return (type == TYPE_TEST
39681 || type == TYPE_ICMP
39682 || type == TYPE_FCMP
39683 || GET_CODE (PATTERN (insn)) == COMPARE);
39684 }
39685
39686 /* Return true if a dispatch violation was encountered. */
39687
39688 static bool
39689 dispatch_violation (void)
39690 {
39691 if (dispatch_window_list->next)
39692 return dispatch_window_list->next->violation;
39693 return dispatch_window_list->violation;
39694 }
39695
39696 /* Return true if insn is a branch instruction. */
39697
39698 static bool
39699 is_branch (rtx insn)
39700 {
39701 return (CALL_P (insn) || JUMP_P (insn));
39702 }
39703
39704 /* Return true if insn is a prefetch instruction. */
39705
39706 static bool
39707 is_prefetch (rtx insn)
39708 {
39709 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
39710 }
39711
39712 /* This function initializes a dispatch window and the list container holding a
39713 pointer to the window. */
39714
39715 static void
39716 init_window (int window_num)
39717 {
39718 int i;
39719 dispatch_windows *new_list;
39720
39721 if (window_num == 0)
39722 new_list = dispatch_window_list;
39723 else
39724 new_list = dispatch_window_list1;
39725
39726 new_list->num_insn = 0;
39727 new_list->num_uops = 0;
39728 new_list->window_size = 0;
39729 new_list->next = NULL;
39730 new_list->prev = NULL;
39731 new_list->window_num = window_num;
39732 new_list->num_imm = 0;
39733 new_list->num_imm_32 = 0;
39734 new_list->num_imm_64 = 0;
39735 new_list->imm_size = 0;
39736 new_list->num_loads = 0;
39737 new_list->num_stores = 0;
39738 new_list->violation = false;
39739
39740 for (i = 0; i < MAX_INSN; i++)
39741 {
39742 new_list->window[i].insn = NULL;
39743 new_list->window[i].group = disp_no_group;
39744 new_list->window[i].path = no_path;
39745 new_list->window[i].byte_len = 0;
39746 new_list->window[i].imm_bytes = 0;
39747 }
39748 return;
39749 }
39750
39751 /* This function allocates and initializes a dispatch window and the
39752 list container holding a pointer to the window. */
39753
39754 static dispatch_windows *
39755 allocate_window (void)
39756 {
39757 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
39758 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
39759
39760 return new_list;
39761 }
39762
39763 /* This routine initializes the dispatch scheduling information. It
39764 initiates building dispatch scheduler tables and constructs the
39765 first dispatch window. */
39766
39767 static void
39768 init_dispatch_sched (void)
39769 {
39770 /* Allocate a dispatch list and a window. */
39771 dispatch_window_list = allocate_window ();
39772 dispatch_window_list1 = allocate_window ();
39773 init_window (0);
39774 init_window (1);
39775 }
39776
39777 /* This function returns true if a branch is detected. End of a basic block
39778 does not have to be a branch, but here we assume only branches end a
39779 window. */
39780
39781 static bool
39782 is_end_basic_block (enum dispatch_group group)
39783 {
39784 return group == disp_branch;
39785 }
39786
39787 /* This function is called when the end of window processing is reached. */
39788
39789 static void
39790 process_end_window (void)
39791 {
39792 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
39793 if (dispatch_window_list->next)
39794 {
39795 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
39796 gcc_assert (dispatch_window_list->window_size
39797 + dispatch_window_list1->window_size <= 48);
39798 init_window (1);
39799 }
39800 init_window (0);
39801 }
39802
39803 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
39804 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
39805 for 48 bytes of instructions. Note that these windows are not dispatch
39806 windows whose sizes are DISPATCH_WINDOW_SIZE.  */
39807
39808 static dispatch_windows *
39809 allocate_next_window (int window_num)
39810 {
39811 if (window_num == 0)
39812 {
39813 if (dispatch_window_list->next)
39814 init_window (1);
39815 init_window (0);
39816 return dispatch_window_list;
39817 }
39818
39819 dispatch_window_list->next = dispatch_window_list1;
39820 dispatch_window_list1->prev = dispatch_window_list;
39821
39822 return dispatch_window_list1;
39823 }
39824
39825 /* Count the immediate operands found in *IN_RTX; for_each_rtx callback used by find_constant. */
39826
39827 static int
39828 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
39829 {
39830 if (*in_rtx == 0)
39831 return 0;
39832
39833 switch (GET_CODE (*in_rtx))
39834 {
39835 case CONST:
39836 case SYMBOL_REF:
39837 case CONST_INT:
39838 (imm_values->imm)++;
39839 if (x86_64_immediate_operand (*in_rtx, SImode))
39840 (imm_values->imm32)++;
39841 else
39842 (imm_values->imm64)++;
39843 break;
39844
39845 case CONST_DOUBLE:
39846 (imm_values->imm)++;
39847 (imm_values->imm64)++;
39848 break;
39849
39850 case CODE_LABEL:
39851 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
39852 {
39853 (imm_values->imm)++;
39854 (imm_values->imm32)++;
39855 }
39856 break;
39857
39858 default:
39859 break;
39860 }
39861
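  /* Returning zero keeps for_each_rtx walking the rest of the pattern.  */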
39862 return 0;
39863 }
39864
39865 /* Compute number of immediate operands of an instruction. */
39866
39867 static void
39868 find_constant (rtx in_rtx, imm_info *imm_values)
39869 {
39870 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
39871 (rtx_function) find_constant_1, (void *) imm_values);
39872 }
39873
39874 /* Return total size of immediate operands of an instruction along with number
39875 of corresponding immediate operands.  It initializes its parameters to zero
39876 before calling FIND_CONSTANT.
39877 INSN is the input instruction.  IMM is the total number of immediates.
39878 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
39879 bit immediates. */
39880
39881 static int
39882 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
39883 {
39884 imm_info imm_values = {0, 0, 0};
39885
39886 find_constant (insn, &imm_values);
39887 *imm = imm_values.imm;
39888 *imm32 = imm_values.imm32;
39889 *imm64 = imm_values.imm64;
39890 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
39891 }
39892
39893 /* This function indicates whether an instruction has any immediate
39894 operands. */
39895
39896 static bool
39897 has_immediate (rtx insn)
39898 {
39899 int num_imm_operand;
39900 int num_imm32_operand;
39901 int num_imm64_operand;
39902
39903 if (insn)
39904 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39905 &num_imm64_operand);
39906 return false;
39907 }
39908
39909 /* Return single or double path for instructions. */
39910
39911 static enum insn_path
39912 get_insn_path (rtx insn)
39913 {
39914 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
39915
39916 if ((int)path == 0)
39917 return path_single;
39918
39919 if ((int)path == 1)
39920 return path_double;
39921
39922 return path_multi;
39923 }
39924
39925 /* Return insn dispatch group. */
39926
39927 static enum dispatch_group
39928 get_insn_group (rtx insn)
39929 {
39930 enum dispatch_group group = get_mem_group (insn);
39931 if (group)
39932 return group;
39933
39934 if (is_branch (insn))
39935 return disp_branch;
39936
39937 if (is_cmp (insn))
39938 return disp_cmp;
39939
39940 if (has_immediate (insn))
39941 return disp_imm;
39942
39943 if (is_prefetch (insn))
39944 return disp_prefetch;
39945
39946 return disp_no_group;
39947 }
39948
39949 /* Count number of GROUP restricted instructions in a dispatch
39950 window WINDOW_LIST. */
39951
39952 static int
39953 count_num_restricted (rtx insn, dispatch_windows *window_list)
39954 {
39955 enum dispatch_group group = get_insn_group (insn);
39956 int imm_size;
39957 int num_imm_operand;
39958 int num_imm32_operand;
39959 int num_imm64_operand;
39960
39961 if (group == disp_no_group)
39962 return 0;
39963
39964 if (group == disp_imm)
39965 {
39966 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39967 &num_imm64_operand);
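      /* Return BIG if the insn would overflow the window's total immediate
	 bytes, its immediate operand count, or the per-width limits; a
	 64-bit immediate counts as two 32-bit slots.  */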
39968 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
39969 || num_imm_operand + window_list->num_imm > MAX_IMM
39970 || (num_imm32_operand > 0
39971 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
39972 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
39973 || (num_imm64_operand > 0
39974 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
39975 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
39976 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
39977 && num_imm64_operand > 0
39978 && ((window_list->num_imm_64 > 0
39979 && window_list->num_insn >= 2)
39980 || window_list->num_insn >= 3)))
39981 return BIG;
39982
39983 return 1;
39984 }
39985
39986 if ((group == disp_load_store
39987 && (window_list->num_loads >= MAX_LOAD
39988 || window_list->num_stores >= MAX_STORE))
39989 || ((group == disp_load
39990 || group == disp_prefetch)
39991 && window_list->num_loads >= MAX_LOAD)
39992 || (group == disp_store
39993 && window_list->num_stores >= MAX_STORE))
39994 return BIG;
39995
39996 return 1;
39997 }
39998
39999 /* This function returns true if INSN satisfies the dispatch rules on the
40000 last window scheduled. */
40001
40002 static bool
40003 fits_dispatch_window (rtx insn)
40004 {
40005 dispatch_windows *window_list = dispatch_window_list;
40006 dispatch_windows *window_list_next = dispatch_window_list->next;
40007 unsigned int num_restrict;
40008 enum dispatch_group group = get_insn_group (insn);
40009 enum insn_path path = get_insn_path (insn);
40010 int sum;
40011
40012 /* Make disp_cmp and disp_jcc get scheduled as late as possible.  These
40013 instructions should be given the lowest priority in the
40014 scheduling process by the Haifa scheduler to make sure they will be
40015 scheduled in the same dispatch window as the instruction that references them.  */
40016 if (group == disp_jcc || group == disp_cmp)
40017 return false;
40018
40019 /* Check nonrestricted. */
40020 if (group == disp_no_group || group == disp_branch)
40021 return true;
40022
40023 /* Get last dispatch window. */
40024 if (window_list_next)
40025 window_list = window_list_next;
40026
40027 if (window_list->window_num == 1)
40028 {
40029 sum = window_list->prev->window_size + window_list->window_size;
40030
40031 if (sum == 32
40032 || (min_insn_size (insn) + sum) >= 48)
40033 /* Window 1 is full. Go for next window. */
40034 return true;
40035 }
40036
40037 num_restrict = count_num_restricted (insn, window_list);
40038
40039 if (num_restrict > num_allowable_groups[group])
40040 return false;
40041
40042 /* See if it fits in the first window. */
40043 if (window_list->window_num == 0)
40044 {
40045 /* The first window should have only single- and double-path
40046 uops. */
40047 if (path == path_double
40048 && (window_list->num_uops + 2) > MAX_INSN)
40049 return false;
40050 else if (path != path_single)
40051 return false;
40052 }
40053 return true;
40054 }
40055
40056 /* Add an instruction INSN with NUM_UOPS micro-operations to the
40057 dispatch window WINDOW_LIST. */
40058
40059 static void
40060 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
40061 {
40062 int byte_len = min_insn_size (insn);
40063 int num_insn = window_list->num_insn;
40064 int imm_size;
40065 sched_insn_info *window = window_list->window;
40066 enum dispatch_group group = get_insn_group (insn);
40067 enum insn_path path = get_insn_path (insn);
40068 int num_imm_operand;
40069 int num_imm32_operand;
40070 int num_imm64_operand;
40071
40072 if (!window_list->violation && group != disp_cmp
40073 && !fits_dispatch_window (insn))
40074 window_list->violation = true;
40075
40076 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40077 &num_imm64_operand);
40078
40079 /* Initialize window with new instruction. */
40080 window[num_insn].insn = insn;
40081 window[num_insn].byte_len = byte_len;
40082 window[num_insn].group = group;
40083 window[num_insn].path = path;
40084 window[num_insn].imm_bytes = imm_size;
40085
40086 window_list->window_size += byte_len;
40087 window_list->num_insn = num_insn + 1;
40088 window_list->num_uops = window_list->num_uops + num_uops;
40089 window_list->imm_size += imm_size;
40090 window_list->num_imm += num_imm_operand;
40091 window_list->num_imm_32 += num_imm32_operand;
40092 window_list->num_imm_64 += num_imm64_operand;
40093
40094 if (group == disp_store)
40095 window_list->num_stores += 1;
40096 else if (group == disp_load
40097 || group == disp_prefetch)
40098 window_list->num_loads += 1;
40099 else if (group == disp_load_store)
40100 {
40101 window_list->num_stores += 1;
40102 window_list->num_loads += 1;
40103 }
40104 }
40105
40106 /* Adds a scheduled instruction, INSN, to the current dispatch window.
40107 If the total bytes of instructions or the number of instructions in
40108 the window exceeds the allowable limit, it allocates a new window. */
40109
40110 static void
40111 add_to_dispatch_window (rtx insn)
40112 {
40113 int byte_len;
40114 dispatch_windows *window_list;
40115 dispatch_windows *next_list;
40116 dispatch_windows *window0_list;
40117 enum insn_path path;
40118 enum dispatch_group insn_group;
40119 bool insn_fits;
40120 int num_insn;
40121 int num_uops;
40122 int window_num;
40123 int insn_num_uops;
40124 int sum;
40125
40126 if (INSN_CODE (insn) < 0)
40127 return;
40128
40129 byte_len = min_insn_size (insn);
40130 window_list = dispatch_window_list;
40131 next_list = window_list->next;
40132 path = get_insn_path (insn);
40133 insn_group = get_insn_group (insn);
40134
40135 /* Get the last dispatch window. */
40136 if (next_list)
40137 window_list = dispatch_window_list->next;
40138
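  /* Convert the decode path into a uop count; path_multi (== 3) serves as
     an estimate for instructions with more than two uops.  */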
40139 if (path == path_single)
40140 insn_num_uops = 1;
40141 else if (path == path_double)
40142 insn_num_uops = 2;
40143 else
40144 insn_num_uops = (int) path;
40145
40146 /* If the current window is full, get a new window.
40147 Window number zero is full if MAX_INSN uops are scheduled in it.
40148 Window number one is full if window zero's bytes plus window
40149 one's bytes equal 32, if adding the bytes of the new instruction
40150 would make the total greater than 48, or if it already has MAX_INSN
40151 instructions in it. */
40152 num_insn = window_list->num_insn;
40153 num_uops = window_list->num_uops;
40154 window_num = window_list->window_num;
40155 insn_fits = fits_dispatch_window (insn);
40156
40157 if (num_insn >= MAX_INSN
40158 || num_uops + insn_num_uops > MAX_INSN
40159 || !(insn_fits))
40160 {
40161 window_num = ~window_num & 1;
40162 window_list = allocate_next_window (window_num);
40163 }
40164
40165 if (window_num == 0)
40166 {
40167 add_insn_window (insn, window_list, insn_num_uops);
40168 if (window_list->num_insn >= MAX_INSN
40169 && insn_group == disp_branch)
40170 {
40171 process_end_window ();
40172 return;
40173 }
40174 }
40175 else if (window_num == 1)
40176 {
40177 window0_list = window_list->prev;
40178 sum = window0_list->window_size + window_list->window_size;
40179 if (sum == 32
40180 || (byte_len + sum) >= 48)
40181 {
40182 process_end_window ();
40183 window_list = dispatch_window_list;
40184 }
40185
40186 add_insn_window (insn, window_list, insn_num_uops);
40187 }
40188 else
40189 gcc_unreachable ();
40190
40191 if (is_end_basic_block (insn_group))
40192 {
40193 /* End of basic block is reached; do end-basic-block processing. */
40194 process_end_window ();
40195 return;
40196 }
40197 }
40198
40199 /* Print the dispatch window, WINDOW_NUM, to FILE. */
40200
40201 DEBUG_FUNCTION static void
40202 debug_dispatch_window_file (FILE *file, int window_num)
40203 {
40204 dispatch_windows *list;
40205 int i;
40206
40207 if (window_num == 0)
40208 list = dispatch_window_list;
40209 else
40210 list = dispatch_window_list1;
40211
40212 fprintf (file, "Window #%d:\n", list->window_num);
40213 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
40214 list->num_insn, list->num_uops, list->window_size);
40215 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40216 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
40217
40218 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
40219 list->num_stores);
40220 fprintf (file, " insn info:\n");
40221
40222 for (i = 0; i < MAX_INSN; i++)
40223 {
40224 if (!list->window[i].insn)
40225 break;
40226 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
40227 i, group_name[list->window[i].group],
40228 i, (void *)list->window[i].insn,
40229 i, list->window[i].path,
40230 i, list->window[i].byte_len,
40231 i, list->window[i].imm_bytes);
40232 }
40233 }
40234
40235 /* Print to stdout a dispatch window. */
40236
40237 DEBUG_FUNCTION void
40238 debug_dispatch_window (int window_num)
40239 {
40240 debug_dispatch_window_file (stdout, window_num);
40241 }
40242
40243 /* Print INSN dispatch information to FILE. */
40244
40245 DEBUG_FUNCTION static void
40246 debug_insn_dispatch_info_file (FILE *file, rtx insn)
40247 {
40248 int byte_len;
40249 enum insn_path path;
40250 enum dispatch_group group;
40251 int imm_size;
40252 int num_imm_operand;
40253 int num_imm32_operand;
40254 int num_imm64_operand;
40255
40256 if (INSN_CODE (insn) < 0)
40257 return;
40258
40259 byte_len = min_insn_size (insn);
40260 path = get_insn_path (insn);
40261 group = get_insn_group (insn);
40262 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40263 &num_imm64_operand);
40264
40265 fprintf (file, " insn info:\n");
40266 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
40267 group_name[group], path, byte_len);
40268 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40269 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
40270 }
40271
40272 /* Print to STDOUT the status of the ready list with respect to
40273 dispatch windows. */
40274
40275 DEBUG_FUNCTION void
40276 debug_ready_dispatch (void)
40277 {
40278 int i;
40279 int no_ready = number_in_ready ();
40280
40281 fprintf (stdout, "Number of ready: %d\n", no_ready);
40282
40283 for (i = 0; i < no_ready; i++)
40284 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
40285 }
40286
40287 /* This routine is the driver of the dispatch scheduler. */
40288
40289 static void
40290 do_dispatch (rtx insn, int mode)
40291 {
40292 if (mode == DISPATCH_INIT)
40293 init_dispatch_sched ();
40294 else if (mode == ADD_TO_DISPATCH_WINDOW)
40295 add_to_dispatch_window (insn);
40296 }
40297
40298 /* Return TRUE if Dispatch Scheduling is supported. */
40299
40300 static bool
40301 has_dispatch (rtx insn, int action)
40302 {
40303 if ((TARGET_BDVER1 || TARGET_BDVER2)
40304 && flag_dispatch_scheduler)
40305 switch (action)
40306 {
40307 default:
40308 return false;
40309
40310 case IS_DISPATCH_ON:
40311 return true;
40312 break;
40313
40314 case IS_CMP:
40315 return is_cmp (insn);
40316
40317 case DISPATCH_VIOLATION:
40318 return dispatch_violation ();
40319
40320 case FITS_DISPATCH_WINDOW:
40321 return fits_dispatch_window (insn);
40322 }
40323
40324 return false;
40325 }
40326
40327 /* Implementation of reassociation_width target hook used by
40328 the reassoc phase to identify the parallelism level in a reassociated
40329 tree.  The statement's tree_code is passed in OPC.  The arguments'
40330 type is passed in MODE.
40331
40332 Currently parallel reassociation is enabled for Atom
40333 processors only and we set reassociation width to be 2
40334 because Atom may issue up to 2 instructions per cycle.
40335
40336 Return value should be fixed if parallel reassociation is
40337 enabled for other processors. */
40338
40339 static int
40340 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
40341 enum machine_mode mode)
40342 {
40343 int res = 1;
40344
40345 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
40346 res = 2;
40347 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
40348 res = 2;
40349
40350 return res;
40351 }
40352
40353 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
40354 place emms and femms instructions. */
40355
40356 static enum machine_mode
40357 ix86_preferred_simd_mode (enum machine_mode mode)
40358 {
40359 if (!TARGET_SSE)
40360 return word_mode;
40361
40362 switch (mode)
40363 {
40364 case QImode:
40365 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
40366 case HImode:
40367 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
40368 case SImode:
40369 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
40370 case DImode:
40371 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
40372
40373 case SFmode:
40374 if (TARGET_AVX && !TARGET_PREFER_AVX128)
40375 return V8SFmode;
40376 else
40377 return V4SFmode;
40378
40379 case DFmode:
40380 if (!TARGET_VECTORIZE_DOUBLE)
40381 return word_mode;
40382 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
40383 return V4DFmode;
40384 else if (TARGET_SSE2)
40385 return V2DFmode;
40386 /* FALLTHRU */
40387
40388 default:
40389 return word_mode;
40390 }
40391 }
40392
40393 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
40394 vectors. */
40395
40396 static unsigned int
40397 ix86_autovectorize_vector_sizes (void)
40398 {
40399 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
40400 }
40401
40402 /* Implement targetm.vectorize.init_cost. */
40403
40404 static void *
40405 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
40406 {
40407 unsigned *cost = XNEWVEC (unsigned, 3);
40408 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
40409 return cost;
40410 }
40411
40412 /* Implement targetm.vectorize.add_stmt_cost. */
40413
40414 static unsigned
40415 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
40416 struct _stmt_vec_info *stmt_info, int misalign,
40417 enum vect_cost_model_location where)
40418 {
40419 unsigned *cost = (unsigned *) data;
40420 unsigned retval = 0;
40421
40422 if (flag_vect_cost_model)
40423 {
40424 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
40425 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
40426
40427 /* Statements in an inner loop relative to the loop being
40428 vectorized are weighted more heavily. The value here is
40429 arbitrary and could potentially be improved with analysis. */
40430 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
40431 count *= 50; /* FIXME. */
40432
40433 retval = (unsigned) (count * stmt_cost);
40434 cost[where] += retval;
40435 }
40436
40437 return retval;
40438 }
40439
40440 /* Implement targetm.vectorize.finish_cost. */
40441
40442 static void
40443 ix86_finish_cost (void *data, unsigned *prologue_cost,
40444 unsigned *body_cost, unsigned *epilogue_cost)
40445 {
40446 unsigned *cost = (unsigned *) data;
40447 *prologue_cost = cost[vect_prologue];
40448 *body_cost = cost[vect_body];
40449 *epilogue_cost = cost[vect_epilogue];
40450 }
40451
40452 /* Implement targetm.vectorize.destroy_cost_data. */
40453
40454 static void
40455 ix86_destroy_cost_data (void *data)
40456 {
40457 free (data);
40458 }
40459
40460 /* Validate target specific memory model bits in VAL. */
40461
40462 static unsigned HOST_WIDE_INT
40463 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
40464 {
40465 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
40466 unsigned HOST_WIDE_INT strong;
40467
40468 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
40469 |MEMMODEL_MASK)
40470 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
40471 {
40472 warning (OPT_Winvalid_memory_model,
40473 "Unknown architecture specific memory model");
40474 return MEMMODEL_SEQ_CST;
40475 }
40476 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
40477 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
40478 {
40479 warning (OPT_Winvalid_memory_model,
40480 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
40481 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
40482 }
40483 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
40484 {
40485 warning (OPT_Winvalid_memory_model,
40486 "HLE_RELEASE not used with RELEASE or stronger memory model");
40487 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
40488 }
40489 return val;
40490 }
40491
40492 /* Initialize the GCC target structure. */
40493 #undef TARGET_RETURN_IN_MEMORY
40494 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
40495
40496 #undef TARGET_LEGITIMIZE_ADDRESS
40497 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
40498
40499 #undef TARGET_ATTRIBUTE_TABLE
40500 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
40501 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
40502 # undef TARGET_MERGE_DECL_ATTRIBUTES
40503 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
40504 #endif
40505
40506 #undef TARGET_COMP_TYPE_ATTRIBUTES
40507 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
40508
40509 #undef TARGET_INIT_BUILTINS
40510 #define TARGET_INIT_BUILTINS ix86_init_builtins
40511 #undef TARGET_BUILTIN_DECL
40512 #define TARGET_BUILTIN_DECL ix86_builtin_decl
40513 #undef TARGET_EXPAND_BUILTIN
40514 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
40515
40516 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
40517 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
40518 ix86_builtin_vectorized_function
40519
40520 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
40521 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
40522
40523 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
40524 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
40525
40526 #undef TARGET_VECTORIZE_BUILTIN_GATHER
40527 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
40528
40529 #undef TARGET_BUILTIN_RECIPROCAL
40530 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
40531
40532 #undef TARGET_ASM_FUNCTION_EPILOGUE
40533 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
40534
40535 #undef TARGET_ENCODE_SECTION_INFO
40536 #ifndef SUBTARGET_ENCODE_SECTION_INFO
40537 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
40538 #else
40539 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
40540 #endif
40541
40542 #undef TARGET_ASM_OPEN_PAREN
40543 #define TARGET_ASM_OPEN_PAREN ""
40544 #undef TARGET_ASM_CLOSE_PAREN
40545 #define TARGET_ASM_CLOSE_PAREN ""
40546
40547 #undef TARGET_ASM_BYTE_OP
40548 #define TARGET_ASM_BYTE_OP ASM_BYTE
40549
40550 #undef TARGET_ASM_ALIGNED_HI_OP
40551 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
40552 #undef TARGET_ASM_ALIGNED_SI_OP
40553 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
40554 #ifdef ASM_QUAD
40555 #undef TARGET_ASM_ALIGNED_DI_OP
40556 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
40557 #endif
40558
40559 #undef TARGET_PROFILE_BEFORE_PROLOGUE
40560 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
40561
40562 #undef TARGET_ASM_UNALIGNED_HI_OP
40563 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
40564 #undef TARGET_ASM_UNALIGNED_SI_OP
40565 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
40566 #undef TARGET_ASM_UNALIGNED_DI_OP
40567 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
40568
40569 #undef TARGET_PRINT_OPERAND
40570 #define TARGET_PRINT_OPERAND ix86_print_operand
40571 #undef TARGET_PRINT_OPERAND_ADDRESS
40572 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
40573 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
40574 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
40575 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
40576 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
40577
40578 #undef TARGET_SCHED_INIT_GLOBAL
40579 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
40580 #undef TARGET_SCHED_ADJUST_COST
40581 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
40582 #undef TARGET_SCHED_ISSUE_RATE
40583 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
40584 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
40585 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
40586 ia32_multipass_dfa_lookahead
40587
40588 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
40589 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
40590
40591 #undef TARGET_MEMMODEL_CHECK
40592 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
40593
40594 #ifdef HAVE_AS_TLS
40595 #undef TARGET_HAVE_TLS
40596 #define TARGET_HAVE_TLS true
40597 #endif
40598 #undef TARGET_CANNOT_FORCE_CONST_MEM
40599 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
40600 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
40601 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
40602
40603 #undef TARGET_DELEGITIMIZE_ADDRESS
40604 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
40605
40606 #undef TARGET_MS_BITFIELD_LAYOUT_P
40607 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
40608
40609 #if TARGET_MACHO
40610 #undef TARGET_BINDS_LOCAL_P
40611 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
40612 #endif
40613 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
40614 #undef TARGET_BINDS_LOCAL_P
40615 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
40616 #endif
40617
40618 #undef TARGET_ASM_OUTPUT_MI_THUNK
40619 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
40620 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
40621 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
40622
40623 #undef TARGET_ASM_FILE_START
40624 #define TARGET_ASM_FILE_START x86_file_start
40625
40626 #undef TARGET_OPTION_OVERRIDE
40627 #define TARGET_OPTION_OVERRIDE ix86_option_override
40628
40629 #undef TARGET_REGISTER_MOVE_COST
40630 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
40631 #undef TARGET_MEMORY_MOVE_COST
40632 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
40633 #undef TARGET_RTX_COSTS
40634 #define TARGET_RTX_COSTS ix86_rtx_costs
40635 #undef TARGET_ADDRESS_COST
40636 #define TARGET_ADDRESS_COST ix86_address_cost
40637
40638 #undef TARGET_FIXED_CONDITION_CODE_REGS
40639 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
40640 #undef TARGET_CC_MODES_COMPATIBLE
40641 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
40642
40643 #undef TARGET_MACHINE_DEPENDENT_REORG
40644 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
40645
40646 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
40647 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
40648
40649 #undef TARGET_BUILD_BUILTIN_VA_LIST
40650 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
40651
40652 #undef TARGET_FOLD_BUILTIN
40653 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
40654
40655 #undef TARGET_ENUM_VA_LIST_P
40656 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
40657
40658 #undef TARGET_FN_ABI_VA_LIST
40659 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
40660
40661 #undef TARGET_CANONICAL_VA_LIST_TYPE
40662 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
40663
40664 #undef TARGET_EXPAND_BUILTIN_VA_START
40665 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
40666
40667 #undef TARGET_MD_ASM_CLOBBERS
40668 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
40669
40670 #undef TARGET_PROMOTE_PROTOTYPES
40671 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
40672 #undef TARGET_STRUCT_VALUE_RTX
40673 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
40674 #undef TARGET_SETUP_INCOMING_VARARGS
40675 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
40676 #undef TARGET_MUST_PASS_IN_STACK
40677 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
40678 #undef TARGET_FUNCTION_ARG_ADVANCE
40679 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
40680 #undef TARGET_FUNCTION_ARG
40681 #define TARGET_FUNCTION_ARG ix86_function_arg
40682 #undef TARGET_FUNCTION_ARG_BOUNDARY
40683 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
40684 #undef TARGET_PASS_BY_REFERENCE
40685 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
40686 #undef TARGET_INTERNAL_ARG_POINTER
40687 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
40688 #undef TARGET_UPDATE_STACK_BOUNDARY
40689 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
40690 #undef TARGET_GET_DRAP_RTX
40691 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
40692 #undef TARGET_STRICT_ARGUMENT_NAMING
40693 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
40694 #undef TARGET_STATIC_CHAIN
40695 #define TARGET_STATIC_CHAIN ix86_static_chain
40696 #undef TARGET_TRAMPOLINE_INIT
40697 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
40698 #undef TARGET_RETURN_POPS_ARGS
40699 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
40700
40701 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
40702 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
40703
40704 #undef TARGET_SCALAR_MODE_SUPPORTED_P
40705 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
40706
40707 #undef TARGET_VECTOR_MODE_SUPPORTED_P
40708 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
40709
40710 #undef TARGET_C_MODE_FOR_SUFFIX
40711 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
40712
40713 #ifdef HAVE_AS_TLS
40714 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
40715 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
40716 #endif
40717
40718 #ifdef SUBTARGET_INSERT_ATTRIBUTES
40719 #undef TARGET_INSERT_ATTRIBUTES
40720 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
40721 #endif
40722
40723 #undef TARGET_MANGLE_TYPE
40724 #define TARGET_MANGLE_TYPE ix86_mangle_type
40725
40726 #if !TARGET_MACHO
40727 #undef TARGET_STACK_PROTECT_FAIL
40728 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
40729 #endif
40730
40731 #undef TARGET_FUNCTION_VALUE
40732 #define TARGET_FUNCTION_VALUE ix86_function_value
40733
40734 #undef TARGET_FUNCTION_VALUE_REGNO_P
40735 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
40736
40737 #undef TARGET_PROMOTE_FUNCTION_MODE
40738 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
40739
40740 #undef TARGET_MEMBER_TYPE_FORCES_BLK
40741 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
40742
40743 #undef TARGET_SECONDARY_RELOAD
40744 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
40745
40746 #undef TARGET_CLASS_MAX_NREGS
40747 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
40748
40749 #undef TARGET_PREFERRED_RELOAD_CLASS
40750 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
40751 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
40752 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
40753 #undef TARGET_CLASS_LIKELY_SPILLED_P
40754 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
40755
40756 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
40757 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
40758 ix86_builtin_vectorization_cost
40759 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
40760 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
40761 ix86_vectorize_vec_perm_const_ok
40762 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
40763 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
40764 ix86_preferred_simd_mode
40765 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
40766 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
40767 ix86_autovectorize_vector_sizes
40768 #undef TARGET_VECTORIZE_INIT_COST
40769 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
40770 #undef TARGET_VECTORIZE_ADD_STMT_COST
40771 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
40772 #undef TARGET_VECTORIZE_FINISH_COST
40773 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
40774 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
40775 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
40776
40777 #undef TARGET_SET_CURRENT_FUNCTION
40778 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
40779
40780 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
40781 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
40782
40783 #undef TARGET_OPTION_SAVE
40784 #define TARGET_OPTION_SAVE ix86_function_specific_save
40785
40786 #undef TARGET_OPTION_RESTORE
40787 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
40788
40789 #undef TARGET_OPTION_PRINT
40790 #define TARGET_OPTION_PRINT ix86_function_specific_print
40791
40792 #undef TARGET_CAN_INLINE_P
40793 #define TARGET_CAN_INLINE_P ix86_can_inline_p
40794
40795 #undef TARGET_EXPAND_TO_RTL_HOOK
40796 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
40797
40798 #undef TARGET_LEGITIMATE_ADDRESS_P
40799 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
40800
40801 #undef TARGET_LEGITIMATE_CONSTANT_P
40802 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
40803
40804 #undef TARGET_FRAME_POINTER_REQUIRED
40805 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
40806
40807 #undef TARGET_CAN_ELIMINATE
40808 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
40809
40810 #undef TARGET_EXTRA_LIVE_ON_ENTRY
40811 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
40812
40813 #undef TARGET_ASM_CODE_END
40814 #define TARGET_ASM_CODE_END ix86_code_end
40815
40816 #undef TARGET_CONDITIONAL_REGISTER_USAGE
40817 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
40818
40819 #if TARGET_MACHO
40820 #undef TARGET_INIT_LIBFUNCS
40821 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
40822 #endif
40823
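/* TARGET_INITIALIZER (from target-def.h) expands to an aggregate
   initializer that picks up every TARGET_* macro overridden above and
   fills the remaining hooks with their default values.  */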
40824 struct gcc_target targetm = TARGET_INITIALIZER;
40825 \f
40826 #include "gt-i386.h"