gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65
66 enum upper_128bits_state
67 {
68 unknown = 0,
69 unused,
70 used
71 };
72
73 typedef struct block_info_def
74 {
75 /* State of the upper 128bits of AVX registers at exit. */
76 enum upper_128bits_state state;
77 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 in this block. */
79 bool unchanged;
80 /* TRUE if block has been processed. */
81 bool processed;
82 /* TRUE if block has been scanned. */
83 bool scanned;
84 /* Previous state of the upper 128bits of AVX registers at entry. */
85 enum upper_128bits_state prev;
86 } *block_info;
87
88 #define BLOCK_INFO(B) ((block_info) (B)->aux)
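/* Per-basic-block data for the vzeroupper pass is kept in bb->aux: it is
   allocated with alloc_aux_for_blocks (sizeof (struct block_info_def)) and
   released with free_aux_for_blocks () in move_or_delete_vzeroupper below,
   and BLOCK_INFO is the typed accessor for that field.  */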
89
90 enum call_avx256_state
91 {
92 /* Callee returns 256bit AVX register. */
93 callee_return_avx256 = -1,
94 /* Callee returns and passes 256bit AVX register. */
95 callee_return_pass_avx256,
96 /* Callee passes 256bit AVX register. */
97 callee_pass_avx256,
98     /* Callee doesn't return nor pass a 256bit AVX register, or no
99        256bit AVX register is used in the function return.  */
100 call_no_avx256,
101 /* vzeroupper intrinsic. */
102 vzeroupper_intrinsic
103 };
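/* These values are emitted as the CONST_INT operand of the
   UNSPECV_VZEROUPPER pattern for the intrinsic; move_or_delete_vzeroupper_2
   below reads the value back with INTVAL (XVECEXP (pat, 0, 0)) to decide
   whether a particular vzeroupper can be deleted or must be kept.  */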
104
105 /* Check if a 256bit AVX register is referenced in stores. */
106
107 static void
108 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 {
110 if ((REG_P (dest)
111 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
112 || (GET_CODE (set) == SET
113 && REG_P (SET_SRC (set))
114 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 {
116 enum upper_128bits_state *state
117 = (enum upper_128bits_state *) data;
118 *state = used;
119 }
120 }
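/* check_avx256_stores is a note_stores callback; move_or_delete_vzeroupper_2
   below invokes it as

     note_stores (pat, check_avx256_stores, &state);

   so that STATE is promoted to `used' as soon as an insn writes or copies a
   256bit AVX register.  */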
121
122 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
123 in basic block BB. Delete it if upper 128bit AVX registers are
124    unused.  If it isn't deleted, move it to just before a jump or call insn.
125
126 STATE is state of the upper 128bits of AVX registers at entry. */
127
128 static void
129 move_or_delete_vzeroupper_2 (basic_block bb,
130 enum upper_128bits_state state)
131 {
132 rtx insn, bb_end;
133 rtx vzeroupper_insn = NULL_RTX;
134 rtx pat;
135 int avx256;
136 bool unchanged;
137
138 if (BLOCK_INFO (bb)->unchanged)
139 {
140 if (dump_file)
141 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 bb->index, state);
143
144 BLOCK_INFO (bb)->state = state;
145 return;
146 }
147
148 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 {
150 if (dump_file)
151 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
152 bb->index, BLOCK_INFO (bb)->state);
153 return;
154 }
155
156 BLOCK_INFO (bb)->prev = state;
157
158 if (dump_file)
159 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
160 bb->index, state);
161
162 unchanged = true;
163
164   /* BB_END changes when the insn it points to is deleted, so cache it.  */
165 bb_end = BB_END (bb);
166 insn = BB_HEAD (bb);
167 while (insn != bb_end)
168 {
169 insn = NEXT_INSN (insn);
170
171 if (!NONDEBUG_INSN_P (insn))
172 continue;
173
174 /* Move vzeroupper before jump/call. */
175 if (JUMP_P (insn) || CALL_P (insn))
176 {
177 if (!vzeroupper_insn)
178 continue;
179
180 if (PREV_INSN (insn) != vzeroupper_insn)
181 {
182 if (dump_file)
183 {
184 fprintf (dump_file, "Move vzeroupper after:\n");
185 print_rtl_single (dump_file, PREV_INSN (insn));
186 fprintf (dump_file, "before:\n");
187 print_rtl_single (dump_file, insn);
188 }
189 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 PREV_INSN (insn));
191 }
192 vzeroupper_insn = NULL_RTX;
193 continue;
194 }
195
196 pat = PATTERN (insn);
197
198 /* Check insn for vzeroupper intrinsic. */
199 if (GET_CODE (pat) == UNSPEC_VOLATILE
200 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
201 {
202 if (dump_file)
203 {
204 /* Found vzeroupper intrinsic. */
205 fprintf (dump_file, "Found vzeroupper:\n");
206 print_rtl_single (dump_file, insn);
207 }
208 }
209 else
210 {
211 /* Check insn for vzeroall intrinsic. */
212 if (GET_CODE (pat) == PARALLEL
213 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
214 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
215 {
216 state = unused;
217 unchanged = false;
218
219 /* Delete pending vzeroupper insertion. */
220 if (vzeroupper_insn)
221 {
222 delete_insn (vzeroupper_insn);
223 vzeroupper_insn = NULL_RTX;
224 }
225 }
226 else if (state != used)
227 {
228 note_stores (pat, check_avx256_stores, &state);
229 if (state == used)
230 unchanged = false;
231 }
232 continue;
233 }
234
235 /* Process vzeroupper intrinsic. */
236 avx256 = INTVAL (XVECEXP (pat, 0, 0));
237
238 if (state == unused)
239 {
240 /* Since the upper 128bits are cleared, callee must not pass
241 256bit AVX register. We only need to check if callee
242 returns 256bit AVX register. */
243 if (avx256 == callee_return_avx256)
244 {
245 state = used;
246 unchanged = false;
247 }
248
249 /* Remove unnecessary vzeroupper since upper 128bits are
250 cleared. */
251 if (dump_file)
252 {
253 fprintf (dump_file, "Delete redundant vzeroupper:\n");
254 print_rtl_single (dump_file, insn);
255 }
256 delete_insn (insn);
257 }
258 else
259 {
260 /* Set state to UNUSED if callee doesn't return 256bit AVX
261 register. */
262 if (avx256 != callee_return_pass_avx256)
263 state = unused;
264
265 if (avx256 == callee_return_pass_avx256
266 || avx256 == callee_pass_avx256)
267 {
268 /* Must remove vzeroupper since callee passes in 256bit
269 AVX register. */
270 if (dump_file)
271 {
272 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
273 print_rtl_single (dump_file, insn);
274 }
275 delete_insn (insn);
276 }
277 else
278 {
279 vzeroupper_insn = insn;
280 unchanged = false;
281 }
282 }
283 }
284
285 BLOCK_INFO (bb)->state = state;
286 BLOCK_INFO (bb)->unchanged = unchanged;
287 BLOCK_INFO (bb)->scanned = true;
288
289 if (dump_file)
290 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
291 bb->index, unchanged ? "unchanged" : "changed",
292 state);
293 }
294
295 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
296 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
297    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
298 state is changed. */
299
300 static bool
301 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
302 {
303 edge e;
304 edge_iterator ei;
305 enum upper_128bits_state state, old_state, new_state;
306 bool seen_unknown;
307
308 if (dump_file)
309 fprintf (dump_file, " Process [bb %i]: status: %d\n",
310 block->index, BLOCK_INFO (block)->processed);
311
312 if (BLOCK_INFO (block)->processed)
313 return false;
314
315 state = unused;
316
317 /* Check all predecessor edges of this block. */
318 seen_unknown = false;
319 FOR_EACH_EDGE (e, ei, block->preds)
320 {
321 if (e->src == block)
322 continue;
323 switch (BLOCK_INFO (e->src)->state)
324 {
325 case unknown:
326 if (!unknown_is_unused)
327 seen_unknown = true;
328 case unused:
329 break;
330 case used:
331 state = used;
332 goto done;
333 }
334 }
335
336 if (seen_unknown)
337 state = unknown;
338
339 done:
340 old_state = BLOCK_INFO (block)->state;
341 move_or_delete_vzeroupper_2 (block, state);
342 new_state = BLOCK_INFO (block)->state;
343
344 if (state != unknown || new_state == used)
345 BLOCK_INFO (block)->processed = true;
346
347 /* Need to rescan if the upper 128bits of AVX registers are changed
348 to USED at exit. */
349 if (new_state != old_state)
350 {
351 if (new_state == used)
352 cfun->machine->rescan_vzeroupper_p = 1;
353 return true;
354 }
355 else
356 return false;
357 }
358
359 /* Go through the instruction stream looking for vzeroupper. Delete
360 it if upper 128bit AVX registers are unused. If it isn't deleted,
361    move it to just before a jump or call insn.  */
362
363 static void
364 move_or_delete_vzeroupper (void)
365 {
366 edge e;
367 edge_iterator ei;
368 basic_block bb;
369 fibheap_t worklist, pending, fibheap_swap;
370 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
371 int *bb_order;
372 int *rc_order;
373 int i;
374
375 /* Set up block info for each basic block. */
376 alloc_aux_for_blocks (sizeof (struct block_info_def));
377
378 /* Process outgoing edges of entry point. */
379 if (dump_file)
380 fprintf (dump_file, "Process outgoing edges of entry point\n");
381
382 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 {
384 move_or_delete_vzeroupper_2 (e->dest,
385 cfun->machine->caller_pass_avx256_p
386 ? used : unused);
387 BLOCK_INFO (e->dest)->processed = true;
388 }
389
390 /* Compute reverse completion order of depth first search of the CFG
391 so that the data-flow runs faster. */
392 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
393 bb_order = XNEWVEC (int, last_basic_block);
394 pre_and_rev_post_order_compute (NULL, rc_order, false);
395 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
396 bb_order[rc_order[i]] = i;
397 free (rc_order);
398
399 worklist = fibheap_new ();
400 pending = fibheap_new ();
401 visited = sbitmap_alloc (last_basic_block);
402 in_worklist = sbitmap_alloc (last_basic_block);
403 in_pending = sbitmap_alloc (last_basic_block);
404 sbitmap_zero (in_worklist);
405
406 /* Don't check outgoing edges of entry point. */
407 sbitmap_ones (in_pending);
408 FOR_EACH_BB (bb)
409 if (BLOCK_INFO (bb)->processed)
410 RESET_BIT (in_pending, bb->index);
411 else
412 {
413 move_or_delete_vzeroupper_1 (bb, false);
414 fibheap_insert (pending, bb_order[bb->index], bb);
415 }
416
417 if (dump_file)
418 fprintf (dump_file, "Check remaining basic blocks\n");
419
420 while (!fibheap_empty (pending))
421 {
422 fibheap_swap = pending;
423 pending = worklist;
424 worklist = fibheap_swap;
425 sbitmap_swap = in_pending;
426 in_pending = in_worklist;
427 in_worklist = sbitmap_swap;
428
429 sbitmap_zero (visited);
430
431 cfun->machine->rescan_vzeroupper_p = 0;
432
433 while (!fibheap_empty (worklist))
434 {
435 bb = (basic_block) fibheap_extract_min (worklist);
436 RESET_BIT (in_worklist, bb->index);
437 gcc_assert (!TEST_BIT (visited, bb->index));
438 if (!TEST_BIT (visited, bb->index))
439 {
440 edge_iterator ei;
441
442 SET_BIT (visited, bb->index);
443
444 if (move_or_delete_vzeroupper_1 (bb, false))
445 FOR_EACH_EDGE (e, ei, bb->succs)
446 {
447 if (e->dest == EXIT_BLOCK_PTR
448 || BLOCK_INFO (e->dest)->processed)
449 continue;
450
451 if (TEST_BIT (visited, e->dest->index))
452 {
453 if (!TEST_BIT (in_pending, e->dest->index))
454 {
455 /* Send E->DEST to next round. */
456 SET_BIT (in_pending, e->dest->index);
457 fibheap_insert (pending,
458 bb_order[e->dest->index],
459 e->dest);
460 }
461 }
462 else if (!TEST_BIT (in_worklist, e->dest->index))
463 {
464 /* Add E->DEST to current round. */
465 SET_BIT (in_worklist, e->dest->index);
466 fibheap_insert (worklist, bb_order[e->dest->index],
467 e->dest);
468 }
469 }
470 }
471 }
472
473 if (!cfun->machine->rescan_vzeroupper_p)
474 break;
475 }
476
477 free (bb_order);
478 fibheap_delete (worklist);
479 fibheap_delete (pending);
480 sbitmap_free (visited);
481 sbitmap_free (in_worklist);
482 sbitmap_free (in_pending);
483
484 if (dump_file)
485 fprintf (dump_file, "Process remaining basic blocks\n");
486
487 FOR_EACH_BB (bb)
488 move_or_delete_vzeroupper_1 (bb, true);
489
490 free_aux_for_blocks ();
491 }
492
493 static rtx legitimize_dllimport_symbol (rtx, bool);
494
495 #ifndef CHECK_STACK_LIMIT
496 #define CHECK_STACK_LIMIT (-1)
497 #endif
498
499 /* Return the index of the given mode in the multiply and divide cost tables.  */
500 #define MODE_INDEX(mode) \
501 ((mode) == QImode ? 0 \
502 : (mode) == HImode ? 1 \
503 : (mode) == SImode ? 2 \
504 : (mode) == DImode ? 3 \
505 : 4)
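/* For example, MODE_INDEX (SImode) is 2, selecting the SImode slot of the
   five-element multiply and divide cost arrays below, which are ordered
   {QImode, HImode, SImode, DImode, other}; any other mode maps to the final
   "other" slot (index 4).  */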
506
507 /* Processor costs (relative to an add) */
508 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
509 #define COSTS_N_BYTES(N) ((N) * 2)
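/* Worked example under the assumption above: COSTS_N_INSNS (1) == 4 and
   COSTS_N_BYTES (2) == 4, so a 2-byte instruction is weighted the same as
   one addition when tuning for size; ix86_size_cost below therefore
   expresses its entries in (approximate) instruction bytes rather than
   latencies.  */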
510
511 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
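/* The stringop_algs entries in the cost tables below describe memcpy and
   memset expansion strategies: an algorithm for block sizes unknown at
   compile time, followed by {max_size, algorithm} pairs for known sizes,
   where a max_size of -1 covers all remaining sizes.  DUMMY_STRINGOP_ALGS
   simply falls back to a library call for every size; the 32-bit-only
   processors below use it for their second entry (presumably the 64-bit
   variant).  */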
512
513 const
514 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
515 COSTS_N_BYTES (2), /* cost of an add instruction */
516 COSTS_N_BYTES (3), /* cost of a lea instruction */
517 COSTS_N_BYTES (2), /* variable shift costs */
518 COSTS_N_BYTES (3), /* constant shift costs */
519 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
520 COSTS_N_BYTES (3), /* HI */
521 COSTS_N_BYTES (3), /* SI */
522 COSTS_N_BYTES (3), /* DI */
523 COSTS_N_BYTES (5)}, /* other */
524 0, /* cost of multiply per each bit set */
525 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
526 COSTS_N_BYTES (3), /* HI */
527 COSTS_N_BYTES (3), /* SI */
528 COSTS_N_BYTES (3), /* DI */
529 COSTS_N_BYTES (5)}, /* other */
530 COSTS_N_BYTES (3), /* cost of movsx */
531 COSTS_N_BYTES (3), /* cost of movzx */
532 0, /* "large" insn */
533 2, /* MOVE_RATIO */
534 2, /* cost for loading QImode using movzbl */
535 {2, 2, 2}, /* cost of loading integer registers
536 in QImode, HImode and SImode.
537 Relative to reg-reg move (2). */
538 {2, 2, 2}, /* cost of storing integer registers */
539 2, /* cost of reg,reg fld/fst */
540 {2, 2, 2}, /* cost of loading fp registers
541 in SFmode, DFmode and XFmode */
542 {2, 2, 2}, /* cost of storing fp registers
543 in SFmode, DFmode and XFmode */
544 3, /* cost of moving MMX register */
545 {3, 3}, /* cost of loading MMX registers
546 in SImode and DImode */
547 {3, 3}, /* cost of storing MMX registers
548 in SImode and DImode */
549 3, /* cost of moving SSE register */
550 {3, 3, 3}, /* cost of loading SSE registers
551 in SImode, DImode and TImode */
552 {3, 3, 3}, /* cost of storing SSE registers
553 in SImode, DImode and TImode */
554 3, /* MMX or SSE register to integer */
555 0, /* size of l1 cache */
556 0, /* size of l2 cache */
557 0, /* size of prefetch block */
558 0, /* number of parallel prefetches */
559 2, /* Branch cost */
560 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
561 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
562 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
563 COSTS_N_BYTES (2), /* cost of FABS instruction. */
564 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
565 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 1, /* scalar_stmt_cost. */
571 1, /* scalar load_cost. */
572 1, /* scalar_store_cost. */
573 1, /* vec_stmt_cost. */
574 1, /* vec_to_scalar_cost. */
575 1, /* scalar_to_vec_cost. */
576 1, /* vec_align_load_cost. */
577 1, /* vec_unalign_load_cost. */
578 1, /* vec_store_cost. */
579 1, /* cond_taken_branch_cost. */
580 1, /* cond_not_taken_branch_cost. */
581 };
582
583 /* Processor costs (relative to an add) */
584 static const
585 struct processor_costs i386_cost = { /* 386 specific costs */
586 COSTS_N_INSNS (1), /* cost of an add instruction */
587 COSTS_N_INSNS (1), /* cost of a lea instruction */
588 COSTS_N_INSNS (3), /* variable shift costs */
589 COSTS_N_INSNS (2), /* constant shift costs */
590 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
591 COSTS_N_INSNS (6), /* HI */
592 COSTS_N_INSNS (6), /* SI */
593 COSTS_N_INSNS (6), /* DI */
594 COSTS_N_INSNS (6)}, /* other */
595 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
596 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
597 COSTS_N_INSNS (23), /* HI */
598 COSTS_N_INSNS (23), /* SI */
599 COSTS_N_INSNS (23), /* DI */
600 COSTS_N_INSNS (23)}, /* other */
601 COSTS_N_INSNS (3), /* cost of movsx */
602 COSTS_N_INSNS (2), /* cost of movzx */
603 15, /* "large" insn */
604 3, /* MOVE_RATIO */
605 4, /* cost for loading QImode using movzbl */
606 {2, 4, 2}, /* cost of loading integer registers
607 in QImode, HImode and SImode.
608 Relative to reg-reg move (2). */
609 {2, 4, 2}, /* cost of storing integer registers */
610 2, /* cost of reg,reg fld/fst */
611 {8, 8, 8}, /* cost of loading fp registers
612 in SFmode, DFmode and XFmode */
613 {8, 8, 8}, /* cost of storing fp registers
614 in SFmode, DFmode and XFmode */
615 2, /* cost of moving MMX register */
616 {4, 8}, /* cost of loading MMX registers
617 in SImode and DImode */
618 {4, 8}, /* cost of storing MMX registers
619 in SImode and DImode */
620 2, /* cost of moving SSE register */
621 {4, 8, 16}, /* cost of loading SSE registers
622 in SImode, DImode and TImode */
623 {4, 8, 16}, /* cost of storing SSE registers
624 in SImode, DImode and TImode */
625 3, /* MMX or SSE register to integer */
626 0, /* size of l1 cache */
627 0, /* size of l2 cache */
628 0, /* size of prefetch block */
629 0, /* number of parallel prefetches */
630 1, /* Branch cost */
631 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
632 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
633 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
634 COSTS_N_INSNS (22), /* cost of FABS instruction. */
635 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
636 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 1, /* scalar_stmt_cost. */
642 1, /* scalar load_cost. */
643 1, /* scalar_store_cost. */
644 1, /* vec_stmt_cost. */
645 1, /* vec_to_scalar_cost. */
646 1, /* scalar_to_vec_cost. */
647 1, /* vec_align_load_cost. */
648 2, /* vec_unalign_load_cost. */
649 1, /* vec_store_cost. */
650 3, /* cond_taken_branch_cost. */
651 1, /* cond_not_taken_branch_cost. */
652 };
653
654 static const
655 struct processor_costs i486_cost = { /* 486 specific costs */
656 COSTS_N_INSNS (1), /* cost of an add instruction */
657 COSTS_N_INSNS (1), /* cost of a lea instruction */
658 COSTS_N_INSNS (3), /* variable shift costs */
659 COSTS_N_INSNS (2), /* constant shift costs */
660 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
661 COSTS_N_INSNS (12), /* HI */
662 COSTS_N_INSNS (12), /* SI */
663 COSTS_N_INSNS (12), /* DI */
664 COSTS_N_INSNS (12)}, /* other */
665 1, /* cost of multiply per each bit set */
666 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
667 COSTS_N_INSNS (40), /* HI */
668 COSTS_N_INSNS (40), /* SI */
669 COSTS_N_INSNS (40), /* DI */
670 COSTS_N_INSNS (40)}, /* other */
671 COSTS_N_INSNS (3), /* cost of movsx */
672 COSTS_N_INSNS (2), /* cost of movzx */
673 15, /* "large" insn */
674 3, /* MOVE_RATIO */
675 4, /* cost for loading QImode using movzbl */
676 {2, 4, 2}, /* cost of loading integer registers
677 in QImode, HImode and SImode.
678 Relative to reg-reg move (2). */
679 {2, 4, 2}, /* cost of storing integer registers */
680 2, /* cost of reg,reg fld/fst */
681 {8, 8, 8}, /* cost of loading fp registers
682 in SFmode, DFmode and XFmode */
683 {8, 8, 8}, /* cost of storing fp registers
684 in SFmode, DFmode and XFmode */
685 2, /* cost of moving MMX register */
686 {4, 8}, /* cost of loading MMX registers
687 in SImode and DImode */
688 {4, 8}, /* cost of storing MMX registers
689 in SImode and DImode */
690 2, /* cost of moving SSE register */
691 {4, 8, 16}, /* cost of loading SSE registers
692 in SImode, DImode and TImode */
693 {4, 8, 16}, /* cost of storing SSE registers
694 in SImode, DImode and TImode */
695 3, /* MMX or SSE register to integer */
696 4, /* size of l1 cache. 486 has 8kB cache
697 shared for code and data, so 4kB is
698 not really precise. */
699 4, /* size of l2 cache */
700 0, /* size of prefetch block */
701 0, /* number of parallel prefetches */
702 1, /* Branch cost */
703 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
704 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
705 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
706 COSTS_N_INSNS (3), /* cost of FABS instruction. */
707 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
708 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
712 DUMMY_STRINGOP_ALGS},
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
725
726 static const
727 struct processor_costs pentium_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (1), /* cost of a lea instruction */
730 COSTS_N_INSNS (4), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (11), /* HI */
734 COSTS_N_INSNS (11), /* SI */
735 COSTS_N_INSNS (11), /* DI */
736 COSTS_N_INSNS (11)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (25), /* HI */
740 COSTS_N_INSNS (25), /* SI */
741 COSTS_N_INSNS (25), /* DI */
742 COSTS_N_INSNS (25)}, /* other */
743 COSTS_N_INSNS (3), /* cost of movsx */
744 COSTS_N_INSNS (2), /* cost of movzx */
745 8, /* "large" insn */
746 6, /* MOVE_RATIO */
747 6, /* cost for loading QImode using movzbl */
748 {2, 4, 2}, /* cost of loading integer registers
749 in QImode, HImode and SImode.
750 Relative to reg-reg move (2). */
751 {2, 4, 2}, /* cost of storing integer registers */
752 2, /* cost of reg,reg fld/fst */
753 {2, 2, 6}, /* cost of loading fp registers
754 in SFmode, DFmode and XFmode */
755 {4, 4, 6}, /* cost of storing fp registers
756 in SFmode, DFmode and XFmode */
757 8, /* cost of moving MMX register */
758 {8, 8}, /* cost of loading MMX registers
759 in SImode and DImode */
760 {8, 8}, /* cost of storing MMX registers
761 in SImode and DImode */
762 2, /* cost of moving SSE register */
763 {4, 8, 16}, /* cost of loading SSE registers
764 in SImode, DImode and TImode */
765 {4, 8, 16}, /* cost of storing SSE registers
766 in SImode, DImode and TImode */
767 3, /* MMX or SSE register to integer */
768 8, /* size of l1 cache. */
769 8, /* size of l2 cache */
770 0, /* size of prefetch block */
771 0, /* number of parallel prefetches */
772 2, /* Branch cost */
773 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
774 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
775 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
776 COSTS_N_INSNS (1), /* cost of FABS instruction. */
777 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
778 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
779 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
780 DUMMY_STRINGOP_ALGS},
781 {{libcall, {{-1, rep_prefix_4_byte}}},
782 DUMMY_STRINGOP_ALGS},
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 static const
797 struct processor_costs pentiumpro_cost = {
798 COSTS_N_INSNS (1), /* cost of an add instruction */
799 COSTS_N_INSNS (1), /* cost of a lea instruction */
800 COSTS_N_INSNS (1), /* variable shift costs */
801 COSTS_N_INSNS (1), /* constant shift costs */
802 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
803 COSTS_N_INSNS (4), /* HI */
804 COSTS_N_INSNS (4), /* SI */
805 COSTS_N_INSNS (4), /* DI */
806 COSTS_N_INSNS (4)}, /* other */
807 0, /* cost of multiply per each bit set */
808 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
809 COSTS_N_INSNS (17), /* HI */
810 COSTS_N_INSNS (17), /* SI */
811 COSTS_N_INSNS (17), /* DI */
812 COSTS_N_INSNS (17)}, /* other */
813 COSTS_N_INSNS (1), /* cost of movsx */
814 COSTS_N_INSNS (1), /* cost of movzx */
815 8, /* "large" insn */
816 6, /* MOVE_RATIO */
817 2, /* cost for loading QImode using movzbl */
818 {4, 4, 4}, /* cost of loading integer registers
819 in QImode, HImode and SImode.
820 Relative to reg-reg move (2). */
821 {2, 2, 2}, /* cost of storing integer registers */
822 2, /* cost of reg,reg fld/fst */
823 {2, 2, 6}, /* cost of loading fp registers
824 in SFmode, DFmode and XFmode */
825 {4, 4, 6}, /* cost of storing fp registers
826 in SFmode, DFmode and XFmode */
827 2, /* cost of moving MMX register */
828 {2, 2}, /* cost of loading MMX registers
829 in SImode and DImode */
830 {2, 2}, /* cost of storing MMX registers
831 in SImode and DImode */
832 2, /* cost of moving SSE register */
833 {2, 2, 8}, /* cost of loading SSE registers
834 in SImode, DImode and TImode */
835 {2, 2, 8}, /* cost of storing SSE registers
836 in SImode, DImode and TImode */
837 3, /* MMX or SSE register to integer */
838 8, /* size of l1 cache. */
839 256, /* size of l2 cache */
840 32, /* size of prefetch block */
841 6, /* number of parallel prefetches */
842 2, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (2), /* cost of FABS instruction. */
847 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
849   /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
850      (we ensure the alignment).  For small blocks an inline loop is still a
851      noticeable win; for bigger blocks either rep movsl or rep movsb is the
852      way to go.  Rep movsb apparently has a more expensive startup time in the
853      CPU, but after 4K the difference is down in the noise.  */
854 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
855 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
856 DUMMY_STRINGOP_ALGS},
857 {{rep_prefix_4_byte, {{1024, unrolled_loop},
858 {8192, rep_prefix_4_byte}, {-1, libcall}}},
859 DUMMY_STRINGOP_ALGS},
860 1, /* scalar_stmt_cost. */
861 1, /* scalar load_cost. */
862 1, /* scalar_store_cost. */
863 1, /* vec_stmt_cost. */
864 1, /* vec_to_scalar_cost. */
865 1, /* scalar_to_vec_cost. */
866 1, /* vec_align_load_cost. */
867 2, /* vec_unalign_load_cost. */
868 1, /* vec_store_cost. */
869 3, /* cond_taken_branch_cost. */
870 1, /* cond_not_taken_branch_cost. */
871 };
872
873 static const
874 struct processor_costs geode_cost = {
875 COSTS_N_INSNS (1), /* cost of an add instruction */
876 COSTS_N_INSNS (1), /* cost of a lea instruction */
877 COSTS_N_INSNS (2), /* variable shift costs */
878 COSTS_N_INSNS (1), /* constant shift costs */
879 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
880 COSTS_N_INSNS (4), /* HI */
881 COSTS_N_INSNS (7), /* SI */
882 COSTS_N_INSNS (7), /* DI */
883 COSTS_N_INSNS (7)}, /* other */
884 0, /* cost of multiply per each bit set */
885 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
886 COSTS_N_INSNS (23), /* HI */
887 COSTS_N_INSNS (39), /* SI */
888 COSTS_N_INSNS (39), /* DI */
889 COSTS_N_INSNS (39)}, /* other */
890 COSTS_N_INSNS (1), /* cost of movsx */
891 COSTS_N_INSNS (1), /* cost of movzx */
892 8, /* "large" insn */
893 4, /* MOVE_RATIO */
894 1, /* cost for loading QImode using movzbl */
895 {1, 1, 1}, /* cost of loading integer registers
896 in QImode, HImode and SImode.
897 Relative to reg-reg move (2). */
898 {1, 1, 1}, /* cost of storing integer registers */
899 1, /* cost of reg,reg fld/fst */
900 {1, 1, 1}, /* cost of loading fp registers
901 in SFmode, DFmode and XFmode */
902 {4, 6, 6}, /* cost of storing fp registers
903 in SFmode, DFmode and XFmode */
904
905 1, /* cost of moving MMX register */
906 {1, 1}, /* cost of loading MMX registers
907 in SImode and DImode */
908 {1, 1}, /* cost of storing MMX registers
909 in SImode and DImode */
910 1, /* cost of moving SSE register */
911 {1, 1, 1}, /* cost of loading SSE registers
912 in SImode, DImode and TImode */
913 {1, 1, 1}, /* cost of storing SSE registers
914 in SImode, DImode and TImode */
915 1, /* MMX or SSE register to integer */
916 64, /* size of l1 cache. */
917 128, /* size of l2 cache. */
918 32, /* size of prefetch block */
919 1, /* number of parallel prefetches */
920 1, /* Branch cost */
921 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
922 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
923 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
924 COSTS_N_INSNS (1), /* cost of FABS instruction. */
925 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
926 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
930 DUMMY_STRINGOP_ALGS},
931 1, /* scalar_stmt_cost. */
932 1, /* scalar load_cost. */
933 1, /* scalar_store_cost. */
934 1, /* vec_stmt_cost. */
935 1, /* vec_to_scalar_cost. */
936 1, /* scalar_to_vec_cost. */
937 1, /* vec_align_load_cost. */
938 2, /* vec_unalign_load_cost. */
939 1, /* vec_store_cost. */
940 3, /* cond_taken_branch_cost. */
941 1, /* cond_not_taken_branch_cost. */
942 };
943
944 static const
945 struct processor_costs k6_cost = {
946 COSTS_N_INSNS (1), /* cost of an add instruction */
947 COSTS_N_INSNS (2), /* cost of a lea instruction */
948 COSTS_N_INSNS (1), /* variable shift costs */
949 COSTS_N_INSNS (1), /* constant shift costs */
950 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
951 COSTS_N_INSNS (3), /* HI */
952 COSTS_N_INSNS (3), /* SI */
953 COSTS_N_INSNS (3), /* DI */
954 COSTS_N_INSNS (3)}, /* other */
955 0, /* cost of multiply per each bit set */
956 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
957 COSTS_N_INSNS (18), /* HI */
958 COSTS_N_INSNS (18), /* SI */
959 COSTS_N_INSNS (18), /* DI */
960 COSTS_N_INSNS (18)}, /* other */
961 COSTS_N_INSNS (2), /* cost of movsx */
962 COSTS_N_INSNS (2), /* cost of movzx */
963 8, /* "large" insn */
964 4, /* MOVE_RATIO */
965 3, /* cost for loading QImode using movzbl */
966 {4, 5, 4}, /* cost of loading integer registers
967 in QImode, HImode and SImode.
968 Relative to reg-reg move (2). */
969 {2, 3, 2}, /* cost of storing integer registers */
970 4, /* cost of reg,reg fld/fst */
971 {6, 6, 6}, /* cost of loading fp registers
972 in SFmode, DFmode and XFmode */
973 {4, 4, 4}, /* cost of storing fp registers
974 in SFmode, DFmode and XFmode */
975 2, /* cost of moving MMX register */
976 {2, 2}, /* cost of loading MMX registers
977 in SImode and DImode */
978 {2, 2}, /* cost of storing MMX registers
979 in SImode and DImode */
980 2, /* cost of moving SSE register */
981 {2, 2, 8}, /* cost of loading SSE registers
982 in SImode, DImode and TImode */
983 {2, 2, 8}, /* cost of storing SSE registers
984 in SImode, DImode and TImode */
985 6, /* MMX or SSE register to integer */
986 32, /* size of l1 cache. */
987 32, /* size of l2 cache. Some models
988 have integrated l2 cache, but
989 optimizing for k6 is not important
990 enough to worry about that. */
991 32, /* size of prefetch block */
992 1, /* number of parallel prefetches */
993 1, /* Branch cost */
994 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
995 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
996 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
997 COSTS_N_INSNS (2), /* cost of FABS instruction. */
998 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
999 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1003 DUMMY_STRINGOP_ALGS},
1004 1, /* scalar_stmt_cost. */
1005 1, /* scalar load_cost. */
1006 1, /* scalar_store_cost. */
1007 1, /* vec_stmt_cost. */
1008 1, /* vec_to_scalar_cost. */
1009 1, /* scalar_to_vec_cost. */
1010 1, /* vec_align_load_cost. */
1011 2, /* vec_unalign_load_cost. */
1012 1, /* vec_store_cost. */
1013 3, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 static const
1018 struct processor_costs athlon_cost = {
1019 COSTS_N_INSNS (1), /* cost of an add instruction */
1020 COSTS_N_INSNS (2), /* cost of a lea instruction */
1021 COSTS_N_INSNS (1), /* variable shift costs */
1022 COSTS_N_INSNS (1), /* constant shift costs */
1023 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1024 COSTS_N_INSNS (5), /* HI */
1025 COSTS_N_INSNS (5), /* SI */
1026 COSTS_N_INSNS (5), /* DI */
1027 COSTS_N_INSNS (5)}, /* other */
1028 0, /* cost of multiply per each bit set */
1029 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1030 COSTS_N_INSNS (26), /* HI */
1031 COSTS_N_INSNS (42), /* SI */
1032 COSTS_N_INSNS (74), /* DI */
1033 COSTS_N_INSNS (74)}, /* other */
1034 COSTS_N_INSNS (1), /* cost of movsx */
1035 COSTS_N_INSNS (1), /* cost of movzx */
1036 8, /* "large" insn */
1037 9, /* MOVE_RATIO */
1038 4, /* cost for loading QImode using movzbl */
1039 {3, 4, 3}, /* cost of loading integer registers
1040 in QImode, HImode and SImode.
1041 Relative to reg-reg move (2). */
1042 {3, 4, 3}, /* cost of storing integer registers */
1043 4, /* cost of reg,reg fld/fst */
1044 {4, 4, 12}, /* cost of loading fp registers
1045 in SFmode, DFmode and XFmode */
1046 {6, 6, 8}, /* cost of storing fp registers
1047 in SFmode, DFmode and XFmode */
1048 2, /* cost of moving MMX register */
1049 {4, 4}, /* cost of loading MMX registers
1050 in SImode and DImode */
1051 {4, 4}, /* cost of storing MMX registers
1052 in SImode and DImode */
1053 2, /* cost of moving SSE register */
1054 {4, 4, 6}, /* cost of loading SSE registers
1055 in SImode, DImode and TImode */
1056 {4, 4, 5}, /* cost of storing SSE registers
1057 in SImode, DImode and TImode */
1058 5, /* MMX or SSE register to integer */
1059 64, /* size of l1 cache. */
1060 256, /* size of l2 cache. */
1061 64, /* size of prefetch block */
1062 6, /* number of parallel prefetches */
1063 5, /* Branch cost */
1064 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1065 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1066 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1067 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1068 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1069 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1070   /* For some reason, Athlon handles the REP prefix (relative to loops) better
1071      than K8 does.  Alignment becomes important after 8 bytes for memcpy and
1072      128 bytes for memset.  */
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1076 DUMMY_STRINGOP_ALGS},
1077 1, /* scalar_stmt_cost. */
1078 1, /* scalar load_cost. */
1079 1, /* scalar_store_cost. */
1080 1, /* vec_stmt_cost. */
1081 1, /* vec_to_scalar_cost. */
1082 1, /* scalar_to_vec_cost. */
1083 1, /* vec_align_load_cost. */
1084 2, /* vec_unalign_load_cost. */
1085 1, /* vec_store_cost. */
1086 3, /* cond_taken_branch_cost. */
1087 1, /* cond_not_taken_branch_cost. */
1088 };
1089
1090 static const
1091 struct processor_costs k8_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (2), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (3), /* SI */
1099 COSTS_N_INSNS (4), /* DI */
1100 COSTS_N_INSNS (5)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (26), /* HI */
1104 COSTS_N_INSNS (42), /* SI */
1105 COSTS_N_INSNS (74), /* DI */
1106 COSTS_N_INSNS (74)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {3, 4, 3}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {3, 4, 3}, /* cost of storing integer registers */
1116 4, /* cost of reg,reg fld/fst */
1117 {4, 4, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {6, 6, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {3, 3}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 3, 6}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 5}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 5, /* MMX or SSE register to integer */
1132 64, /* size of l1 cache. */
1133 512, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 3, /* Branch cost */
1142 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1149      small blocks it is better to use a loop.  For large blocks, libcall can
1150      do nontemporal accesses and beat inline expansion considerably.  */
1151 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1152 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 {{libcall, {{8, loop}, {24, unrolled_loop},
1154 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1155 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1156 4, /* scalar_stmt_cost. */
1157 2, /* scalar load_cost. */
1158 2, /* scalar_store_cost. */
1159 5, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 2, /* vec_align_load_cost. */
1163 3, /* vec_unalign_load_cost. */
1164 3, /* vec_store_cost. */
1165 3, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1168
1169 struct processor_costs amdfam10_cost = {
1170 COSTS_N_INSNS (1), /* cost of an add instruction */
1171 COSTS_N_INSNS (2), /* cost of a lea instruction */
1172 COSTS_N_INSNS (1), /* variable shift costs */
1173 COSTS_N_INSNS (1), /* constant shift costs */
1174 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1175 COSTS_N_INSNS (4), /* HI */
1176 COSTS_N_INSNS (3), /* SI */
1177 COSTS_N_INSNS (4), /* DI */
1178 COSTS_N_INSNS (5)}, /* other */
1179 0, /* cost of multiply per each bit set */
1180 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1181 COSTS_N_INSNS (35), /* HI */
1182 COSTS_N_INSNS (51), /* SI */
1183 COSTS_N_INSNS (83), /* DI */
1184 COSTS_N_INSNS (83)}, /* other */
1185 COSTS_N_INSNS (1), /* cost of movsx */
1186 COSTS_N_INSNS (1), /* cost of movzx */
1187 8, /* "large" insn */
1188 9, /* MOVE_RATIO */
1189 4, /* cost for loading QImode using movzbl */
1190 {3, 4, 3}, /* cost of loading integer registers
1191 in QImode, HImode and SImode.
1192 Relative to reg-reg move (2). */
1193 {3, 4, 3}, /* cost of storing integer registers */
1194 4, /* cost of reg,reg fld/fst */
1195 {4, 4, 12}, /* cost of loading fp registers
1196 in SFmode, DFmode and XFmode */
1197 {6, 6, 8}, /* cost of storing fp registers
1198 in SFmode, DFmode and XFmode */
1199 2, /* cost of moving MMX register */
1200 {3, 3}, /* cost of loading MMX registers
1201 in SImode and DImode */
1202 {4, 4}, /* cost of storing MMX registers
1203 in SImode and DImode */
1204 2, /* cost of moving SSE register */
1205 {4, 4, 3}, /* cost of loading SSE registers
1206 in SImode, DImode and TImode */
1207 {4, 4, 5}, /* cost of storing SSE registers
1208 in SImode, DImode and TImode */
1209 3, /* MMX or SSE register to integer */
1210 /* On K8:
1211 MOVD reg64, xmmreg Double FSTORE 4
1212 MOVD reg32, xmmreg Double FSTORE 4
1213 On AMDFAM10:
1214 MOVD reg64, xmmreg Double FADD 3
1215 1/1 1/1
1216 MOVD reg32, xmmreg Double FADD 3
1217 1/1 1/1 */
1218 64, /* size of l1 cache. */
1219 512, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221 /* New AMD processors never drop prefetches; if they cannot be performed
1222 immediately, they are queued. We set number of simultaneous prefetches
1223 to a large constant to reflect this (it probably is not a good idea not
1224 to limit number of prefetches at all, as their execution also takes some
1225 time). */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234
1235   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1236      very small blocks it is better to use a loop.  For large blocks, libcall can
1237      do nontemporal accesses and beat inline expansion considerably.  */
1238 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1239 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1240 {{libcall, {{8, loop}, {24, unrolled_loop},
1241 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1242 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1243 4, /* scalar_stmt_cost. */
1244 2, /* scalar load_cost. */
1245 2, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 2, /* vec_align_load_cost. */
1250 2, /* vec_unalign_load_cost. */
1251 2, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 struct processor_costs bdver1_cost = {
1257 COSTS_N_INSNS (1), /* cost of an add instruction */
1258 COSTS_N_INSNS (1), /* cost of a lea instruction */
1259 COSTS_N_INSNS (1), /* variable shift costs */
1260 COSTS_N_INSNS (1), /* constant shift costs */
1261 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1262 COSTS_N_INSNS (4), /* HI */
1263 COSTS_N_INSNS (4), /* SI */
1264 COSTS_N_INSNS (6), /* DI */
1265 COSTS_N_INSNS (6)}, /* other */
1266 0, /* cost of multiply per each bit set */
1267 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1268 COSTS_N_INSNS (35), /* HI */
1269 COSTS_N_INSNS (51), /* SI */
1270 COSTS_N_INSNS (83), /* DI */
1271 COSTS_N_INSNS (83)}, /* other */
1272 COSTS_N_INSNS (1), /* cost of movsx */
1273 COSTS_N_INSNS (1), /* cost of movzx */
1274 8, /* "large" insn */
1275 9, /* MOVE_RATIO */
1276 4, /* cost for loading QImode using movzbl */
1277 {5, 5, 4}, /* cost of loading integer registers
1278 in QImode, HImode and SImode.
1279 Relative to reg-reg move (2). */
1280 {4, 4, 4}, /* cost of storing integer registers */
1281 2, /* cost of reg,reg fld/fst */
1282 {5, 5, 12}, /* cost of loading fp registers
1283 in SFmode, DFmode and XFmode */
1284 {4, 4, 8}, /* cost of storing fp registers
1285 in SFmode, DFmode and XFmode */
1286 2, /* cost of moving MMX register */
1287 {4, 4}, /* cost of loading MMX registers
1288 in SImode and DImode */
1289 {4, 4}, /* cost of storing MMX registers
1290 in SImode and DImode */
1291 2, /* cost of moving SSE register */
1292 {4, 4, 4}, /* cost of loading SSE registers
1293 in SImode, DImode and TImode */
1294 {4, 4, 4}, /* cost of storing SSE registers
1295 in SImode, DImode and TImode */
1296 2, /* MMX or SSE register to integer */
1297 /* On K8:
1298 MOVD reg64, xmmreg Double FSTORE 4
1299 MOVD reg32, xmmreg Double FSTORE 4
1300 On AMDFAM10:
1301 MOVD reg64, xmmreg Double FADD 3
1302 1/1 1/1
1303 MOVD reg32, xmmreg Double FADD 3
1304 1/1 1/1 */
1305 16, /* size of l1 cache. */
1306 2048, /* size of l2 cache. */
1307 64, /* size of prefetch block */
1308 /* New AMD processors never drop prefetches; if they cannot be performed
1309 immediately, they are queued. We set number of simultaneous prefetches
1310 to a large constant to reflect this (it probably is not a good idea not
1311 to limit number of prefetches at all, as their execution also takes some
1312 time). */
1313 100, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321
1322   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1323      very small blocks it is better to use a loop.  For large blocks, libcall
1324      can do nontemporal accesses and beat inline expansion considerably.  */
1325 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1326 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1327 {{libcall, {{8, loop}, {24, unrolled_loop},
1328 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1329 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 2, /* cond_taken_branch_cost. */
1340 1, /* cond_not_taken_branch_cost. */
1341 };
1342
1343 struct processor_costs bdver2_cost = {
1344 COSTS_N_INSNS (1), /* cost of an add instruction */
1345 COSTS_N_INSNS (1), /* cost of a lea instruction */
1346 COSTS_N_INSNS (1), /* variable shift costs */
1347 COSTS_N_INSNS (1), /* constant shift costs */
1348 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1349 COSTS_N_INSNS (4), /* HI */
1350 COSTS_N_INSNS (4), /* SI */
1351 COSTS_N_INSNS (6), /* DI */
1352 COSTS_N_INSNS (6)}, /* other */
1353 0, /* cost of multiply per each bit set */
1354 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1355 COSTS_N_INSNS (35), /* HI */
1356 COSTS_N_INSNS (51), /* SI */
1357 COSTS_N_INSNS (83), /* DI */
1358 COSTS_N_INSNS (83)}, /* other */
1359 COSTS_N_INSNS (1), /* cost of movsx */
1360 COSTS_N_INSNS (1), /* cost of movzx */
1361 8, /* "large" insn */
1362 9, /* MOVE_RATIO */
1363 4, /* cost for loading QImode using movzbl */
1364 {5, 5, 4}, /* cost of loading integer registers
1365 in QImode, HImode and SImode.
1366 Relative to reg-reg move (2). */
1367 {4, 4, 4}, /* cost of storing integer registers */
1368 2, /* cost of reg,reg fld/fst */
1369 {5, 5, 12}, /* cost of loading fp registers
1370 in SFmode, DFmode and XFmode */
1371 {4, 4, 8}, /* cost of storing fp registers
1372 in SFmode, DFmode and XFmode */
1373 2, /* cost of moving MMX register */
1374 {4, 4}, /* cost of loading MMX registers
1375 in SImode and DImode */
1376 {4, 4}, /* cost of storing MMX registers
1377 in SImode and DImode */
1378 2, /* cost of moving SSE register */
1379 {4, 4, 4}, /* cost of loading SSE registers
1380 in SImode, DImode and TImode */
1381 {4, 4, 4}, /* cost of storing SSE registers
1382 in SImode, DImode and TImode */
1383 2, /* MMX or SSE register to integer */
1384 /* On K8:
1385 MOVD reg64, xmmreg Double FSTORE 4
1386 MOVD reg32, xmmreg Double FSTORE 4
1387 On AMDFAM10:
1388 MOVD reg64, xmmreg Double FADD 3
1389 1/1 1/1
1390 MOVD reg32, xmmreg Double FADD 3
1391 1/1 1/1 */
1392 16, /* size of l1 cache. */
1393 2048, /* size of l2 cache. */
1394 64, /* size of prefetch block */
1395 /* New AMD processors never drop prefetches; if they cannot be performed
1396 immediately, they are queued. We set number of simultaneous prefetches
1397 to a large constant to reflect this (it probably is not a good idea not
1398 to limit number of prefetches at all, as their execution also takes some
1399 time). */
1400 100, /* number of parallel prefetches */
1401 2, /* Branch cost */
1402 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1403 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1404 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1405 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1406 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1407 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408
1409   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1410      very small blocks it is better to use a loop.  For large blocks, libcall
1411      can do nontemporal accesses and beat inline expansion considerably.  */
1412 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1413 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1414 {{libcall, {{8, loop}, {24, unrolled_loop},
1415 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1416 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 struct processor_costs btver1_cost = {
1431 COSTS_N_INSNS (1), /* cost of an add instruction */
1432 COSTS_N_INSNS (2), /* cost of a lea instruction */
1433 COSTS_N_INSNS (1), /* variable shift costs */
1434 COSTS_N_INSNS (1), /* constant shift costs */
1435 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1436 COSTS_N_INSNS (4), /* HI */
1437 COSTS_N_INSNS (3), /* SI */
1438 COSTS_N_INSNS (4), /* DI */
1439 COSTS_N_INSNS (5)}, /* other */
1440 0, /* cost of multiply per each bit set */
1441 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1442 COSTS_N_INSNS (35), /* HI */
1443 COSTS_N_INSNS (51), /* SI */
1444 COSTS_N_INSNS (83), /* DI */
1445 COSTS_N_INSNS (83)}, /* other */
1446 COSTS_N_INSNS (1), /* cost of movsx */
1447 COSTS_N_INSNS (1), /* cost of movzx */
1448 8, /* "large" insn */
1449 9, /* MOVE_RATIO */
1450 4, /* cost for loading QImode using movzbl */
1451 {3, 4, 3}, /* cost of loading integer registers
1452 in QImode, HImode and SImode.
1453 Relative to reg-reg move (2). */
1454 {3, 4, 3}, /* cost of storing integer registers */
1455 4, /* cost of reg,reg fld/fst */
1456 {4, 4, 12}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode */
1458 {6, 6, 8}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode */
1460 2, /* cost of moving MMX register */
1461 {3, 3}, /* cost of loading MMX registers
1462 in SImode and DImode */
1463 {4, 4}, /* cost of storing MMX registers
1464 in SImode and DImode */
1465 2, /* cost of moving SSE register */
1466 {4, 4, 3}, /* cost of loading SSE registers
1467 in SImode, DImode and TImode */
1468 {4, 4, 5}, /* cost of storing SSE registers
1469 in SImode, DImode and TImode */
1470 3, /* MMX or SSE register to integer */
1471 /* On K8:
1472 MOVD reg64, xmmreg Double FSTORE 4
1473 MOVD reg32, xmmreg Double FSTORE 4
1474 On AMDFAM10:
1475 MOVD reg64, xmmreg Double FADD 3
1476 1/1 1/1
1477 MOVD reg32, xmmreg Double FADD 3
1478 1/1 1/1 */
1479 32, /* size of l1 cache. */
1480 512, /* size of l2 cache. */
1481 64, /* size of prefetch block */
1482 100, /* number of parallel prefetches */
1483 2, /* Branch cost */
1484 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1485 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1486 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1489 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490
1491   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1492      very small blocks it is better to use a loop.  For large blocks, libcall can
1493      do nontemporal accesses and beat inline expansion considerably.  */
1494 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1495 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1496 {{libcall, {{8, loop}, {24, unrolled_loop},
1497 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1498 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1499 4, /* scalar_stmt_cost. */
1500 2, /* scalar load_cost. */
1501 2, /* scalar_store_cost. */
1502 6, /* vec_stmt_cost. */
1503 0, /* vec_to_scalar_cost. */
1504 2, /* scalar_to_vec_cost. */
1505 2, /* vec_align_load_cost. */
1506 2, /* vec_unalign_load_cost. */
1507 2, /* vec_store_cost. */
1508 2, /* cond_taken_branch_cost. */
1509 1, /* cond_not_taken_branch_cost. */
1510 };
1511
1512 struct processor_costs btver2_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (2), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (4), /* HI */
1519 COSTS_N_INSNS (3), /* SI */
1520 COSTS_N_INSNS (4), /* DI */
1521 COSTS_N_INSNS (5)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (35), /* HI */
1525 COSTS_N_INSNS (51), /* SI */
1526 COSTS_N_INSNS (83), /* DI */
1527 COSTS_N_INSNS (83)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 8, /* "large" insn */
1531 9, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {3, 4, 3}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {3, 4, 3}, /* cost of storing integer registers */
1537 4, /* cost of reg,reg fld/fst */
1538 {4, 4, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {6, 6, 8}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {3, 3}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {4, 4}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 2, /* cost of moving SSE register */
1548 {4, 4, 3}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {4, 4, 5}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 3, /* MMX or SSE register to integer */
1553 /* On K8:
1554 MOVD reg64, xmmreg Double FSTORE 4
1555 MOVD reg32, xmmreg Double FSTORE 4
1556 On AMDFAM10:
1557 MOVD reg64, xmmreg Double FADD 3
1558 1/1 1/1
1559 MOVD reg32, xmmreg Double FADD 3
1560 1/1 1/1 */
1561 32, /* size of l1 cache. */
1562 2048, /* size of l2 cache. */
1563 64, /* size of prefetch block */
1564 100, /* number of parallel prefetches */
1565 2, /* Branch cost */
1566 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1567 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1568 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1569 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1570 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1571 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1572
1573 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1574 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1575 {{libcall, {{8, loop}, {24, unrolled_loop},
1576 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1577 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1578 4, /* scalar_stmt_cost. */
1579   2,					 /* scalar_load_cost.  */
1580 2, /* scalar_store_cost. */
1581 6, /* vec_stmt_cost. */
1582 0, /* vec_to_scalar_cost. */
1583 2, /* scalar_to_vec_cost. */
1584 2, /* vec_align_load_cost. */
1585 2, /* vec_unalign_load_cost. */
1586 2, /* vec_store_cost. */
1587 2, /* cond_taken_branch_cost. */
1588 1, /* cond_not_taken_branch_cost. */
1589 };
1590
1591 static const
1592 struct processor_costs pentium4_cost = {
1593 COSTS_N_INSNS (1), /* cost of an add instruction */
1594 COSTS_N_INSNS (3), /* cost of a lea instruction */
1595 COSTS_N_INSNS (4), /* variable shift costs */
1596 COSTS_N_INSNS (4), /* constant shift costs */
1597 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1598 COSTS_N_INSNS (15), /* HI */
1599 COSTS_N_INSNS (15), /* SI */
1600 COSTS_N_INSNS (15), /* DI */
1601 COSTS_N_INSNS (15)}, /* other */
1602 0, /* cost of multiply per each bit set */
1603 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1604 COSTS_N_INSNS (56), /* HI */
1605 COSTS_N_INSNS (56), /* SI */
1606 COSTS_N_INSNS (56), /* DI */
1607 COSTS_N_INSNS (56)}, /* other */
1608 COSTS_N_INSNS (1), /* cost of movsx */
1609 COSTS_N_INSNS (1), /* cost of movzx */
1610 16, /* "large" insn */
1611 6, /* MOVE_RATIO */
1612 2, /* cost for loading QImode using movzbl */
1613 {4, 5, 4}, /* cost of loading integer registers
1614 in QImode, HImode and SImode.
1615 Relative to reg-reg move (2). */
1616 {2, 3, 2}, /* cost of storing integer registers */
1617 2, /* cost of reg,reg fld/fst */
1618 {2, 2, 6}, /* cost of loading fp registers
1619 in SFmode, DFmode and XFmode */
1620 {4, 4, 6}, /* cost of storing fp registers
1621 in SFmode, DFmode and XFmode */
1622 2, /* cost of moving MMX register */
1623 {2, 2}, /* cost of loading MMX registers
1624 in SImode and DImode */
1625 {2, 2}, /* cost of storing MMX registers
1626 in SImode and DImode */
1627 12, /* cost of moving SSE register */
1628 {12, 12, 12}, /* cost of loading SSE registers
1629 in SImode, DImode and TImode */
1630 {2, 2, 8}, /* cost of storing SSE registers
1631 in SImode, DImode and TImode */
1632 10, /* MMX or SSE register to integer */
1633 8, /* size of l1 cache. */
1634 256, /* size of l2 cache. */
1635 64, /* size of prefetch block */
1636 6, /* number of parallel prefetches */
1637 2, /* Branch cost */
1638 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1639 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1640 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1641 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1642 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1643 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1644 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1645 DUMMY_STRINGOP_ALGS},
1646 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1647 {-1, libcall}}},
1648 DUMMY_STRINGOP_ALGS},
1649 1, /* scalar_stmt_cost. */
1650   1,					 /* scalar_load_cost.  */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static const
1663 struct processor_costs nocona_cost = {
1664 COSTS_N_INSNS (1), /* cost of an add instruction */
1665 COSTS_N_INSNS (1), /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (10), /* HI */
1670 COSTS_N_INSNS (10), /* SI */
1671 COSTS_N_INSNS (10), /* DI */
1672 COSTS_N_INSNS (10)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (66), /* HI */
1676 COSTS_N_INSNS (66), /* SI */
1677 COSTS_N_INSNS (66), /* DI */
1678 COSTS_N_INSNS (66)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 16, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 3, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {4, 4, 4}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 6, /* cost of moving MMX register */
1694 {12, 12}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {12, 12}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 6, /* cost of moving SSE register */
1699 {12, 12, 12}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {12, 12, 12}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 8, /* MMX or SSE register to integer */
1704 8, /* size of l1 cache. */
1705 1024, /* size of l2 cache. */
1706 128, /* size of prefetch block */
1707 8, /* number of parallel prefetches */
1708 1, /* Branch cost */
1709 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1710 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1711 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1714 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1715 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1716 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1717 {100000, unrolled_loop}, {-1, libcall}}}},
1718 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1719 {-1, libcall}}},
1720 {libcall, {{24, loop}, {64, unrolled_loop},
1721 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1722 1, /* scalar_stmt_cost. */
1723   1,					 /* scalar_load_cost.  */
1724 1, /* scalar_store_cost. */
1725 1, /* vec_stmt_cost. */
1726 1, /* vec_to_scalar_cost. */
1727 1, /* scalar_to_vec_cost. */
1728 1, /* vec_align_load_cost. */
1729 2, /* vec_unalign_load_cost. */
1730 1, /* vec_store_cost. */
1731 3, /* cond_taken_branch_cost. */
1732 1, /* cond_not_taken_branch_cost. */
1733 };
1734
1735 static const
1736 struct processor_costs atom_cost = {
1737 COSTS_N_INSNS (1), /* cost of an add instruction */
1738 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1739 COSTS_N_INSNS (1), /* variable shift costs */
1740 COSTS_N_INSNS (1), /* constant shift costs */
1741 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1742 COSTS_N_INSNS (4), /* HI */
1743 COSTS_N_INSNS (3), /* SI */
1744 COSTS_N_INSNS (4), /* DI */
1745 COSTS_N_INSNS (2)}, /* other */
1746 0, /* cost of multiply per each bit set */
1747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1748 COSTS_N_INSNS (26), /* HI */
1749 COSTS_N_INSNS (42), /* SI */
1750 COSTS_N_INSNS (74), /* DI */
1751 COSTS_N_INSNS (74)}, /* other */
1752 COSTS_N_INSNS (1), /* cost of movsx */
1753 COSTS_N_INSNS (1), /* cost of movzx */
1754 8, /* "large" insn */
1755 17, /* MOVE_RATIO */
1756 4, /* cost for loading QImode using movzbl */
1757 {4, 4, 4}, /* cost of loading integer registers
1758 in QImode, HImode and SImode.
1759 Relative to reg-reg move (2). */
1760 {4, 4, 4}, /* cost of storing integer registers */
1761 4, /* cost of reg,reg fld/fst */
1762 {12, 12, 12}, /* cost of loading fp registers
1763 in SFmode, DFmode and XFmode */
1764 {6, 6, 8}, /* cost of storing fp registers
1765 in SFmode, DFmode and XFmode */
1766 2, /* cost of moving MMX register */
1767 {8, 8}, /* cost of loading MMX registers
1768 in SImode and DImode */
1769 {8, 8}, /* cost of storing MMX registers
1770 in SImode and DImode */
1771 2, /* cost of moving SSE register */
1772 {8, 8, 8}, /* cost of loading SSE registers
1773 in SImode, DImode and TImode */
1774 {8, 8, 8}, /* cost of storing SSE registers
1775 in SImode, DImode and TImode */
1776 5, /* MMX or SSE register to integer */
1777 32, /* size of l1 cache. */
1778 256, /* size of l2 cache. */
1779 64, /* size of prefetch block */
1780 6, /* number of parallel prefetches */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1789 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1790 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 {{libcall, {{8, loop}, {15, unrolled_loop},
1792 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1793 {libcall, {{24, loop}, {32, unrolled_loop},
1794 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1795 1, /* scalar_stmt_cost. */
1796   1,					 /* scalar_load_cost.  */
1797 1, /* scalar_store_cost. */
1798 1, /* vec_stmt_cost. */
1799 1, /* vec_to_scalar_cost. */
1800 1, /* scalar_to_vec_cost. */
1801 1, /* vec_align_load_cost. */
1802 2, /* vec_unalign_load_cost. */
1803 1, /* vec_store_cost. */
1804 3, /* cond_taken_branch_cost. */
1805 1, /* cond_not_taken_branch_cost. */
1806 };
1807
1808 /* Generic64 should produce code tuned for Nocona and K8. */
1809 static const
1810 struct processor_costs generic64_cost = {
1811 COSTS_N_INSNS (1), /* cost of an add instruction */
1812   /* On all chips taken into consideration, lea is 2 cycles or more.  With
1813      this cost, however, our current implementation of synth_mult results in
1814      the use of unnecessary temporary registers, causing regressions on several
1815      SPECfp benchmarks.  */
1816 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1817 COSTS_N_INSNS (1), /* variable shift costs */
1818 COSTS_N_INSNS (1), /* constant shift costs */
1819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1820 COSTS_N_INSNS (4), /* HI */
1821 COSTS_N_INSNS (3), /* SI */
1822 COSTS_N_INSNS (4), /* DI */
1823 COSTS_N_INSNS (2)}, /* other */
1824 0, /* cost of multiply per each bit set */
1825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1826 COSTS_N_INSNS (26), /* HI */
1827 COSTS_N_INSNS (42), /* SI */
1828 COSTS_N_INSNS (74), /* DI */
1829 COSTS_N_INSNS (74)}, /* other */
1830 COSTS_N_INSNS (1), /* cost of movsx */
1831 COSTS_N_INSNS (1), /* cost of movzx */
1832 8, /* "large" insn */
1833 17, /* MOVE_RATIO */
1834 4, /* cost for loading QImode using movzbl */
1835 {4, 4, 4}, /* cost of loading integer registers
1836 in QImode, HImode and SImode.
1837 Relative to reg-reg move (2). */
1838 {4, 4, 4}, /* cost of storing integer registers */
1839 4, /* cost of reg,reg fld/fst */
1840 {12, 12, 12}, /* cost of loading fp registers
1841 in SFmode, DFmode and XFmode */
1842 {6, 6, 8}, /* cost of storing fp registers
1843 in SFmode, DFmode and XFmode */
1844 2, /* cost of moving MMX register */
1845 {8, 8}, /* cost of loading MMX registers
1846 in SImode and DImode */
1847 {8, 8}, /* cost of storing MMX registers
1848 in SImode and DImode */
1849 2, /* cost of moving SSE register */
1850 {8, 8, 8}, /* cost of loading SSE registers
1851 in SImode, DImode and TImode */
1852 {8, 8, 8}, /* cost of storing SSE registers
1853 in SImode, DImode and TImode */
1854 5, /* MMX or SSE register to integer */
1855 32, /* size of l1 cache. */
1856 512, /* size of l2 cache. */
1857 64, /* size of prefetch block */
1858 6, /* number of parallel prefetches */
1859   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1860      value is increased to the perhaps more appropriate value of 5.  */
1861 3, /* Branch cost */
1862 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1863 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1864 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1865 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1866 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1867 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1868 {DUMMY_STRINGOP_ALGS,
1869 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1870 {DUMMY_STRINGOP_ALGS,
1871 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1872 1, /* scalar_stmt_cost. */
1873   1,					 /* scalar_load_cost.  */
1874 1, /* scalar_store_cost. */
1875 1, /* vec_stmt_cost. */
1876 1, /* vec_to_scalar_cost. */
1877 1, /* scalar_to_vec_cost. */
1878 1, /* vec_align_load_cost. */
1879 2, /* vec_unalign_load_cost. */
1880 1, /* vec_store_cost. */
1881 3, /* cond_taken_branch_cost. */
1882 1, /* cond_not_taken_branch_cost. */
1883 };
1884
1885 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1886 Athlon and K8. */
1887 static const
1888 struct processor_costs generic32_cost = {
1889 COSTS_N_INSNS (1), /* cost of an add instruction */
1890 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1891 COSTS_N_INSNS (1), /* variable shift costs */
1892 COSTS_N_INSNS (1), /* constant shift costs */
1893 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1894 COSTS_N_INSNS (4), /* HI */
1895 COSTS_N_INSNS (3), /* SI */
1896 COSTS_N_INSNS (4), /* DI */
1897 COSTS_N_INSNS (2)}, /* other */
1898 0, /* cost of multiply per each bit set */
1899 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1900 COSTS_N_INSNS (26), /* HI */
1901 COSTS_N_INSNS (42), /* SI */
1902 COSTS_N_INSNS (74), /* DI */
1903 COSTS_N_INSNS (74)}, /* other */
1904 COSTS_N_INSNS (1), /* cost of movsx */
1905 COSTS_N_INSNS (1), /* cost of movzx */
1906 8, /* "large" insn */
1907 17, /* MOVE_RATIO */
1908 4, /* cost for loading QImode using movzbl */
1909 {4, 4, 4}, /* cost of loading integer registers
1910 in QImode, HImode and SImode.
1911 Relative to reg-reg move (2). */
1912 {4, 4, 4}, /* cost of storing integer registers */
1913 4, /* cost of reg,reg fld/fst */
1914 {12, 12, 12}, /* cost of loading fp registers
1915 in SFmode, DFmode and XFmode */
1916 {6, 6, 8}, /* cost of storing fp registers
1917 in SFmode, DFmode and XFmode */
1918 2, /* cost of moving MMX register */
1919 {8, 8}, /* cost of loading MMX registers
1920 in SImode and DImode */
1921 {8, 8}, /* cost of storing MMX registers
1922 in SImode and DImode */
1923 2, /* cost of moving SSE register */
1924 {8, 8, 8}, /* cost of loading SSE registers
1925 in SImode, DImode and TImode */
1926 {8, 8, 8}, /* cost of storing SSE registers
1927 in SImode, DImode and TImode */
1928 5, /* MMX or SSE register to integer */
1929 32, /* size of l1 cache. */
1930 256, /* size of l2 cache. */
1931 64, /* size of prefetch block */
1932 6, /* number of parallel prefetches */
1933 3, /* Branch cost */
1934 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1935 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1936 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1937 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1938 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1939 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1941 DUMMY_STRINGOP_ALGS},
1942 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1943 DUMMY_STRINGOP_ALGS},
1944 1, /* scalar_stmt_cost. */
1945   1,					 /* scalar_load_cost.  */
1946 1, /* scalar_store_cost. */
1947 1, /* vec_stmt_cost. */
1948 1, /* vec_to_scalar_cost. */
1949 1, /* scalar_to_vec_cost. */
1950 1, /* vec_align_load_cost. */
1951 2, /* vec_unalign_load_cost. */
1952 1, /* vec_store_cost. */
1953 3, /* cond_taken_branch_cost. */
1954 1, /* cond_not_taken_branch_cost. */
1955 };
1956
1957 /* Set by -mtune. */
1958 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1959
1960 /* Set by -mtune or -Os. */
1961 const struct processor_costs *ix86_cost = &pentium_cost;
1962
1963 /* Processor feature/optimization bitmasks. */
1964 #define m_386 (1<<PROCESSOR_I386)
1965 #define m_486 (1<<PROCESSOR_I486)
1966 #define m_PENT (1<<PROCESSOR_PENTIUM)
1967 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1968 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1969 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1970 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1971 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1972 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1973 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1974 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1975 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1976 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1977 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1978 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1979 #define m_ATOM (1<<PROCESSOR_ATOM)
1980
1981 #define m_GEODE (1<<PROCESSOR_GEODE)
1982 #define m_K6 (1<<PROCESSOR_K6)
1983 #define m_K6_GEODE (m_K6 | m_GEODE)
1984 #define m_K8 (1<<PROCESSOR_K8)
1985 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1986 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1987 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1988 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1989 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1990 #define m_BDVER (m_BDVER1 | m_BDVER2)
1991 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1992 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1993 #define m_BTVER (m_BTVER1 | m_BTVER2)
1994 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1995
1996 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1997 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1998
1999 /* Generic instruction choice should be the common subset of supported CPUs
2000 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2001 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
2002
2003 /* Feature tests against the various tunings. */
2004 unsigned char ix86_tune_features[X86_TUNE_LAST];
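/* A hedged sketch of how this array is populated (the exact code lives in
   ix86_option_override_internal and may differ in detail):

     ix86_tune_mask = 1 << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);  */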
2005
2006 /* Feature tests against the various tunings used to create ix86_tune_features
2007 based on the processor mask. */
2008 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2009 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
2010    negatively, so enabling it for Generic64 seems like a good code-size
2011    tradeoff.  We can't enable it for 32bit generic because it does not
2012    work well with PPro-based chips.  */
2013 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2014
2015 /* X86_TUNE_PUSH_MEMORY */
2016 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2017
2018 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2019 m_486 | m_PENT,
2020
2021 /* X86_TUNE_UNROLL_STRLEN */
2022 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
2025    on simulation results.  But after the P4 shipped, no performance benefit
2026    was observed from branch hints, and they also increase the code size.
2027 As a result, icc never generates branch hints. */
2028 0,
2029
2030 /* X86_TUNE_DOUBLE_WITH_ADD */
2031 ~m_386,
2032
2033 /* X86_TUNE_USE_SAHF */
2034 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
2035
2036 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2037 partial dependencies. */
2038 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2039
2040 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2041    register stalls on the Generic32 compilation setting as well.  However,
2042    in the current implementation the partial register stalls are not eliminated
2043    very well - they can be introduced via subregs synthesized by combine
2044 and can happen in caller/callee saving sequences. Because this option
2045 pays back little on PPro based chips and is in conflict with partial reg
2046 dependencies used by Athlon/P4 based chips, it is better to leave it off
2047 for generic32 for now. */
2048 m_PPRO,
2049
2050 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2051 m_CORE2I7 | m_GENERIC,
2052
2053 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
2054    on 16-bit immediate moves into memory on Core2 and Corei7.  */
2055 m_CORE2I7 | m_GENERIC,
2056
2057 /* X86_TUNE_USE_HIMODE_FIOP */
2058 m_386 | m_486 | m_K6_GEODE,
2059
2060 /* X86_TUNE_USE_SIMODE_FIOP */
2061 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2062
2063 /* X86_TUNE_USE_MOV0 */
2064 m_K6,
2065
2066 /* X86_TUNE_USE_CLTD */
2067 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2068
2069 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2070 m_PENT4,
2071
2072 /* X86_TUNE_SPLIT_LONG_MOVES */
2073 m_PPRO,
2074
2075 /* X86_TUNE_READ_MODIFY_WRITE */
2076 ~m_PENT,
2077
2078 /* X86_TUNE_READ_MODIFY */
2079 ~(m_PENT | m_PPRO),
2080
2081 /* X86_TUNE_PROMOTE_QIMODE */
2082 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2083
2084 /* X86_TUNE_FAST_PREFIX */
2085 ~(m_386 | m_486 | m_PENT),
2086
2087 /* X86_TUNE_SINGLE_STRINGOP */
2088 m_386 | m_P4_NOCONA,
2089
2090 /* X86_TUNE_QIMODE_MATH */
2091 ~0,
2092
2093 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2094 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2095 might be considered for Generic32 if our scheme for avoiding partial
2096 stalls was more effective. */
2097 ~m_PPRO,
2098
2099 /* X86_TUNE_PROMOTE_QI_REGS */
2100 0,
2101
2102 /* X86_TUNE_PROMOTE_HI_REGS */
2103 m_PPRO,
2104
2105 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2106 over esp addition. */
2107 m_386 | m_486 | m_PENT | m_PPRO,
2108
2109 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2110 over esp addition. */
2111 m_PENT,
2112
2113 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2114 over esp subtraction. */
2115 m_386 | m_486 | m_PENT | m_K6_GEODE,
2116
2117 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2118 over esp subtraction. */
2119 m_PENT | m_K6_GEODE,
2120
2121 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2122 for DFmode copies */
2123   ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2124
2125 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2126 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2127
2128 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2129    conflict here between PPro/Pentium4-based chips that treat 128bit
2130    SSE registers as single units and K8-based chips that divide SSE
2131    registers into two 64bit halves.  This knob promotes all store destinations
2132    to be 128bit to allow register renaming on 128bit SSE units, but usually
2133    results in one extra microop on 64bit SSE units.  Experimental results
2134    show that disabling this option on P4 brings over 20% SPECfp regression,
2135 while enabling it on K8 brings roughly 2.4% regression that can be partly
2136 masked by careful scheduling of moves. */
2137 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2138
2139 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2140 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
2141
2142 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2143 m_COREI7 | m_BDVER,
2144
2145 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2146   m_BDVER,
2147
2148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2149 are resolved on SSE register parts instead of whole registers, so we may
2150    maintain just the lower part of scalar values in the proper format, leaving the
2151 upper part undefined. */
2152 m_ATHLON_K8,
2153
2154 /* X86_TUNE_SSE_TYPELESS_STORES */
2155 m_AMD_MULTIPLE,
2156
2157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2158 m_PPRO | m_P4_NOCONA,
2159
2160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2161 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2162
2163 /* X86_TUNE_PROLOGUE_USING_MOVE */
2164 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2165
2166 /* X86_TUNE_EPILOGUE_USING_MOVE */
2167 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2168
2169 /* X86_TUNE_SHIFT1 */
2170 ~m_486,
2171
2172 /* X86_TUNE_USE_FFREEP */
2173 m_AMD_MULTIPLE,
2174
2175 /* X86_TUNE_INTER_UNIT_MOVES */
2176 ~(m_AMD_MULTIPLE | m_GENERIC),
2177
2178 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2179   ~(m_AMDFAM10 | m_BDVER),
2180
2181 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2182 than 4 branch instructions in the 16 byte window. */
2183 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2184
2185 /* X86_TUNE_SCHEDULE */
2186 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2187
2188 /* X86_TUNE_USE_BT */
2189 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2190
2191 /* X86_TUNE_USE_INCDEC */
2192 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2193
2194 /* X86_TUNE_PAD_RETURNS */
2195 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2196
2197   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
2198 m_ATOM,
2199
2200 /* X86_TUNE_EXT_80387_CONSTANTS */
2201 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2202
2203 /* X86_TUNE_SHORTEN_X87_SSE */
2204 ~m_K8,
2205
2206 /* X86_TUNE_AVOID_VECTOR_DECODE */
2207 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2208
2209   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2210 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2211 ~(m_386 | m_486),
2212
2213 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2214 vector path on AMD machines. */
2215 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2216
2217 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2218 machines. */
2219 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2220
2221 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2222 than a MOV. */
2223 m_PENT,
2224
2225 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2226 but one byte longer. */
2227 m_PENT,
2228
2229 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2230 operand that cannot be represented using a modRM byte. The XOR
2231 replacement is long decoded, so this split helps here as well. */
2232 m_K6,
2233
2234 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2235 from FP to FP. */
2236 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2237
2238 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2239 from integer to FP. */
2240 m_AMDFAM10,
2241
2242 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2243 with a subsequent conditional jump instruction into a single
2244 compare-and-branch uop. */
2245 m_BDVER,
2246
2247 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2248 will impact LEA instruction selection. */
2249 m_ATOM,
2250
2251 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2252 instructions. */
2253 ~m_ATOM,
2254
2255   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2256 at -O3. For the moment, the prefetching seems badly tuned for Intel
2257 chips. */
2258 m_K6_GEODE | m_AMD_MULTIPLE,
2259
2260 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2261 the auto-vectorizer. */
2262 m_BDVER,
2263
2264 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2265 during reassociation of integer computation. */
2266 m_ATOM,
2267
2268 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2269 during reassociation of fp computation. */
2270 m_ATOM,
2271
2272 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2273 regs instead of memory. */
2274 m_COREI7 | m_CORE2I7
2275 };
2276
2277 /* Feature tests against the various architecture variations. */
2278 unsigned char ix86_arch_features[X86_ARCH_LAST];
2279
2280 /* Feature tests against the various architecture variations, used to create
2281 ix86_arch_features based on the processor mask. */
2282 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2283 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2284 ~(m_386 | m_486 | m_PENT | m_K6),
2285
2286 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2287 ~m_386,
2288
2289 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2290 ~(m_386 | m_486),
2291
2292 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2293 ~m_386,
2294
2295 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2296 ~m_386,
2297 };
2298
2299 static const unsigned int x86_accumulate_outgoing_args
2300 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2301
2302 static const unsigned int x86_arch_always_fancy_math_387
2303 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2304
2305 static const unsigned int x86_avx256_split_unaligned_load
2306 = m_COREI7 | m_GENERIC;
2307
2308 static const unsigned int x86_avx256_split_unaligned_store
2309 = m_COREI7 | m_BDVER | m_GENERIC;
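/* A hedged note: like the tune masks above, the x86_* masks here are tested
   against (1 << ix86_tune) in ix86_option_override_internal; when the bit is
   set and the user has not said otherwise, the corresponding default (e.g.
   MASK_AVX256_SPLIT_UNALIGNED_LOAD / _STORE) is turned on.  */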
2310
2311 /* If the average insn count for a single function invocation is
2312 lower than this constant, emit fast (but longer) prologue and
2313 epilogue code. */
2314 #define FAST_PROLOGUE_INSN_COUNT 20
2315
2316 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2317 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2318 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2319 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2320
2321 /* Array of the smallest class containing reg number REGNO, indexed by
2322 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2323
2324 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2325 {
2326 /* ax, dx, cx, bx */
2327 AREG, DREG, CREG, BREG,
2328 /* si, di, bp, sp */
2329 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2330 /* FP registers */
2331 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2332 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2333 /* arg pointer */
2334 NON_Q_REGS,
2335 /* flags, fpsr, fpcr, frame */
2336 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2337 /* SSE registers */
2338 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2339 SSE_REGS, SSE_REGS,
2340 /* MMX registers */
2341 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2342 MMX_REGS, MMX_REGS,
2343 /* REX registers */
2344 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2345 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2346 /* SSE REX registers */
2347 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2348 SSE_REGS, SSE_REGS,
2349 };
2350
2351 /* The "default" register map used in 32bit mode. */
2352
2353 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2354 {
2355 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2356 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2357 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2358 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2359 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2361 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2362 };
2363
2364 /* The "default" register map used in 64bit mode. */
2365
2366 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2367 {
2368 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2369 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2370 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2371 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2372 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2373   8, 9, 10, 11, 12, 13, 14, 15,		/* extended integer registers */
2374 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2375 };
2376
2377 /* Define the register numbers to be used in Dwarf debugging information.
2378 The SVR4 reference port C compiler uses the following register numbers
2379 in its Dwarf output code:
2380 0 for %eax (gcc regno = 0)
2381 1 for %ecx (gcc regno = 2)
2382 2 for %edx (gcc regno = 1)
2383 3 for %ebx (gcc regno = 3)
2384 4 for %esp (gcc regno = 7)
2385 5 for %ebp (gcc regno = 6)
2386 6 for %esi (gcc regno = 4)
2387 7 for %edi (gcc regno = 5)
2388 The following three DWARF register numbers are never generated by
2389 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2390 believes these numbers have these meanings.
2391 8 for %eip (no gcc equivalent)
2392 9 for %eflags (gcc regno = 17)
2393 10 for %trapno (no gcc equivalent)
2394 It is not at all clear how we should number the FP stack registers
2395 for the x86 architecture. If the version of SDB on x86/svr4 were
2396 a bit less brain dead with respect to floating-point then we would
2397 have a precedent to follow with respect to DWARF register numbers
2398 for x86 FP registers, but the SDB on x86/svr4 is so completely
2399 broken with respect to FP registers that it is hardly worth thinking
2400 of it as something to strive for compatibility with.
2401 The version of x86/svr4 SDB I have at the moment does (partially)
2402 seem to believe that DWARF register number 11 is associated with
2403 the x86 register %st(0), but that's about all. Higher DWARF
2404 register numbers don't seem to be associated with anything in
2405 particular, and even for DWARF regno 11, SDB only seems to under-
2406 stand that it should say that a variable lives in %st(0) (when
2407 asked via an `=' command) if we said it was in DWARF regno 11,
2408 but SDB still prints garbage when asked for the value of the
2409 variable in question (via a `/' command).
2410 (Also note that the labels SDB prints for various FP stack regs
2411 when doing an `x' command are all wrong.)
2412 Note that these problems generally don't affect the native SVR4
2413 C compiler because it doesn't allow the use of -O with -g and
2414 because when it is *not* optimizing, it allocates a memory
2415 location for each floating-point variable, and the memory
2416 location is what gets described in the DWARF AT_location
2417 attribute for the variable in question.
2418 Regardless of the severe mental illness of the x86/svr4 SDB, we
2419 do something sensible here and we use the following DWARF
2420 register numbers. Note that these are all stack-top-relative
2421 numbers.
2422 11 for %st(0) (gcc regno = 8)
2423 12 for %st(1) (gcc regno = 9)
2424 13 for %st(2) (gcc regno = 10)
2425 14 for %st(3) (gcc regno = 11)
2426 15 for %st(4) (gcc regno = 12)
2427 16 for %st(5) (gcc regno = 13)
2428 17 for %st(6) (gcc regno = 14)
2429 18 for %st(7) (gcc regno = 15)
2430 */
2431 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2432 {
2433 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2434 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2435 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2436 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2437 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2438 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2439 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2440 };
2441
2442 /* Define parameter passing and return registers. */
2443
2444 static int const x86_64_int_parameter_registers[6] =
2445 {
2446 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2447 };
2448
2449 static int const x86_64_ms_abi_int_parameter_registers[4] =
2450 {
2451 CX_REG, DX_REG, R8_REG, R9_REG
2452 };
2453
2454 static int const x86_64_int_return_registers[4] =
2455 {
2456 AX_REG, DX_REG, DI_REG, SI_REG
2457 };
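/* For reference (SysV AMD64 psABI): the parameter table above means the
   first six integer arguments go in %rdi, %rsi, %rdx, %rcx, %r8 and %r9;
   the MS ABI uses only %rcx, %rdx, %r8 and %r9.  Integer return values
   normally come back in %rax (and %rdx for a second eightbyte).  */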
2458
2459 /* Define the structure for the machine field in struct function. */
2460
2461 struct GTY(()) stack_local_entry {
2462 unsigned short mode;
2463 unsigned short n;
2464 rtx rtl;
2465 struct stack_local_entry *next;
2466 };
2467
2468 /* Structure describing stack frame layout.
2469 Stack grows downward:
2470
2471 [arguments]
2472 <- ARG_POINTER
2473 saved pc
2474
2475 saved static chain if ix86_static_chain_on_stack
2476
2477 saved frame pointer if frame_pointer_needed
2478 <- HARD_FRAME_POINTER
2479 [saved regs]
2480 <- regs_save_offset
2481 [padding0]
2482
2483 [saved SSE regs]
2484 <- sse_regs_save_offset
2485 [padding1] |
2486 | <- FRAME_POINTER
2487 [va_arg registers] |
2488 |
2489 [frame] |
2490 |
2491 [padding2] | = to_allocate
2492 <- STACK_POINTER
2493 */
2494 struct ix86_frame
2495 {
2496 int nsseregs;
2497 int nregs;
2498 int va_arg_size;
2499 int red_zone_size;
2500 int outgoing_arguments_size;
2501
2502 /* The offsets relative to ARG_POINTER. */
2503 HOST_WIDE_INT frame_pointer_offset;
2504 HOST_WIDE_INT hard_frame_pointer_offset;
2505 HOST_WIDE_INT stack_pointer_offset;
2506 HOST_WIDE_INT hfp_save_offset;
2507 HOST_WIDE_INT reg_save_offset;
2508 HOST_WIDE_INT sse_reg_save_offset;
2509
2510 /* When save_regs_using_mov is set, emit prologue using
2511 move instead of push instructions. */
2512 bool save_regs_using_mov;
2513 };
2514
2515 /* Which cpu are we scheduling for. */
2516 enum attr_cpu ix86_schedule;
2517
2518 /* Which cpu are we optimizing for. */
2519 enum processor_type ix86_tune;
2520
2521 /* Which instruction set architecture to use. */
2522 enum processor_type ix86_arch;
2523
2524 /* True if processor has SSE prefetch instruction. */
2525 unsigned char x86_prefetch_sse;
2526
2527 /* -mstackrealign option */
2528 static const char ix86_force_align_arg_pointer_string[]
2529 = "force_align_arg_pointer";
2530
2531 static rtx (*ix86_gen_leave) (void);
2532 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2533 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2534 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2535 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2536 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2537 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2538 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2539 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2540 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2541 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2542 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
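/* A hedged note: these hooks are bound once the word size is known, roughly
   as in ix86_option_override_internal, e.g.

     ix86_gen_leave = TARGET_64BIT ? gen_leave_rex64 : gen_leave;

   so that prologue/epilogue expansion need not test TARGET_64BIT at every
   use (treat the exact generator names as illustrative).  */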
2543
2544 /* Preferred alignment for stack boundary in bits. */
2545 unsigned int ix86_preferred_stack_boundary;
2546
2547 /* Alignment for incoming stack boundary in bits specified at
2548 command line. */
2549 static unsigned int ix86_user_incoming_stack_boundary;
2550
2551 /* Default alignment for incoming stack boundary in bits. */
2552 static unsigned int ix86_default_incoming_stack_boundary;
2553
2554 /* Alignment for incoming stack boundary in bits. */
2555 unsigned int ix86_incoming_stack_boundary;
2556
2557 /* Calling-ABI-specific va_list type nodes.  */
2558 static GTY(()) tree sysv_va_list_type_node;
2559 static GTY(()) tree ms_va_list_type_node;
2560
2561 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2562 char internal_label_prefix[16];
2563 int internal_label_prefix_len;
2564
2565 /* Fence to use after loop using movnt. */
2566 tree x86_mfence;
2567
2568 /* Register class used for passing a given 64bit part of the argument.
2569    These represent classes as documented by the psABI, with the exception of
2570    the SSESF and SSEDF classes, which are basically the SSE class, except that
2571    gcc will use an SFmode or DFmode move instead of DImode to avoid
2572    reformatting penalties.
2573    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2574    whenever possible (the upper half then contains only padding).  */
2575 enum x86_64_reg_class
2576 {
2577 X86_64_NO_CLASS,
2578 X86_64_INTEGER_CLASS,
2579 X86_64_INTEGERSI_CLASS,
2580 X86_64_SSE_CLASS,
2581 X86_64_SSESF_CLASS,
2582 X86_64_SSEDF_CLASS,
2583 X86_64_SSEUP_CLASS,
2584 X86_64_X87_CLASS,
2585 X86_64_X87UP_CLASS,
2586 X86_64_COMPLEX_X87_CLASS,
2587 X86_64_MEMORY_CLASS
2588 };
2589
2590 #define MAX_CLASSES 4
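/* A hedged illustration of the psABI classification these classes encode:
   a struct { double d; int i; } occupies two eightbytes and classifies as
   { X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS }, while a 256-bit __m256
   classifies as SSE followed by three SSEUP eightbytes - hence MAX_CLASSES
   of 4; anything needing more classes is passed in memory.  */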
2591
2592 /* Table of constants used by fldpi, fldln2, etc.... */
2593 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2594 static bool ext_80387_constants_init = 0;
2595
2596 \f
2597 static struct machine_function * ix86_init_machine_status (void);
2598 static rtx ix86_function_value (const_tree, const_tree, bool);
2599 static bool ix86_function_value_regno_p (const unsigned int);
2600 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2601 const_tree);
2602 static rtx ix86_static_chain (const_tree, bool);
2603 static int ix86_function_regparm (const_tree, const_tree);
2604 static void ix86_compute_frame_layout (struct ix86_frame *);
2605 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2606 rtx, rtx, int);
2607 static void ix86_add_new_builtins (HOST_WIDE_INT);
2608 static tree ix86_canonical_va_list_type (tree);
2609 static void predict_jump (int);
2610 static unsigned int split_stack_prologue_scratch_regno (void);
2611 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2612
2613 enum ix86_function_specific_strings
2614 {
2615 IX86_FUNCTION_SPECIFIC_ARCH,
2616 IX86_FUNCTION_SPECIFIC_TUNE,
2617 IX86_FUNCTION_SPECIFIC_MAX
2618 };
2619
2620 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2621 const char *, enum fpmath_unit, bool);
2622 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2623 static void ix86_function_specific_save (struct cl_target_option *);
2624 static void ix86_function_specific_restore (struct cl_target_option *);
2625 static void ix86_function_specific_print (FILE *, int,
2626 struct cl_target_option *);
2627 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2628 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2629 struct gcc_options *);
2630 static bool ix86_can_inline_p (tree, tree);
2631 static void ix86_set_current_function (tree);
2632 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2633
2634 static enum calling_abi ix86_function_abi (const_tree);
2635
2636 \f
2637 #ifndef SUBTARGET32_DEFAULT_CPU
2638 #define SUBTARGET32_DEFAULT_CPU "i386"
2639 #endif
2640
2641 /* The svr4 ABI for the i386 says that records and unions are returned
2642 in memory. */
2643 #ifndef DEFAULT_PCC_STRUCT_RETURN
2644 #define DEFAULT_PCC_STRUCT_RETURN 1
2645 #endif
2646
2647 /* Whether -mtune= or -march= were specified */
2648 static int ix86_tune_defaulted;
2649 static int ix86_arch_specified;
2650
2651 /* Vectorization library interface and handlers. */
2652 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2653
2654 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2655 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2656
2657 /* Processor target table, indexed by processor number */
2658 struct ptt
2659 {
2660 const struct processor_costs *cost; /* Processor costs */
2661 const int align_loop; /* Default alignments. */
2662 const int align_loop_max_skip;
2663 const int align_jump;
2664 const int align_jump_max_skip;
2665 const int align_func;
2666 };
2667
2668 static const struct ptt processor_target_table[PROCESSOR_max] =
2669 {
2670 {&i386_cost, 4, 3, 4, 3, 4},
2671 {&i486_cost, 16, 15, 16, 15, 16},
2672 {&pentium_cost, 16, 7, 16, 7, 16},
2673 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2674 {&geode_cost, 0, 0, 0, 0, 0},
2675 {&k6_cost, 32, 7, 32, 7, 32},
2676 {&athlon_cost, 16, 7, 16, 7, 16},
2677 {&pentium4_cost, 0, 0, 0, 0, 0},
2678 {&k8_cost, 16, 7, 16, 7, 16},
2679 {&nocona_cost, 0, 0, 0, 0, 0},
2680 /* Core 2 32-bit. */
2681 {&generic32_cost, 16, 10, 16, 10, 16},
2682 /* Core 2 64-bit. */
2683 {&generic64_cost, 16, 10, 16, 10, 16},
2684 /* Core i7 32-bit. */
2685 {&generic32_cost, 16, 10, 16, 10, 16},
2686 /* Core i7 64-bit. */
2687 {&generic64_cost, 16, 10, 16, 10, 16},
2688 {&generic32_cost, 16, 7, 16, 7, 16},
2689 {&generic64_cost, 16, 10, 16, 10, 16},
2690 {&amdfam10_cost, 32, 24, 32, 7, 32},
2691 {&bdver1_cost, 32, 24, 32, 7, 32},
2692 {&bdver2_cost, 32, 24, 32, 7, 32},
2693 {&btver1_cost, 32, 24, 32, 7, 32},
2694 {&btver2_cost, 32, 24, 32, 7, 32},
2695 {&atom_cost, 16, 15, 16, 7, 16}
2696 };
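/* A hedged sketch of how these defaults are applied later in
   ix86_option_override_internal (details may differ slightly):

     if (align_loops == 0)
       {
	 align_loops = processor_target_table[ix86_tune].align_loop;
	 align_loops_max_skip
	   = processor_target_table[ix86_tune].align_loop_max_skip;
       }

   with the same pattern for align_jumps and align_functions.  */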
2697
2698 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2699 {
2700 "generic",
2701 "i386",
2702 "i486",
2703 "pentium",
2704 "pentium-mmx",
2705 "pentiumpro",
2706 "pentium2",
2707 "pentium3",
2708 "pentium4",
2709 "pentium-m",
2710 "prescott",
2711 "nocona",
2712 "core2",
2713 "corei7",
2714 "atom",
2715 "geode",
2716 "k6",
2717 "k6-2",
2718 "k6-3",
2719 "athlon",
2720 "athlon-4",
2721 "k8",
2722 "amdfam10",
2723 "bdver1",
2724 "bdver2",
2725 "btver1",
2726 "btver2"
2727 };
2728 \f
2729 /* Return true if a red-zone is in use. */
2730
2731 static inline bool
2732 ix86_using_red_zone (void)
2733 {
2734 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 }
2736 \f
2737 /* Return a string that documents the current -m options. The caller is
2738 responsible for freeing the string. */
2739
2740 static char *
2741 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2742 const char *tune, enum fpmath_unit fpmath,
2743 bool add_nl_p)
2744 {
2745 struct ix86_target_opts
2746 {
2747 const char *option; /* option string */
2748 HOST_WIDE_INT mask; /* isa mask options */
2749 };
2750
2751   /* This table is ordered so that options like -msse4.2, which imply
2752      the preceding options, are matched first.  */
2753 static struct ix86_target_opts isa_opts[] =
2754 {
2755 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2756 { "-mfma", OPTION_MASK_ISA_FMA },
2757 { "-mxop", OPTION_MASK_ISA_XOP },
2758 { "-mlwp", OPTION_MASK_ISA_LWP },
2759 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2760 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2761 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2762 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2763 { "-msse3", OPTION_MASK_ISA_SSE3 },
2764 { "-msse2", OPTION_MASK_ISA_SSE2 },
2765 { "-msse", OPTION_MASK_ISA_SSE },
2766 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2767 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2768 { "-mmmx", OPTION_MASK_ISA_MMX },
2769 { "-mabm", OPTION_MASK_ISA_ABM },
2770 { "-mbmi", OPTION_MASK_ISA_BMI },
2771 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2772 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2773 { "-mhle", OPTION_MASK_ISA_HLE },
2774 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2775 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2776 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2777 { "-madx", OPTION_MASK_ISA_ADX },
2778 { "-mtbm", OPTION_MASK_ISA_TBM },
2779 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2780 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2781 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2782 { "-maes", OPTION_MASK_ISA_AES },
2783 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2784 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2785 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2786 { "-mf16c", OPTION_MASK_ISA_F16C },
2787 { "-mrtm", OPTION_MASK_ISA_RTM },
2788 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2789 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2790 };
2791
2792 /* Flag options. */
2793 static struct ix86_target_opts flag_opts[] =
2794 {
2795 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2796 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2797 { "-m80387", MASK_80387 },
2798 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2799 { "-malign-double", MASK_ALIGN_DOUBLE },
2800 { "-mcld", MASK_CLD },
2801 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2802 { "-mieee-fp", MASK_IEEE_FP },
2803 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2804 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2805 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2806 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2807 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2808 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2809 { "-mno-red-zone", MASK_NO_RED_ZONE },
2810 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2811 { "-mrecip", MASK_RECIP },
2812 { "-mrtd", MASK_RTD },
2813 { "-msseregparm", MASK_SSEREGPARM },
2814 { "-mstack-arg-probe", MASK_STACK_PROBE },
2815 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2816 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2817 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2818 { "-mvzeroupper", MASK_VZEROUPPER },
2819 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2820 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2821 { "-mprefer-avx128", MASK_PREFER_AVX128},
2822 };
2823
2824 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2825
2826 char isa_other[40];
2827 char target_other[40];
2828 unsigned num = 0;
2829 unsigned i, j;
2830 char *ret;
2831 char *ptr;
2832 size_t len;
2833 size_t line_len;
2834 size_t sep_len;
2835 const char *abi;
2836
2837 memset (opts, '\0', sizeof (opts));
2838
2839 /* Add -march= option. */
2840 if (arch)
2841 {
2842 opts[num][0] = "-march=";
2843 opts[num++][1] = arch;
2844 }
2845
2846 /* Add -mtune= option. */
2847 if (tune)
2848 {
2849 opts[num][0] = "-mtune=";
2850 opts[num++][1] = tune;
2851 }
2852
2853 /* Add -m32/-m64/-mx32. */
2854 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2855 {
2856 if ((isa & OPTION_MASK_ABI_64) != 0)
2857 abi = "-m64";
2858 else
2859 abi = "-mx32";
2860 isa &= ~ (OPTION_MASK_ISA_64BIT
2861 | OPTION_MASK_ABI_64
2862 | OPTION_MASK_ABI_X32);
2863 }
2864 else
2865 abi = "-m32";
2866 opts[num++][0] = abi;
2867
2868 /* Pick out the options in isa options. */
2869 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2870 {
2871 if ((isa & isa_opts[i].mask) != 0)
2872 {
2873 opts[num++][0] = isa_opts[i].option;
2874 isa &= ~ isa_opts[i].mask;
2875 }
2876 }
2877
2878 if (isa && add_nl_p)
2879 {
2880 opts[num++][0] = isa_other;
2881 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2882 isa);
2883 }
2884
2885 /* Add flag options. */
2886 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2887 {
2888 if ((flags & flag_opts[i].mask) != 0)
2889 {
2890 opts[num++][0] = flag_opts[i].option;
2891 flags &= ~ flag_opts[i].mask;
2892 }
2893 }
2894
2895 if (flags && add_nl_p)
2896 {
2897 opts[num++][0] = target_other;
2898 sprintf (target_other, "(other flags: %#x)", flags);
2899 }
2900
2901 /* Add -fpmath= option. */
2902 if (fpmath)
2903 {
2904 opts[num][0] = "-mfpmath=";
2905 switch ((int) fpmath)
2906 {
2907 case FPMATH_387:
2908 opts[num++][1] = "387";
2909 break;
2910
2911 case FPMATH_SSE:
2912 opts[num++][1] = "sse";
2913 break;
2914
2915 case FPMATH_387 | FPMATH_SSE:
2916 opts[num++][1] = "sse+387";
2917 break;
2918
2919 default:
2920 gcc_unreachable ();
2921 }
2922 }
2923
2924 /* Any options? */
2925 if (num == 0)
2926 return NULL;
2927
2928 gcc_assert (num < ARRAY_SIZE (opts));
2929
2930 /* Size the string. */
2931 len = 0;
2932 sep_len = (add_nl_p) ? 3 : 1;
2933 for (i = 0; i < num; i++)
2934 {
2935 len += sep_len;
2936 for (j = 0; j < 2; j++)
2937 if (opts[i][j])
2938 len += strlen (opts[i][j]);
2939 }
2940
2941 /* Build the string. */
2942 ret = ptr = (char *) xmalloc (len);
2943 line_len = 0;
2944
2945 for (i = 0; i < num; i++)
2946 {
2947 size_t len2[2];
2948
2949 for (j = 0; j < 2; j++)
2950 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2951
2952 if (i != 0)
2953 {
2954 *ptr++ = ' ';
2955 line_len++;
2956
2957 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2958 {
2959 *ptr++ = '\\';
2960 *ptr++ = '\n';
2961 line_len = 0;
2962 }
2963 }
2964
2965 for (j = 0; j < 2; j++)
2966 if (opts[i][j])
2967 {
2968 memcpy (ptr, opts[i][j], len2[j]);
2969 ptr += len2[j];
2970 line_len += len2[j];
2971 }
2972 }
2973
2974 *ptr = '\0';
2975 gcc_assert (ret + len >= ptr);
2976
2977 return ret;
2978 }
2979
2980 /* Return true if profiling code should be emitted before the
2981    prologue; otherwise return false.
2982    Note: for x86 with "hotfix" prologues this remains a known problem.  */
2983 static bool
2984 ix86_profile_before_prologue (void)
2985 {
2986 return flag_fentry != 0;
2987 }
2988
2989 /* Function that is callable from the debugger to print the current
2990 options. */
2991 void
2992 ix86_debug_options (void)
2993 {
2994 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2995 ix86_arch_string, ix86_tune_string,
2996 ix86_fpmath, true);
2997
2998 if (opts)
2999 {
3000 fprintf (stderr, "%s\n\n", opts);
3001 free (opts);
3002 }
3003 else
3004 fputs ("<no options>\n\n", stderr);
3005
3006 return;
3007 }
3008 \f
3009 /* Override various settings based on options. If MAIN_ARGS_P, the
3010 options are from the command line, otherwise they are from
3011 attributes. */
3012
3013 static void
3014 ix86_option_override_internal (bool main_args_p)
3015 {
3016 int i;
3017 unsigned int ix86_arch_mask, ix86_tune_mask;
3018 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3019 const char *prefix;
3020 const char *suffix;
3021 const char *sw;
3022
3023 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3024 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3025 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3026 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3027 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3028 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3029 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3030 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3031 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3032 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3033 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3034 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3035 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3036 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3037 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3038 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3039 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3040 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3041 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3042 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3043 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3044 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3045 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3046 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3047 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3048 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3049 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3050 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3051 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3052 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3053 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3054 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3055 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3056 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3057 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3058 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3059 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3060 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3061 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3062 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3063
3064 /* If this reaches 64, we need to widen the struct pta flags field below.  */
3065
3066 static struct pta
3067 {
3068 const char *const name; /* processor name or nickname. */
3069 const enum processor_type processor;
3070 const enum attr_cpu schedule;
3071 const unsigned HOST_WIDE_INT flags;
3072 }
3073 const processor_alias_table[] =
3074 {
3075 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3076 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3077 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3078 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3079 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3080 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3081 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3082 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3083 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3084 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3085 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3086 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3087 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3088 PTA_MMX | PTA_SSE | PTA_FXSR},
3089 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3090 PTA_MMX | PTA_SSE | PTA_FXSR},
3091 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3092 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3093 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3094        PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3095 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3096 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3097 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3098 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3099 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3100 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3101 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3102 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3103 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3104 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3105 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
3108 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3109 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3110 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3111 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3112 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3113 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3116 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3117 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3118 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3119 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3120 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3121 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3122 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3123 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3124 | PTA_XSAVEOPT},
3125 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3126 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3127 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3128 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3129 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3130 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3131 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3132 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3133 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3134 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3135 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3136 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3137 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3138 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3139 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3140 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3141 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3142 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3143 {"x86-64", PROCESSOR_K8, CPU_K8,
3144 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3145 {"k8", PROCESSOR_K8, CPU_K8,
3146 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3147 | PTA_SSE2 | PTA_NO_SAHF},
3148 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3149 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3150 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3151 {"opteron", PROCESSOR_K8, CPU_K8,
3152 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3153 | PTA_SSE2 | PTA_NO_SAHF},
3154 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3155 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3156 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3157 {"athlon64", PROCESSOR_K8, CPU_K8,
3158 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3159 | PTA_SSE2 | PTA_NO_SAHF},
3160 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3161 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3162 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3163 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3164 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3165 | PTA_SSE2 | PTA_NO_SAHF},
3166 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3169 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3172 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3173 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3174 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3175 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3176 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3177 | PTA_XSAVEOPT},
3178 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3179 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3180 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3181 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3182 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3183 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3184 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3185 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 3186        | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3187 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3188 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
3189 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 3190        | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3191 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3192 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3193 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3194
3195 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3196 PTA_HLE /* flags are only used for -march switch. */ },
3197 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3198 PTA_64BIT
3199 | PTA_HLE /* flags are only used for -march switch. */ },
3200 };
3201
3202 /* -mrecip options. */
3203 static struct
3204 {
3205 const char *string; /* option name */
3206 unsigned int mask; /* mask bits to set */
3207 }
3208 const recip_options[] =
3209 {
3210 { "all", RECIP_MASK_ALL },
3211 { "none", RECIP_MASK_NONE },
3212 { "div", RECIP_MASK_DIV },
3213 { "sqrt", RECIP_MASK_SQRT },
3214 { "vec-div", RECIP_MASK_VEC_DIV },
3215 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3216 };
3217
3218 int const pta_size = ARRAY_SIZE (processor_alias_table);
3219
3220 /* Set up prefix/suffix so the error messages refer to either the command
3221 line argument, or the attribute(target). */
3222 if (main_args_p)
3223 {
3224 prefix = "-m";
3225 suffix = "";
3226 sw = "switch";
3227 }
3228 else
3229 {
3230 prefix = "option(\"";
3231 suffix = "\")";
3232 sw = "attribute";
3233 }
3234
3235 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3236 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3237 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3238 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3239 #ifdef TARGET_BI_ARCH
3240 else
3241 {
3242 #if TARGET_BI_ARCH == 1
3243 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3244 is on and OPTION_MASK_ABI_X32 is off. We turn off
3245 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3246 -mx32. */
3247 if (TARGET_X32)
3248 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3249 #else
3250 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3251 on and OPTION_MASK_ABI_64 is off. We turn off
3252 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3253 -m64. */
3254 if (TARGET_LP64)
3255 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3256 #endif
3257 }
3258 #endif
3259
3260 if (TARGET_X32)
3261 {
3262 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3263 OPTION_MASK_ABI_64 for TARGET_X32. */
3264 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3265 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3266 }
3267 else if (TARGET_LP64)
3268 {
3269 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3270 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3271 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3272 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3273 }
3274
3275 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3276 SUBTARGET_OVERRIDE_OPTIONS;
3277 #endif
3278
3279 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3280 SUBSUBTARGET_OVERRIDE_OPTIONS;
3281 #endif
3282
 3283   /* -fPIC is the default for x86_64 Darwin (Mach-O).  */
3284 if (TARGET_MACHO && TARGET_64BIT)
3285 flag_pic = 2;
3286
3287 /* Need to check -mtune=generic first. */
3288 if (ix86_tune_string)
3289 {
3290 if (!strcmp (ix86_tune_string, "generic")
3291 || !strcmp (ix86_tune_string, "i686")
3292 /* As special support for cross compilers we read -mtune=native
3293 as -mtune=generic. With native compilers we won't see the
3294 -mtune=native, as it was changed by the driver. */
3295 || !strcmp (ix86_tune_string, "native"))
3296 {
3297 if (TARGET_64BIT)
3298 ix86_tune_string = "generic64";
3299 else
3300 ix86_tune_string = "generic32";
3301 }
3302 /* If this call is for setting the option attribute, allow the
3303 generic32/generic64 that was previously set. */
3304 else if (!main_args_p
3305 && (!strcmp (ix86_tune_string, "generic32")
3306 || !strcmp (ix86_tune_string, "generic64")))
3307 ;
3308 else if (!strncmp (ix86_tune_string, "generic", 7))
3309 error ("bad value (%s) for %stune=%s %s",
3310 ix86_tune_string, prefix, suffix, sw);
3311 else if (!strcmp (ix86_tune_string, "x86-64"))
3312 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3313 "%stune=k8%s or %stune=generic%s instead as appropriate",
3314 prefix, suffix, prefix, suffix, prefix, suffix);
3315 }
3316 else
3317 {
3318 if (ix86_arch_string)
3319 ix86_tune_string = ix86_arch_string;
3320 if (!ix86_tune_string)
3321 {
3322 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3323 ix86_tune_defaulted = 1;
3324 }
3325
3326 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3327 need to use a sensible tune option. */
3328 if (!strcmp (ix86_tune_string, "generic")
3329 || !strcmp (ix86_tune_string, "x86-64")
3330 || !strcmp (ix86_tune_string, "i686"))
3331 {
3332 if (TARGET_64BIT)
3333 ix86_tune_string = "generic64";
3334 else
3335 ix86_tune_string = "generic32";
3336 }
3337 }
3338
3339 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3340 {
3341 /* rep; movq isn't available in 32-bit code. */
3342 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3343 ix86_stringop_alg = no_stringop;
3344 }
3345
3346 if (!ix86_arch_string)
3347 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3348 else
3349 ix86_arch_specified = 1;
3350
3351 if (global_options_set.x_ix86_pmode)
3352 {
3353 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3354 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3355 error ("address mode %qs not supported in the %s bit mode",
3356 TARGET_64BIT ? "short" : "long",
3357 TARGET_64BIT ? "64" : "32");
3358 }
3359 else
3360 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3361
3362 if (!global_options_set.x_ix86_abi)
3363 ix86_abi = DEFAULT_ABI;
3364
3365 if (global_options_set.x_ix86_cmodel)
3366 {
3367 switch (ix86_cmodel)
3368 {
3369 case CM_SMALL:
3370 case CM_SMALL_PIC:
3371 if (flag_pic)
3372 ix86_cmodel = CM_SMALL_PIC;
3373 if (!TARGET_64BIT)
3374 error ("code model %qs not supported in the %s bit mode",
3375 "small", "32");
3376 break;
3377
3378 case CM_MEDIUM:
3379 case CM_MEDIUM_PIC:
3380 if (flag_pic)
3381 ix86_cmodel = CM_MEDIUM_PIC;
3382 if (!TARGET_64BIT)
3383 error ("code model %qs not supported in the %s bit mode",
3384 "medium", "32");
3385 else if (TARGET_X32)
3386 error ("code model %qs not supported in x32 mode",
3387 "medium");
3388 break;
3389
3390 case CM_LARGE:
3391 case CM_LARGE_PIC:
3392 if (flag_pic)
3393 ix86_cmodel = CM_LARGE_PIC;
3394 if (!TARGET_64BIT)
3395 error ("code model %qs not supported in the %s bit mode",
3396 "large", "32");
3397 else if (TARGET_X32)
3398 error ("code model %qs not supported in x32 mode",
3399 "large");
3400 break;
3401
3402 case CM_32:
3403 if (flag_pic)
3404 error ("code model %s does not support PIC mode", "32");
3405 if (TARGET_64BIT)
3406 error ("code model %qs not supported in the %s bit mode",
3407 "32", "64");
3408 break;
3409
3410 case CM_KERNEL:
3411 if (flag_pic)
3412 {
3413 error ("code model %s does not support PIC mode", "kernel");
3414 ix86_cmodel = CM_32;
3415 }
3416 if (!TARGET_64BIT)
3417 error ("code model %qs not supported in the %s bit mode",
3418 "kernel", "32");
3419 break;
3420
3421 default:
3422 gcc_unreachable ();
3423 }
3424 }
3425 else
3426 {
3427 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3428 use of rip-relative addressing. This eliminates fixups that
3429 would otherwise be needed if this object is to be placed in a
3430 DLL, and is essentially just as efficient as direct addressing. */
3431 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3432 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3433 else if (TARGET_64BIT)
3434 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3435 else
3436 ix86_cmodel = CM_32;
3437 }
3438 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3439 {
3440 error ("-masm=intel not supported in this configuration");
3441 ix86_asm_dialect = ASM_ATT;
3442 }
3443 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3444 sorry ("%i-bit mode not compiled in",
3445 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3446
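   /* Look up -march= in the alias table: record its scheduling model and
      architecture, default the tuning to the same CPU, and enable each ISA
      the entry implies unless that ISA was set explicitly on the command
      line.  */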
3447 for (i = 0; i < pta_size; i++)
3448 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3449 {
3450 ix86_schedule = processor_alias_table[i].schedule;
3451 ix86_arch = processor_alias_table[i].processor;
3452 /* Default cpu tuning to the architecture. */
3453 ix86_tune = ix86_arch;
3454
3455 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3456 error ("CPU you selected does not support x86-64 "
3457 "instruction set");
3458
3459 if (processor_alias_table[i].flags & PTA_MMX
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3461 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3462 if (processor_alias_table[i].flags & PTA_3DNOW
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3464 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3465 if (processor_alias_table[i].flags & PTA_3DNOW_A
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3467 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3468 if (processor_alias_table[i].flags & PTA_SSE
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3470 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3471 if (processor_alias_table[i].flags & PTA_SSE2
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3473 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3474 if (processor_alias_table[i].flags & PTA_SSE3
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3476 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3477 if (processor_alias_table[i].flags & PTA_SSSE3
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3479 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3480 if (processor_alias_table[i].flags & PTA_SSE4_1
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3482 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3483 if (processor_alias_table[i].flags & PTA_SSE4_2
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3485 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3486 if (processor_alias_table[i].flags & PTA_AVX
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3488 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3489 if (processor_alias_table[i].flags & PTA_AVX2
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3491 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3492 if (processor_alias_table[i].flags & PTA_FMA
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3494 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3495 if (processor_alias_table[i].flags & PTA_SSE4A
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3497 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3498 if (processor_alias_table[i].flags & PTA_FMA4
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3500 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3501 if (processor_alias_table[i].flags & PTA_XOP
3502 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3503 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3504 if (processor_alias_table[i].flags & PTA_LWP
3505 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3506 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3507 if (processor_alias_table[i].flags & PTA_ABM
3508 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3509 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3510 if (processor_alias_table[i].flags & PTA_BMI
3511 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3512 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3513 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3514 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3515 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3516 if (processor_alias_table[i].flags & PTA_TBM
3517 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3518 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3519 if (processor_alias_table[i].flags & PTA_BMI2
3520 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3521 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3522 if (processor_alias_table[i].flags & PTA_CX16
3523 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3524 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3525 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3526 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3527 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3528 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3529 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3530 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3531 if (processor_alias_table[i].flags & PTA_MOVBE
3532 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3533 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3534 if (processor_alias_table[i].flags & PTA_AES
3535 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3536 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3537 if (processor_alias_table[i].flags & PTA_PCLMUL
3538 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3539 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3540 if (processor_alias_table[i].flags & PTA_FSGSBASE
3541 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3542 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3543 if (processor_alias_table[i].flags & PTA_RDRND
3544 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3545 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3546 if (processor_alias_table[i].flags & PTA_F16C
3547 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3548 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3549 if (processor_alias_table[i].flags & PTA_RTM
3550 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3551 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3552 if (processor_alias_table[i].flags & PTA_HLE
3553 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3554 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3555 if (processor_alias_table[i].flags & PTA_PRFCHW
3556 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3557 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3558 if (processor_alias_table[i].flags & PTA_RDSEED
3559 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3560 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3561 if (processor_alias_table[i].flags & PTA_ADX
3562 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3563 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3564 if (processor_alias_table[i].flags & PTA_FXSR
3565 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3566 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3567 if (processor_alias_table[i].flags & PTA_XSAVE
3568 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3569 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3570 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3571 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3572 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3573 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3574 x86_prefetch_sse = true;
3575
3576 break;
3577 }
3578
3579 if (!strcmp (ix86_arch_string, "generic"))
3580 error ("generic CPU can be used only for %stune=%s %s",
3581 prefix, suffix, sw);
3582 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3583 error ("bad value (%s) for %sarch=%s %s",
3584 ix86_arch_string, prefix, suffix, sw);
3585
3586 ix86_arch_mask = 1u << ix86_arch;
3587 for (i = 0; i < X86_ARCH_LAST; ++i)
3588 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3589
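   /* Likewise look up -mtune= to pick the scheduling model, falling back to
      x86-64 when a defaulted tuning lacks 64-bit support, or to the 32-bit
      variant of a 64-bit tuning when compiling for 32-bit.  */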
3590 for (i = 0; i < pta_size; i++)
3591 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3592 {
3593 ix86_schedule = processor_alias_table[i].schedule;
3594 ix86_tune = processor_alias_table[i].processor;
3595 if (TARGET_64BIT)
3596 {
3597 if (!(processor_alias_table[i].flags & PTA_64BIT))
3598 {
3599 if (ix86_tune_defaulted)
3600 {
3601 ix86_tune_string = "x86-64";
3602 for (i = 0; i < pta_size; i++)
3603 if (! strcmp (ix86_tune_string,
3604 processor_alias_table[i].name))
3605 break;
3606 ix86_schedule = processor_alias_table[i].schedule;
3607 ix86_tune = processor_alias_table[i].processor;
3608 }
3609 else
3610 error ("CPU you selected does not support x86-64 "
3611 "instruction set");
3612 }
3613 }
3614 else
3615 {
3616 /* Adjust tuning when compiling for 32-bit ABI. */
3617 switch (ix86_tune)
3618 {
3619 case PROCESSOR_GENERIC64:
3620 ix86_tune = PROCESSOR_GENERIC32;
3621 ix86_schedule = CPU_PENTIUMPRO;
3622 break;
3623
3624 case PROCESSOR_CORE2_64:
3625 ix86_tune = PROCESSOR_CORE2_32;
3626 break;
3627
3628 case PROCESSOR_COREI7_64:
3629 ix86_tune = PROCESSOR_COREI7_32;
3630 break;
3631
3632 default:
3633 break;
3634 }
3635 }
3636 /* Intel CPUs have always interpreted SSE prefetch instructions as
3637 NOPs; so, we can enable SSE prefetch instructions even when
3638 -mtune (rather than -march) points us to a processor that has them.
3639 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3640 higher processors. */
3641 if (TARGET_CMOV
3642 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3643 x86_prefetch_sse = true;
3644 break;
3645 }
3646
3647 if (ix86_tune_specified && i == pta_size)
3648 error ("bad value (%s) for %stune=%s %s",
3649 ix86_tune_string, prefix, suffix, sw);
3650
3651 ix86_tune_mask = 1u << ix86_tune;
3652 for (i = 0; i < X86_TUNE_LAST; ++i)
3653 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3654
3655 #ifndef USE_IX86_FRAME_POINTER
3656 #define USE_IX86_FRAME_POINTER 0
3657 #endif
3658
3659 #ifndef USE_X86_64_FRAME_POINTER
3660 #define USE_X86_64_FRAME_POINTER 0
3661 #endif
3662
3663 /* Set the default values for switches whose default depends on TARGET_64BIT
3664 in case they weren't overwritten by command line options. */
3665 if (TARGET_64BIT)
3666 {
3667 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3668 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3669 if (flag_asynchronous_unwind_tables == 2)
3670 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3671 if (flag_pcc_struct_return == 2)
3672 flag_pcc_struct_return = 0;
3673 }
3674 else
3675 {
3676 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3677 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3678 if (flag_asynchronous_unwind_tables == 2)
3679 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3680 if (flag_pcc_struct_return == 2)
3681 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3682 }
3683
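   /* Choose the cost tables: the size-oriented costs when optimizing for
      size, otherwise the costs of the processor we are tuning for.  */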
3684 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3685 if (optimize_size)
3686 ix86_cost = &ix86_size_cost;
3687 else
3688 ix86_cost = ix86_tune_cost;
3689
3690 /* Arrange to set up i386_stack_locals for all functions. */
3691 init_machine_status = ix86_init_machine_status;
3692
3693 /* Validate -mregparm= value. */
3694 if (global_options_set.x_ix86_regparm)
3695 {
3696 if (TARGET_64BIT)
3697 warning (0, "-mregparm is ignored in 64-bit mode");
3698 if (ix86_regparm > REGPARM_MAX)
3699 {
3700 error ("-mregparm=%d is not between 0 and %d",
3701 ix86_regparm, REGPARM_MAX);
3702 ix86_regparm = 0;
3703 }
3704 }
3705 if (TARGET_64BIT)
3706 ix86_regparm = REGPARM_MAX;
3707
3708 /* Default align_* from the processor table. */
3709 if (align_loops == 0)
3710 {
3711 align_loops = processor_target_table[ix86_tune].align_loop;
3712 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3713 }
3714 if (align_jumps == 0)
3715 {
3716 align_jumps = processor_target_table[ix86_tune].align_jump;
3717 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3718 }
3719 if (align_functions == 0)
3720 {
3721 align_functions = processor_target_table[ix86_tune].align_func;
3722 }
3723
3724 /* Provide default for -mbranch-cost= value. */
3725 if (!global_options_set.x_ix86_branch_cost)
3726 ix86_branch_cost = ix86_cost->branch_cost;
3727
3728 if (TARGET_64BIT)
3729 {
3730 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3731
 3732       /* Enable the SSE and MMX builtins by default.  Do allow the user to
3733 explicitly disable any of these. In particular, disabling SSE and
3734 MMX for kernel code is extremely useful. */
3735 if (!ix86_arch_specified)
3736 ix86_isa_flags
3737 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3738 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3739
3740 if (TARGET_RTD)
3741 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3742 }
3743 else
3744 {
3745 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3746
3747 if (!ix86_arch_specified)
3748 ix86_isa_flags
3749 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3750
 3751       /* The i386 ABI does not specify a red zone.  It still makes sense to use
 3752          one when the programmer takes care to keep the stack from being destroyed.  */
3753 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3754 target_flags |= MASK_NO_RED_ZONE;
3755 }
3756
3757 /* Keep nonleaf frame pointers. */
3758 if (flag_omit_frame_pointer)
3759 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3760 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3761 flag_omit_frame_pointer = 1;
3762
3763 /* If we're doing fast math, we don't care about comparison order
3764 wrt NaNs. This lets us use a shorter comparison sequence. */
3765 if (flag_finite_math_only)
3766 target_flags &= ~MASK_IEEE_FP;
3767
3768 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3769 since the insns won't need emulation. */
3770 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3771 target_flags &= ~MASK_NO_FANCY_MATH_387;
3772
3773 /* Likewise, if the target doesn't have a 387, or we've specified
3774 software floating point, don't use 387 inline intrinsics. */
3775 if (!TARGET_80387)
3776 target_flags |= MASK_NO_FANCY_MATH_387;
3777
3778 /* Turn on MMX builtins for -msse. */
3779 if (TARGET_SSE)
3780 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3781
3782 /* Enable SSE prefetch. */
3783 if (TARGET_SSE || TARGET_PRFCHW)
3784 x86_prefetch_sse = true;
3785
3786 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3787 if (TARGET_SSE4_2 || TARGET_ABM)
3788 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3789
3790 /* Turn on lzcnt instruction for -mabm. */
3791 if (TARGET_ABM)
3792 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3793
3794 /* Validate -mpreferred-stack-boundary= value or default it to
3795 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3796 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3797 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3798 {
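       /* The option argument is the log2 of the desired alignment in bytes.  */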
3799 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3800 int max = (TARGET_SEH ? 4 : 12);
3801
3802 if (ix86_preferred_stack_boundary_arg < min
3803 || ix86_preferred_stack_boundary_arg > max)
3804 {
3805 if (min == max)
3806 error ("-mpreferred-stack-boundary is not supported "
3807 "for this target");
3808 else
3809 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3810 ix86_preferred_stack_boundary_arg, min, max);
3811 }
3812 else
3813 ix86_preferred_stack_boundary
3814 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3815 }
3816
3817 /* Set the default value for -mstackrealign. */
3818 if (ix86_force_align_arg_pointer == -1)
3819 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3820
3821 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3822
3823 /* Validate -mincoming-stack-boundary= value or default it to
3824 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3825 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3826 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3827 {
3828 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3829 || ix86_incoming_stack_boundary_arg > 12)
3830 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3831 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3832 else
3833 {
3834 ix86_user_incoming_stack_boundary
3835 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3836 ix86_incoming_stack_boundary
3837 = ix86_user_incoming_stack_boundary;
3838 }
3839 }
3840
3841 /* Accept -msseregparm only if at least SSE support is enabled. */
3842 if (TARGET_SSEREGPARM
3843 && ! TARGET_SSE)
3844 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3845
3846 if (global_options_set.x_ix86_fpmath)
3847 {
3848 if (ix86_fpmath & FPMATH_SSE)
3849 {
3850 if (!TARGET_SSE)
3851 {
3852 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3853 ix86_fpmath = FPMATH_387;
3854 }
3855 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3856 {
3857 warning (0, "387 instruction set disabled, using SSE arithmetics");
3858 ix86_fpmath = FPMATH_SSE;
3859 }
3860 }
3861 }
3862 else
3863 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3864
3865 /* If the i387 is disabled, then do not return values in it. */
3866 if (!TARGET_80387)
3867 target_flags &= ~MASK_FLOAT_RETURNS;
3868
3869 /* Use external vectorized library in vectorizing intrinsics. */
3870 if (global_options_set.x_ix86_veclibabi_type)
3871 switch (ix86_veclibabi_type)
3872 {
3873 case ix86_veclibabi_type_svml:
3874 ix86_veclib_handler = ix86_veclibabi_svml;
3875 break;
3876
3877 case ix86_veclibabi_type_acml:
3878 ix86_veclib_handler = ix86_veclibabi_acml;
3879 break;
3880
3881 default:
3882 gcc_unreachable ();
3883 }
3884
3885 if ((!USE_IX86_FRAME_POINTER
3886 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3887 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3888 && !optimize_size)
3889 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3890
3891 /* ??? Unwind info is not correct around the CFG unless either a frame
3892 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3893 unwind info generation to be aware of the CFG and propagating states
3894 around edges. */
3895 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3896 || flag_exceptions || flag_non_call_exceptions)
3897 && flag_omit_frame_pointer
3898 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3899 {
3900 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3901 warning (0, "unwind tables currently require either a frame pointer "
3902 "or %saccumulate-outgoing-args%s for correctness",
3903 prefix, suffix);
3904 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3905 }
3906
3907 /* If stack probes are required, the space used for large function
3908 arguments on the stack must also be probed, so enable
3909 -maccumulate-outgoing-args so this happens in the prologue. */
3910 if (TARGET_STACK_PROBE
3911 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3912 {
3913 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3914 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3915 "for correctness", prefix, suffix);
3916 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3917 }
3918
3919 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3920 {
3921 char *p;
3922 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3923 p = strchr (internal_label_prefix, 'X');
3924 internal_label_prefix_len = p - internal_label_prefix;
3925 *p = '\0';
3926 }
3927
 3928   /* When the scheduling description is not available, disable the scheduler
 3929      pass so it won't slow down compilation and make x87 code slower.  */
3930 if (!TARGET_SCHEDULE)
3931 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3932
3933 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3934 ix86_tune_cost->simultaneous_prefetches,
3935 global_options.x_param_values,
3936 global_options_set.x_param_values);
3937 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3938 ix86_tune_cost->prefetch_block,
3939 global_options.x_param_values,
3940 global_options_set.x_param_values);
3941 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3942 ix86_tune_cost->l1_cache_size,
3943 global_options.x_param_values,
3944 global_options_set.x_param_values);
3945 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3946 ix86_tune_cost->l2_cache_size,
3947 global_options.x_param_values,
3948 global_options_set.x_param_values);
3949
 3950   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3951 if (flag_prefetch_loop_arrays < 0
3952 && HAVE_prefetch
3953 && optimize >= 3
3954 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3955 flag_prefetch_loop_arrays = 1;
3956
3957 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3958 can be optimized to ap = __builtin_next_arg (0). */
3959 if (!TARGET_64BIT && !flag_split_stack)
3960 targetm.expand_builtin_va_start = NULL;
3961
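   /* Select the insn generator callbacks that depend on the target word size
      and on Pmode (DImode vs. SImode pointers, the latter for 32-bit and x32).  */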
3962 if (TARGET_64BIT)
3963 {
3964 ix86_gen_leave = gen_leave_rex64;
3965 if (Pmode == DImode)
3966 {
3967 ix86_gen_monitor = gen_sse3_monitor64_di;
3968 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3969 ix86_gen_tls_local_dynamic_base_64
3970 = gen_tls_local_dynamic_base_64_di;
3971 }
3972 else
3973 {
3974 ix86_gen_monitor = gen_sse3_monitor64_si;
3975 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3976 ix86_gen_tls_local_dynamic_base_64
3977 = gen_tls_local_dynamic_base_64_si;
3978 }
3979 }
3980 else
3981 {
3982 ix86_gen_leave = gen_leave;
3983 ix86_gen_monitor = gen_sse3_monitor;
3984 }
3985
3986 if (Pmode == DImode)
3987 {
3988 ix86_gen_add3 = gen_adddi3;
3989 ix86_gen_sub3 = gen_subdi3;
3990 ix86_gen_sub3_carry = gen_subdi3_carry;
3991 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3992 ix86_gen_andsp = gen_anddi3;
3993 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3994 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3995 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3996 }
3997 else
3998 {
3999 ix86_gen_add3 = gen_addsi3;
4000 ix86_gen_sub3 = gen_subsi3;
4001 ix86_gen_sub3_carry = gen_subsi3_carry;
4002 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4003 ix86_gen_andsp = gen_andsi3;
4004 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4005 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4006 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4007 }
4008
4009 #ifdef USE_IX86_CLD
4010 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4011 if (!TARGET_64BIT)
4012 target_flags |= MASK_CLD & ~target_flags_explicit;
4013 #endif
4014
4015 if (!TARGET_64BIT && flag_pic)
4016 {
4017 if (flag_fentry > 0)
4018 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4019 "with -fpic");
4020 flag_fentry = 0;
4021 }
4022 else if (TARGET_SEH)
4023 {
4024 if (flag_fentry == 0)
4025 sorry ("-mno-fentry isn%'t compatible with SEH");
4026 flag_fentry = 1;
4027 }
4028 else if (flag_fentry < 0)
4029 {
4030 #if defined(PROFILE_BEFORE_PROLOGUE)
4031 flag_fentry = 1;
4032 #else
4033 flag_fentry = 0;
4034 #endif
4035 }
4036
4037 if (TARGET_AVX)
4038 {
 4039       /* When not optimizing for size, enable the vzeroupper optimization for
 4040          TARGET_AVX with -fexpensive-optimizations and split 32-byte
 4041          AVX unaligned loads/stores.  */
4042 if (!optimize_size)
4043 {
4044 if (flag_expensive_optimizations
4045 && !(target_flags_explicit & MASK_VZEROUPPER))
4046 target_flags |= MASK_VZEROUPPER;
4047 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4048 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4049 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4050 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4051 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4052 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4053 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4054 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4055 target_flags |= MASK_PREFER_AVX128;
4056 }
4057 }
4058 else
4059 {
4060 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4061 target_flags &= ~MASK_VZEROUPPER;
4062 }
4063
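   /* Parse the comma-separated -mrecip= list.  A leading '!' disables the
      named form, so e.g. -mrecip=all,!sqrt enables every reciprocal
      approximation except the scalar square root.  */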
4064 if (ix86_recip_name)
4065 {
4066 char *p = ASTRDUP (ix86_recip_name);
4067 char *q;
4068 unsigned int mask, i;
4069 bool invert;
4070
4071 while ((q = strtok (p, ",")) != NULL)
4072 {
4073 p = NULL;
4074 if (*q == '!')
4075 {
4076 invert = true;
4077 q++;
4078 }
4079 else
4080 invert = false;
4081
4082 if (!strcmp (q, "default"))
4083 mask = RECIP_MASK_ALL;
4084 else
4085 {
4086 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4087 if (!strcmp (q, recip_options[i].string))
4088 {
4089 mask = recip_options[i].mask;
4090 break;
4091 }
4092
4093 if (i == ARRAY_SIZE (recip_options))
4094 {
4095 error ("unknown option for -mrecip=%s", q);
4096 invert = false;
4097 mask = RECIP_MASK_NONE;
4098 }
4099 }
4100
4101 recip_mask_explicit |= mask;
4102 if (invert)
4103 recip_mask &= ~mask;
4104 else
4105 recip_mask |= mask;
4106 }
4107 }
4108
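   /* A bare -mrecip (or -mno-recip) then turns on (or off) every
      approximation that was not explicitly listed above.  */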
4109 if (TARGET_RECIP)
4110 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4111 else if (target_flags_explicit & MASK_RECIP)
4112 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4113
4114 /* Default long double to 64-bit for Bionic. */
4115 if (TARGET_HAS_BIONIC
4116 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4117 target_flags |= MASK_LONG_DOUBLE_64;
4118
 4119   /* Save the initial options in case the user specifies function-specific
 4120      options.  */
4121 if (main_args_p)
4122 target_option_default_node = target_option_current_node
4123 = build_target_option_node ();
4124 }
4125
4126 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
4127
4128 static bool
4129 function_pass_avx256_p (const_rtx val)
4130 {
4131 if (!val)
4132 return false;
4133
4134 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4135 return true;
4136
4137 if (GET_CODE (val) == PARALLEL)
4138 {
4139 int i;
4140 rtx r;
4141
4142 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4143 {
4144 r = XVECEXP (val, 0, i);
4145 if (GET_CODE (r) == EXPR_LIST
4146 && XEXP (r, 0)
4147 && REG_P (XEXP (r, 0))
4148 && (GET_MODE (XEXP (r, 0)) == OImode
4149 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4150 return true;
4151 }
4152 }
4153
4154 return false;
4155 }
4156
4157 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4158
4159 static void
4160 ix86_option_override (void)
4161 {
4162 ix86_option_override_internal (true);
4163 }
4164
4165 /* Update register usage after having seen the compiler flags. */
4166
4167 static void
4168 ix86_conditional_register_usage (void)
4169 {
4170 int i, c_mask;
4171 unsigned int j;
4172
4173 /* The PIC register, if it exists, is fixed. */
4174 j = PIC_OFFSET_TABLE_REGNUM;
4175 if (j != INVALID_REGNUM)
4176 fixed_regs[j] = call_used_regs[j] = 1;
4177
4178 /* For 32-bit targets, squash the REX registers. */
4179 if (! TARGET_64BIT)
4180 {
4181 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4182 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4183 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4184 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4185 }
4186
4187 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4188 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4189 : TARGET_64BIT ? (1 << 2)
4190 : (1 << 1));
4191
4192 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4193
4194 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4195 {
4196 /* Set/reset conditionally defined registers from
4197 CALL_USED_REGISTERS initializer. */
4198 if (call_used_regs[i] > 1)
4199 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4200
4201 /* Calculate registers of CLOBBERED_REGS register set
4202 as call used registers from GENERAL_REGS register set. */
4203 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4204 && call_used_regs[i])
4205 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4206 }
4207
4208 /* If MMX is disabled, squash the registers. */
4209 if (! TARGET_MMX)
4210 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4211 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4212 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4213
4214 /* If SSE is disabled, squash the registers. */
4215 if (! TARGET_SSE)
4216 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4217 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4218 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4219
4220 /* If the FPU is disabled, squash the registers. */
4221 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4222 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4223 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4224 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4225 }
4226
4227 \f
4228 /* Save the current options */
4229
4230 static void
4231 ix86_function_specific_save (struct cl_target_option *ptr)
4232 {
4233 ptr->arch = ix86_arch;
4234 ptr->schedule = ix86_schedule;
4235 ptr->tune = ix86_tune;
4236 ptr->branch_cost = ix86_branch_cost;
4237 ptr->tune_defaulted = ix86_tune_defaulted;
4238 ptr->arch_specified = ix86_arch_specified;
4239 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4240 ptr->ix86_target_flags_explicit = target_flags_explicit;
4241 ptr->x_recip_mask_explicit = recip_mask_explicit;
4242
4243 /* The fields are char but the variables are not; make sure the
4244 values fit in the fields. */
4245 gcc_assert (ptr->arch == ix86_arch);
4246 gcc_assert (ptr->schedule == ix86_schedule);
4247 gcc_assert (ptr->tune == ix86_tune);
4248 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4249 }
4250
4251 /* Restore the current options */
4252
4253 static void
4254 ix86_function_specific_restore (struct cl_target_option *ptr)
4255 {
4256 enum processor_type old_tune = ix86_tune;
4257 enum processor_type old_arch = ix86_arch;
4258 unsigned int ix86_arch_mask, ix86_tune_mask;
4259 int i;
4260
4261 ix86_arch = (enum processor_type) ptr->arch;
4262 ix86_schedule = (enum attr_cpu) ptr->schedule;
4263 ix86_tune = (enum processor_type) ptr->tune;
4264 ix86_branch_cost = ptr->branch_cost;
4265 ix86_tune_defaulted = ptr->tune_defaulted;
4266 ix86_arch_specified = ptr->arch_specified;
4267 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4268 target_flags_explicit = ptr->ix86_target_flags_explicit;
4269 recip_mask_explicit = ptr->x_recip_mask_explicit;
4270
4271 /* Recreate the arch feature tests if the arch changed */
4272 if (old_arch != ix86_arch)
4273 {
4274 ix86_arch_mask = 1u << ix86_arch;
4275 for (i = 0; i < X86_ARCH_LAST; ++i)
4276 ix86_arch_features[i]
4277 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4278 }
4279
4280 /* Recreate the tune optimization tests */
4281 if (old_tune != ix86_tune)
4282 {
4283 ix86_tune_mask = 1u << ix86_tune;
4284 for (i = 0; i < X86_TUNE_LAST; ++i)
4285 ix86_tune_features[i]
4286 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4287 }
4288 }
4289
4290 /* Print the current options */
4291
4292 static void
4293 ix86_function_specific_print (FILE *file, int indent,
4294 struct cl_target_option *ptr)
4295 {
4296 char *target_string
4297 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4298 NULL, NULL, ptr->x_ix86_fpmath, false);
4299
4300 fprintf (file, "%*sarch = %d (%s)\n",
4301 indent, "",
4302 ptr->arch,
4303 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4304 ? cpu_names[ptr->arch]
4305 : "<unknown>"));
4306
4307 fprintf (file, "%*stune = %d (%s)\n",
4308 indent, "",
4309 ptr->tune,
4310 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4311 ? cpu_names[ptr->tune]
4312 : "<unknown>"));
4313
4314 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4315
4316 if (target_string)
4317 {
4318 fprintf (file, "%*s%s\n", indent, "", target_string);
4319 free (target_string);
4320 }
4321 }
4322
4323 \f
 4324 /* Inner function to process the attribute((target(...))): take an argument
 4325    and set the current options from it.  If we have a list, recursively go
 4326    over the list.  */
4327
4328 static bool
4329 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4330 struct gcc_options *enum_opts_set)
4331 {
4332 char *next_optstr;
4333 bool ret = true;
4334
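 /* Helper macros for the attribute table below: each entry records the option
    name, its length, how it is handled (ISA, string, enum or flag), the OPT_*
    enumerator and, for flag options, the target_flags mask it controls.  */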
4335 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4336 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4337 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4338 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4339 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4340
4341 enum ix86_opt_type
4342 {
4343 ix86_opt_unknown,
4344 ix86_opt_yes,
4345 ix86_opt_no,
4346 ix86_opt_str,
4347 ix86_opt_enum,
4348 ix86_opt_isa
4349 };
4350
4351 static const struct
4352 {
4353 const char *string;
4354 size_t len;
4355 enum ix86_opt_type type;
4356 int opt;
4357 int mask;
4358 } attrs[] = {
4359 /* isa options */
4360 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4361 IX86_ATTR_ISA ("abm", OPT_mabm),
4362 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4363 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4364 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4365 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4366 IX86_ATTR_ISA ("aes", OPT_maes),
4367 IX86_ATTR_ISA ("avx", OPT_mavx),
4368 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4369 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4370 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4371 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4372 IX86_ATTR_ISA ("sse", OPT_msse),
4373 IX86_ATTR_ISA ("sse2", OPT_msse2),
4374 IX86_ATTR_ISA ("sse3", OPT_msse3),
4375 IX86_ATTR_ISA ("sse4", OPT_msse4),
4376 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4377 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4378 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4379 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4380 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4381 IX86_ATTR_ISA ("fma", OPT_mfma),
4382 IX86_ATTR_ISA ("xop", OPT_mxop),
4383 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4384 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4385 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4386 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4387 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4388 IX86_ATTR_ISA ("hle", OPT_mhle),
4389 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4390 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4391 IX86_ATTR_ISA ("adx", OPT_madx),
4392 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4393 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4394 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4395
4396 /* enum options */
4397 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4398
4399 /* string options */
4400 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4401 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4402
4403 /* flag options */
4404 IX86_ATTR_YES ("cld",
4405 OPT_mcld,
4406 MASK_CLD),
4407
4408 IX86_ATTR_NO ("fancy-math-387",
4409 OPT_mfancy_math_387,
4410 MASK_NO_FANCY_MATH_387),
4411
4412 IX86_ATTR_YES ("ieee-fp",
4413 OPT_mieee_fp,
4414 MASK_IEEE_FP),
4415
4416 IX86_ATTR_YES ("inline-all-stringops",
4417 OPT_minline_all_stringops,
4418 MASK_INLINE_ALL_STRINGOPS),
4419
4420 IX86_ATTR_YES ("inline-stringops-dynamically",
4421 OPT_minline_stringops_dynamically,
4422 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4423
4424 IX86_ATTR_NO ("align-stringops",
4425 OPT_mno_align_stringops,
4426 MASK_NO_ALIGN_STRINGOPS),
4427
4428 IX86_ATTR_YES ("recip",
4429 OPT_mrecip,
4430 MASK_RECIP),
4431
4432 };
4433
4434 /* If this is a list, recurse to get the options. */
4435 if (TREE_CODE (args) == TREE_LIST)
4436 {
4437 bool ret = true;
4438
4439 for (; args; args = TREE_CHAIN (args))
4440 if (TREE_VALUE (args)
4441 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4442 p_strings, enum_opts_set))
4443 ret = false;
4444
4445 return ret;
4446 }
4447
4448 else if (TREE_CODE (args) != STRING_CST)
4449 gcc_unreachable ();
4450
4451 /* Handle multiple arguments separated by commas. */
4452 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4453
4454 while (next_optstr && *next_optstr != '\0')
4455 {
4456 char *p = next_optstr;
4457 char *orig_p = p;
4458 char *comma = strchr (next_optstr, ',');
4459 const char *opt_string;
4460 size_t len, opt_len;
4461 int opt;
4462 bool opt_set_p;
4463 char ch;
4464 unsigned i;
4465 enum ix86_opt_type type = ix86_opt_unknown;
4466 int mask = 0;
4467
4468 if (comma)
4469 {
4470 *comma = '\0';
4471 len = comma - next_optstr;
4472 next_optstr = comma + 1;
4473 }
4474 else
4475 {
4476 len = strlen (p);
4477 next_optstr = NULL;
4478 }
4479
4480 /* Recognize no-xxx. */
4481 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4482 {
4483 opt_set_p = false;
4484 p += 3;
4485 len -= 3;
4486 }
4487 else
4488 opt_set_p = true;
4489
4490 /* Find the option. */
4491 ch = *p;
4492 opt = N_OPTS;
4493 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4494 {
4495 type = attrs[i].type;
4496 opt_len = attrs[i].len;
4497 if (ch == attrs[i].string[0]
4498 && ((type != ix86_opt_str && type != ix86_opt_enum)
4499 ? len == opt_len
4500 : len > opt_len)
4501 && memcmp (p, attrs[i].string, opt_len) == 0)
4502 {
4503 opt = attrs[i].opt;
4504 mask = attrs[i].mask;
4505 opt_string = attrs[i].string;
4506 break;
4507 }
4508 }
4509
4510 /* Process the option. */
4511 if (opt == N_OPTS)
4512 {
4513 error ("attribute(target(\"%s\")) is unknown", orig_p);
4514 ret = false;
4515 }
4516
4517 else if (type == ix86_opt_isa)
4518 {
4519 struct cl_decoded_option decoded;
4520
4521 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4522 ix86_handle_option (&global_options, &global_options_set,
4523 &decoded, input_location);
4524 }
4525
4526 else if (type == ix86_opt_yes || type == ix86_opt_no)
4527 {
4528 if (type == ix86_opt_no)
4529 opt_set_p = !opt_set_p;
4530
4531 if (opt_set_p)
4532 target_flags |= mask;
4533 else
4534 target_flags &= ~mask;
4535 }
4536
4537 else if (type == ix86_opt_str)
4538 {
4539 if (p_strings[opt])
4540 {
4541 error ("option(\"%s\") was already specified", opt_string);
4542 ret = false;
4543 }
4544 else
4545 p_strings[opt] = xstrdup (p + opt_len);
4546 }
4547
4548 else if (type == ix86_opt_enum)
4549 {
4550 bool arg_ok;
4551 int value;
4552
4553 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4554 if (arg_ok)
4555 set_option (&global_options, enum_opts_set, opt, value,
4556 p + opt_len, DK_UNSPECIFIED, input_location,
4557 global_dc);
4558 else
4559 {
4560 error ("attribute(target(\"%s\")) is unknown", orig_p);
4561 ret = false;
4562 }
4563 }
4564
4565 else
4566 gcc_unreachable ();
4567 }
4568
4569 return ret;
4570 }
4571
4572 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4573
4574 tree
4575 ix86_valid_target_attribute_tree (tree args)
4576 {
4577 const char *orig_arch_string = ix86_arch_string;
4578 const char *orig_tune_string = ix86_tune_string;
4579 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4580 int orig_tune_defaulted = ix86_tune_defaulted;
4581 int orig_arch_specified = ix86_arch_specified;
4582 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4583 tree t = NULL_TREE;
4584 int i;
4585 struct cl_target_option *def
4586 = TREE_TARGET_OPTION (target_option_default_node);
4587 struct gcc_options enum_opts_set;
4588
4589 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4590
4591 /* Process each of the options on the chain. */
4592 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4593 &enum_opts_set))
4594 return NULL_TREE;
4595
4596 /* If the changed options are different from the default, rerun
4597 ix86_option_override_internal, and then save the options away.
 4598      The string options are attribute options, and will be undone
4599 when we copy the save structure. */
4600 if (ix86_isa_flags != def->x_ix86_isa_flags
4601 || target_flags != def->x_target_flags
4602 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4603 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4604 || enum_opts_set.x_ix86_fpmath)
4605 {
4606 /* If we are using the default tune= or arch=, undo the string assigned,
4607 and use the default. */
4608 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4609 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4610 else if (!orig_arch_specified)
4611 ix86_arch_string = NULL;
4612
4613 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4614 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4615 else if (orig_tune_defaulted)
4616 ix86_tune_string = NULL;
4617
4618 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4619 if (enum_opts_set.x_ix86_fpmath)
4620 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4621 else if (!TARGET_64BIT && TARGET_SSE)
4622 {
4623 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4624 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4625 }
4626
4627 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4628 ix86_option_override_internal (false);
4629
4630 /* Add any builtin functions with the new isa if any. */
4631 ix86_add_new_builtins (ix86_isa_flags);
4632
4633 /* Save the current options unless we are validating options for
4634 #pragma. */
4635 t = build_target_option_node ();
4636
4637 ix86_arch_string = orig_arch_string;
4638 ix86_tune_string = orig_tune_string;
4639 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4640
4641 /* Free up memory allocated to hold the strings */
4642 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4643 free (option_strings[i]);
4644 }
4645
4646 return t;
4647 }
4648
4649 /* Hook to validate attribute((target("string"))). */
4650
4651 static bool
4652 ix86_valid_target_attribute_p (tree fndecl,
4653 tree ARG_UNUSED (name),
4654 tree args,
4655 int ARG_UNUSED (flags))
4656 {
4657 struct cl_target_option cur_target;
4658 bool ret = true;
4659 tree old_optimize = build_optimization_node ();
4660 tree new_target, new_optimize;
4661 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4662
4663 /* If the function changed the optimization levels as well as setting target
4664 options, start with the optimizations specified. */
4665 if (func_optimize && func_optimize != old_optimize)
4666 cl_optimization_restore (&global_options,
4667 TREE_OPTIMIZATION (func_optimize));
4668
4669 /* The target attributes may also change some optimization flags, so update
4670 the optimization options if necessary. */
4671 cl_target_option_save (&cur_target, &global_options);
4672 new_target = ix86_valid_target_attribute_tree (args);
4673 new_optimize = build_optimization_node ();
4674
4675 if (!new_target)
4676 ret = false;
4677
4678 else if (fndecl)
4679 {
4680 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4681
4682 if (old_optimize != new_optimize)
4683 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4684 }
4685
4686 cl_target_option_restore (&global_options, &cur_target);
4687
4688 if (old_optimize != new_optimize)
4689 cl_optimization_restore (&global_options,
4690 TREE_OPTIMIZATION (old_optimize));
4691
4692 return ret;
4693 }
4694
4695 \f
4696 /* Hook to determine if one function can safely inline another. */
4697
4698 static bool
4699 ix86_can_inline_p (tree caller, tree callee)
4700 {
4701 bool ret = false;
4702 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4703 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4704
4705 /* If callee has no option attributes, then it is ok to inline. */
4706 if (!callee_tree)
4707 ret = true;
4708
4709 /* If caller has no option attributes, but callee does then it is not ok to
4710 inline. */
4711 else if (!caller_tree)
4712 ret = false;
4713
4714 else
4715 {
4716 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4717 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4718
 4719       /* The callee's ISA options should be a subset of the caller's, i.e. an
 4720          SSE4 function can inline an SSE2 function but an SSE2 function can't
 4721          inline an SSE4 function.  */
4722 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4723 != callee_opts->x_ix86_isa_flags)
4724 ret = false;
4725
4726 /* See if we have the same non-isa options. */
4727 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4728 ret = false;
4729
4730 /* See if arch, tune, etc. are the same. */
4731 else if (caller_opts->arch != callee_opts->arch)
4732 ret = false;
4733
4734 else if (caller_opts->tune != callee_opts->tune)
4735 ret = false;
4736
4737 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4738 ret = false;
4739
4740 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4741 ret = false;
4742
4743 else
4744 ret = true;
4745 }
4746
4747 return ret;
4748 }
4749
4750 \f
4751 /* Remember the last target of ix86_set_current_function. */
4752 static GTY(()) tree ix86_previous_fndecl;
4753
4754 /* Establish appropriate back-end context for processing the function
4755 FNDECL. The argument might be NULL to indicate processing at top
4756 level, outside of any function scope. */
4757 static void
4758 ix86_set_current_function (tree fndecl)
4759 {
4760 /* Only change the context if the function changes. This hook is called
4761 several times in the course of compiling a function, and we don't want to
4762 slow things down too much or call target_reinit when it isn't safe. */
4763 if (fndecl && fndecl != ix86_previous_fndecl)
4764 {
4765 tree old_tree = (ix86_previous_fndecl
4766 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4767 : NULL_TREE);
4768
4769 tree new_tree = (fndecl
4770 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4771 : NULL_TREE);
4772
4773 ix86_previous_fndecl = fndecl;
4774 if (old_tree == new_tree)
4775 ;
4776
4777 else if (new_tree)
4778 {
4779 cl_target_option_restore (&global_options,
4780 TREE_TARGET_OPTION (new_tree));
4781 target_reinit ();
4782 }
4783
4784 else if (old_tree)
4785 {
4786 struct cl_target_option *def
4787 = TREE_TARGET_OPTION (target_option_current_node);
4788
4789 cl_target_option_restore (&global_options, def);
4790 target_reinit ();
4791 }
4792 }
4793 }
4794
4795 \f
4796 /* Return true if this goes in large data/bss. */
4797
4798 static bool
4799 ix86_in_large_data_p (tree exp)
4800 {
4801 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4802 return false;
4803
4804 /* Functions are never large data. */
4805 if (TREE_CODE (exp) == FUNCTION_DECL)
4806 return false;
4807
4808 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4809 {
4810 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4811 if (strcmp (section, ".ldata") == 0
4812 || strcmp (section, ".lbss") == 0)
4813 return true;
4814 return false;
4815 }
4816 else
4817 {
4818 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4819
4820 /* If this is an incomplete type with size 0, then we can't put it
4821 in data because it might be too big when completed. */
4822 if (!size || size > ix86_section_threshold)
4823 return true;
4824 }
4825
4826 return false;
4827 }
4828
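/* As an illustration (hypothetical objects, not from these sources):
   under -mcmodel=medium with the default -mlarge-data-threshold of
   65536 bytes,

     static char small_buf[1024];      stays in the ordinary .data/.bss
     static char big_buf[1 << 20];     is placed in .ldata/.lbss

   and an object whose type is still incomplete (size 0 here) is also
   treated as large, since it might exceed the threshold once the type
   is completed.  */
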
4829 /* Switch to the appropriate section for output of DECL.
4830 DECL is either a `VAR_DECL' node or a constant of some sort.
4831 RELOC indicates whether forming the initial value of DECL requires
4832 link-time relocations. */
4833
4834 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4835 ATTRIBUTE_UNUSED;
4836
4837 static section *
4838 x86_64_elf_select_section (tree decl, int reloc,
4839 unsigned HOST_WIDE_INT align)
4840 {
4841 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4842 && ix86_in_large_data_p (decl))
4843 {
4844 const char *sname = NULL;
4845 unsigned int flags = SECTION_WRITE;
4846 switch (categorize_decl_for_section (decl, reloc))
4847 {
4848 case SECCAT_DATA:
4849 sname = ".ldata";
4850 break;
4851 case SECCAT_DATA_REL:
4852 sname = ".ldata.rel";
4853 break;
4854 case SECCAT_DATA_REL_LOCAL:
4855 sname = ".ldata.rel.local";
4856 break;
4857 case SECCAT_DATA_REL_RO:
4858 sname = ".ldata.rel.ro";
4859 break;
4860 case SECCAT_DATA_REL_RO_LOCAL:
4861 sname = ".ldata.rel.ro.local";
4862 break;
4863 case SECCAT_BSS:
4864 sname = ".lbss";
4865 flags |= SECTION_BSS;
4866 break;
4867 case SECCAT_RODATA:
4868 case SECCAT_RODATA_MERGE_STR:
4869 case SECCAT_RODATA_MERGE_STR_INIT:
4870 case SECCAT_RODATA_MERGE_CONST:
4871 sname = ".lrodata";
4872 flags = 0;
4873 break;
4874 case SECCAT_SRODATA:
4875 case SECCAT_SDATA:
4876 case SECCAT_SBSS:
4877 gcc_unreachable ();
4878 case SECCAT_TEXT:
4879 case SECCAT_TDATA:
4880 case SECCAT_TBSS:
4881 /* We don't split these for the medium model. Place them into
4882 default sections and hope for the best. */
4883 break;
4884 }
4885 if (sname)
4886 {
4887 /* We might get called with string constants, but get_named_section
4888 doesn't like them as they are not DECLs. Also, we need to set
4889 flags in that case. */
4890 if (!DECL_P (decl))
4891 return get_section (sname, flags, NULL);
4892 return get_named_section (decl, sname, reloc);
4893 }
4894 }
4895 return default_elf_select_section (decl, reloc, align);
4896 }
4897
4898 /* Build up a unique section name, expressed as a
4899 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4900 RELOC indicates whether the initial value of EXP requires
4901 link-time relocations. */
4902
4903 static void ATTRIBUTE_UNUSED
4904 x86_64_elf_unique_section (tree decl, int reloc)
4905 {
4906 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4907 && ix86_in_large_data_p (decl))
4908 {
4909 const char *prefix = NULL;
4910 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4911 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4912
4913 switch (categorize_decl_for_section (decl, reloc))
4914 {
4915 case SECCAT_DATA:
4916 case SECCAT_DATA_REL:
4917 case SECCAT_DATA_REL_LOCAL:
4918 case SECCAT_DATA_REL_RO:
4919 case SECCAT_DATA_REL_RO_LOCAL:
4920 prefix = one_only ? ".ld" : ".ldata";
4921 break;
4922 case SECCAT_BSS:
4923 prefix = one_only ? ".lb" : ".lbss";
4924 break;
4925 case SECCAT_RODATA:
4926 case SECCAT_RODATA_MERGE_STR:
4927 case SECCAT_RODATA_MERGE_STR_INIT:
4928 case SECCAT_RODATA_MERGE_CONST:
4929 prefix = one_only ? ".lr" : ".lrodata";
4930 break;
4931 case SECCAT_SRODATA:
4932 case SECCAT_SDATA:
4933 case SECCAT_SBSS:
4934 gcc_unreachable ();
4935 case SECCAT_TEXT:
4936 case SECCAT_TDATA:
4937 case SECCAT_TBSS:
4938 /* We don't split these for the medium model. Place them into
4939 default sections and hope for the best. */
4940 break;
4941 }
4942 if (prefix)
4943 {
4944 const char *name, *linkonce;
4945 char *string;
4946
4947 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4948 name = targetm.strip_name_encoding (name);
4949
4950 /* If we're using one_only, then there needs to be a .gnu.linkonce
4951 prefix to the section name. */
4952 linkonce = one_only ? ".gnu.linkonce" : "";
4953
4954 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4955
4956 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4957 return;
4958 }
4959 }
4960 default_unique_section (decl, reloc);
4961 }
4962
4963 #ifdef COMMON_ASM_OP
4964 /* This says how to output assembler code to declare an
4965 uninitialized external linkage data object.
4966
4967 For medium model x86-64 we need to use the .largecomm directive
4968 for large objects. */
4969 void
4970 x86_elf_aligned_common (FILE *file,
4971 const char *name, unsigned HOST_WIDE_INT size,
4972 int align)
4973 {
4974 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4975 && size > (unsigned int)ix86_section_threshold)
4976 fputs (".largecomm\t", file);
4977 else
4978 fputs (COMMON_ASM_OP, file);
4979 assemble_name (file, name);
4980 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4981 size, align / BITS_PER_UNIT);
4982 }
4983 #endif
4984
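/* As an illustration (hypothetical symbol names), the assembler output
   for a common symbol above the section threshold under -mcmodel=medium
   looks roughly like

     .largecomm  big_common,1048576,32

   while smaller common symbols keep the usual

     .comm       small_common,64,8

   with size and alignment given in bytes.  */
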
4985 /* Utility function for targets to use in implementing
4986 ASM_OUTPUT_ALIGNED_BSS. */
4987
4988 void
4989 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4990 const char *name, unsigned HOST_WIDE_INT size,
4991 int align)
4992 {
4993 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4994 && size > (unsigned int)ix86_section_threshold)
4995 switch_to_section (get_named_section (decl, ".lbss", 0));
4996 else
4997 switch_to_section (bss_section);
4998 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4999 #ifdef ASM_DECLARE_OBJECT_NAME
5000 last_assemble_variable_decl = decl;
5001 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5002 #else
5003 /* Standard thing is just output label for the object. */
5004 ASM_OUTPUT_LABEL (file, name);
5005 #endif /* ASM_DECLARE_OBJECT_NAME */
5006 ASM_OUTPUT_SKIP (file, size ? size : 1);
5007 }
5008 \f
5009 /* Decide whether we must probe the stack before any space allocation
5010 on this target. It's essentially TARGET_STACK_PROBE except when
5011 -fstack-check causes the stack to be already probed differently. */
5012
5013 bool
5014 ix86_target_stack_probe (void)
5015 {
5016 /* Do not probe the stack twice if static stack checking is enabled. */
5017 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5018 return false;
5019
5020 return TARGET_STACK_PROBE;
5021 }
5022 \f
5023 /* Decide whether we can make a sibling call to a function. DECL is the
5024 declaration of the function being targeted by the call and EXP is the
5025 CALL_EXPR representing the call. */
5026
5027 static bool
5028 ix86_function_ok_for_sibcall (tree decl, tree exp)
5029 {
5030 tree type, decl_or_type;
5031 rtx a, b;
5032
5033 /* If we are generating position-independent code, we cannot sibcall
5034 optimize any indirect call, or a direct call to a global function,
5035 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5036 if (!TARGET_MACHO
5037 && !TARGET_64BIT
5038 && flag_pic
5039 && (!decl || !targetm.binds_local_p (decl)))
5040 return false;
5041
5042 /* If we need to align the outgoing stack, then sibcalling would
5043 unalign the stack, which may break the called function. */
5044 if (ix86_minimum_incoming_stack_boundary (true)
5045 < PREFERRED_STACK_BOUNDARY)
5046 return false;
5047
5048 if (decl)
5049 {
5050 decl_or_type = decl;
5051 type = TREE_TYPE (decl);
5052 }
5053 else
5054 {
5055 /* We're looking at the CALL_EXPR, we need the type of the function. */
5056 type = CALL_EXPR_FN (exp); /* pointer expression */
5057 type = TREE_TYPE (type); /* pointer type */
5058 type = TREE_TYPE (type); /* function type */
5059 decl_or_type = type;
5060 }
5061
5062 /* Check that the return value locations are the same. For example,
5063 if we are returning floats on the 80387 register stack, we cannot
5064 make a sibcall from a function that doesn't return a float to a
5065 function that does or, conversely, from a function that does return
5066 a float to a function that doesn't; the necessary stack adjustment
5067 would not be executed. This is also the place we notice
5068 differences in the return value ABI. Note that it is ok for one
5069 of the functions to have void return type as long as the return
5070 value of the other is passed in a register. */
5071 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5072 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5073 cfun->decl, false);
5074 if (STACK_REG_P (a) || STACK_REG_P (b))
5075 {
5076 if (!rtx_equal_p (a, b))
5077 return false;
5078 }
5079 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5080 {
5081 /* Disable sibcall if we need to generate vzeroupper after
5082 callee returns. */
5083 if (TARGET_VZEROUPPER
5084 && cfun->machine->callee_return_avx256_p
5085 && !cfun->machine->caller_return_avx256_p)
5086 return false;
5087 }
5088 else if (!rtx_equal_p (a, b))
5089 return false;
5090
5091 if (TARGET_64BIT)
5092 {
5093 /* The SYSV ABI has more call-clobbered registers;
5094 disallow sibcalls from MS to SYSV. */
5095 if (cfun->machine->call_abi == MS_ABI
5096 && ix86_function_type_abi (type) == SYSV_ABI)
5097 return false;
5098 }
5099 else
5100 {
5101 /* If this call is indirect, we'll need to be able to use a
5102 call-clobbered register for the address of the target function.
5103 Make sure that all such registers are not used for passing
5104 parameters. Note that DLLIMPORT functions are indirect. */
5105 if (!decl
5106 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5107 {
5108 if (ix86_function_regparm (type, NULL) >= 3)
5109 {
5110 /* ??? Need to count the actual number of registers to be used,
5111 not the possible number of registers. Fix later. */
5112 return false;
5113 }
5114 }
5115 }
5116
5117 /* Otherwise okay. That also includes certain types of indirect calls. */
5118 return true;
5119 }
5120
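/* As an illustration (hypothetical 32-bit PIC code, not from these
   sources): with -fPIC on ELF, a tail call through the PLT such as

     extern int external_fn (int);

     int
     wrapper (int x)
     {
       return external_fn (x);
     }

   is not converted into a sibcall, because external_fn does not bind
   locally and the PLT needs %ebx to be live; the same wrapper around a
   static (locally bound) function can still be tail-called.  */
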
5121 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5122 and "sseregparm" calling convention attributes;
5123 arguments as in struct attribute_spec.handler. */
5124
5125 static tree
5126 ix86_handle_cconv_attribute (tree *node, tree name,
5127 tree args,
5128 int flags ATTRIBUTE_UNUSED,
5129 bool *no_add_attrs)
5130 {
5131 if (TREE_CODE (*node) != FUNCTION_TYPE
5132 && TREE_CODE (*node) != METHOD_TYPE
5133 && TREE_CODE (*node) != FIELD_DECL
5134 && TREE_CODE (*node) != TYPE_DECL)
5135 {
5136 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5137 name);
5138 *no_add_attrs = true;
5139 return NULL_TREE;
5140 }
5141
5142 /* Can combine regparm with all attributes but fastcall and thiscall. */
5143 if (is_attribute_p ("regparm", name))
5144 {
5145 tree cst;
5146
5147 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5148 {
5149 error ("fastcall and regparm attributes are not compatible");
5150 }
5151
5152 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5153 {
5154 error ("regparam and thiscall attributes are not compatible");
5155 }
5156
5157 cst = TREE_VALUE (args);
5158 if (TREE_CODE (cst) != INTEGER_CST)
5159 {
5160 warning (OPT_Wattributes,
5161 "%qE attribute requires an integer constant argument",
5162 name);
5163 *no_add_attrs = true;
5164 }
5165 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5166 {
5167 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5168 name, REGPARM_MAX);
5169 *no_add_attrs = true;
5170 }
5171
5172 return NULL_TREE;
5173 }
5174
5175 if (TARGET_64BIT)
5176 {
5177 /* Do not warn when emulating the MS ABI. */
5178 if ((TREE_CODE (*node) != FUNCTION_TYPE
5179 && TREE_CODE (*node) != METHOD_TYPE)
5180 || ix86_function_type_abi (*node) != MS_ABI)
5181 warning (OPT_Wattributes, "%qE attribute ignored",
5182 name);
5183 *no_add_attrs = true;
5184 return NULL_TREE;
5185 }
5186
5187 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5188 if (is_attribute_p ("fastcall", name))
5189 {
5190 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5191 {
5192 error ("fastcall and cdecl attributes are not compatible");
5193 }
5194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5195 {
5196 error ("fastcall and stdcall attributes are not compatible");
5197 }
5198 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5199 {
5200 error ("fastcall and regparm attributes are not compatible");
5201 }
5202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5203 {
5204 error ("fastcall and thiscall attributes are not compatible");
5205 }
5206 }
5207
5208 /* Can combine stdcall with fastcall (redundant), regparm and
5209 sseregparm. */
5210 else if (is_attribute_p ("stdcall", name))
5211 {
5212 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5213 {
5214 error ("stdcall and cdecl attributes are not compatible");
5215 }
5216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5217 {
5218 error ("stdcall and fastcall attributes are not compatible");
5219 }
5220 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5221 {
5222 error ("stdcall and thiscall attributes are not compatible");
5223 }
5224 }
5225
5226 /* Can combine cdecl with regparm and sseregparm. */
5227 else if (is_attribute_p ("cdecl", name))
5228 {
5229 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5230 {
5231 error ("stdcall and cdecl attributes are not compatible");
5232 }
5233 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5234 {
5235 error ("fastcall and cdecl attributes are not compatible");
5236 }
5237 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5238 {
5239 error ("cdecl and thiscall attributes are not compatible");
5240 }
5241 }
5242 else if (is_attribute_p ("thiscall", name))
5243 {
5244 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5245 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5246 name);
5247 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5248 {
5249 error ("stdcall and thiscall attributes are not compatible");
5250 }
5251 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5252 {
5253 error ("fastcall and thiscall attributes are not compatible");
5254 }
5255 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5256 {
5257 error ("cdecl and thiscall attributes are not compatible");
5258 }
5259 }
5260
5261 /* Can combine sseregparm with all attributes. */
5262
5263 return NULL_TREE;
5264 }
5265
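/* As an illustration (hypothetical declarations, not from these
   sources), the handler above accepts combinations such as

     int __attribute__ ((stdcall, regparm (2))) f1 (int, int);
     int __attribute__ ((fastcall, sseregparm)) f2 (float);

   but rejects conflicting ones such as

     int __attribute__ ((stdcall, cdecl)) f3 (int);
     int __attribute__ ((fastcall, regparm (2))) f4 (int);

   and warns when the regparm argument exceeds REGPARM_MAX.  */
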
5266 /* The transactional memory builtins are implicitly regparm or fastcall
5267 depending on the ABI. Override the generic do-nothing attribute that
5268 these builtins were declared with, and replace it with one of the two
5269 attributes that we expect elsewhere. */
5270
5271 static tree
5272 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5273 tree args ATTRIBUTE_UNUSED,
5274 int flags ATTRIBUTE_UNUSED,
5275 bool *no_add_attrs)
5276 {
5277 tree alt;
5278
5279 /* In no case do we want to add the placeholder attribute. */
5280 *no_add_attrs = true;
5281
5282 /* The 64-bit ABI is unchanged for transactional memory. */
5283 if (TARGET_64BIT)
5284 return NULL_TREE;
5285
5286 /* ??? Is there a better way to validate 32-bit windows? We have
5287 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5288 if (CHECK_STACK_LIMIT > 0)
5289 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5290 else
5291 {
5292 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5293 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5294 }
5295 decl_attributes (node, alt, flags);
5296
5297 return NULL_TREE;
5298 }
5299
5300 /* This function determines from TYPE the calling-convention. */
5301
5302 unsigned int
5303 ix86_get_callcvt (const_tree type)
5304 {
5305 unsigned int ret = 0;
5306 bool is_stdarg;
5307 tree attrs;
5308
5309 if (TARGET_64BIT)
5310 return IX86_CALLCVT_CDECL;
5311
5312 attrs = TYPE_ATTRIBUTES (type);
5313 if (attrs != NULL_TREE)
5314 {
5315 if (lookup_attribute ("cdecl", attrs))
5316 ret |= IX86_CALLCVT_CDECL;
5317 else if (lookup_attribute ("stdcall", attrs))
5318 ret |= IX86_CALLCVT_STDCALL;
5319 else if (lookup_attribute ("fastcall", attrs))
5320 ret |= IX86_CALLCVT_FASTCALL;
5321 else if (lookup_attribute ("thiscall", attrs))
5322 ret |= IX86_CALLCVT_THISCALL;
5323
5324 /* Regparm isn't allowed for thiscall and fastcall. */
5325 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5326 {
5327 if (lookup_attribute ("regparm", attrs))
5328 ret |= IX86_CALLCVT_REGPARM;
5329 if (lookup_attribute ("sseregparm", attrs))
5330 ret |= IX86_CALLCVT_SSEREGPARM;
5331 }
5332
5333 if (IX86_BASE_CALLCVT (ret) != 0)
5334 return ret;
5335 }
5336
5337 is_stdarg = stdarg_p (type);
5338 if (TARGET_RTD && !is_stdarg)
5339 return IX86_CALLCVT_STDCALL | ret;
5340
5341 if (ret != 0
5342 || is_stdarg
5343 || TREE_CODE (type) != METHOD_TYPE
5344 || ix86_function_type_abi (type) != MS_ABI)
5345 return IX86_CALLCVT_CDECL | ret;
5346
5347 return IX86_CALLCVT_THISCALL;
5348 }
5349
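/* As an illustration (hypothetical declarations, assuming a 32-bit
   SYSV target without -mrtd):

     void __attribute__ ((stdcall)) f (int);
       ix86_get_callcvt (...) == IX86_CALLCVT_STDCALL
     void __attribute__ ((regparm (3))) g (int);
       ix86_get_callcvt (...) == (IX86_CALLCVT_CDECL | IX86_CALLCVT_REGPARM)
     void h (int, ...);
       ix86_get_callcvt (...) == IX86_CALLCVT_CDECL

   With -mrtd, an unadorned non-stdarg function would resolve to
   IX86_CALLCVT_STDCALL instead.  */
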
5350 /* Return 0 if the attributes for two types are incompatible, 1 if they
5351 are compatible, and 2 if they are nearly compatible (which causes a
5352 warning to be generated). */
5353
5354 static int
5355 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5356 {
5357 unsigned int ccvt1, ccvt2;
5358
5359 if (TREE_CODE (type1) != FUNCTION_TYPE
5360 && TREE_CODE (type1) != METHOD_TYPE)
5361 return 1;
5362
5363 ccvt1 = ix86_get_callcvt (type1);
5364 ccvt2 = ix86_get_callcvt (type2);
5365 if (ccvt1 != ccvt2)
5366 return 0;
5367 if (ix86_function_regparm (type1, NULL)
5368 != ix86_function_regparm (type2, NULL))
5369 return 0;
5370
5371 return 1;
5372 }
5373 \f
5374 /* Return the regparm value for a function with the indicated TYPE and DECL.
5375 DECL may be NULL when calling function indirectly
5376 or considering a libcall. */
5377
5378 static int
5379 ix86_function_regparm (const_tree type, const_tree decl)
5380 {
5381 tree attr;
5382 int regparm;
5383 unsigned int ccvt;
5384
5385 if (TARGET_64BIT)
5386 return (ix86_function_type_abi (type) == SYSV_ABI
5387 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5388 ccvt = ix86_get_callcvt (type);
5389 regparm = ix86_regparm;
5390
5391 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5392 {
5393 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5394 if (attr)
5395 {
5396 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5397 return regparm;
5398 }
5399 }
5400 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5401 return 2;
5402 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5403 return 1;
5404
5405 /* Use register calling convention for local functions when possible. */
5406 if (decl
5407 && TREE_CODE (decl) == FUNCTION_DECL
5408 && optimize
5409 && !(profile_flag && !flag_fentry))
5410 {
5411 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5412 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5413 if (i && i->local && i->can_change_signature)
5414 {
5415 int local_regparm, globals = 0, regno;
5416
5417 /* Make sure no regparm register is taken by a
5418 fixed register variable. */
5419 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5420 if (fixed_regs[local_regparm])
5421 break;
5422
5423 /* We don't want to use regparm(3) for nested functions as
5424 these use a static chain pointer in the third argument. */
5425 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5426 local_regparm = 2;
5427
5428 /* In 32-bit mode save a register for the split stack. */
5429 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5430 local_regparm = 2;
5431
5432 /* Each fixed register usage increases register pressure,
5433 so fewer registers should be used for argument passing.
5434 This functionality can be overridden by an explicit
5435 regparm value. */
5436 for (regno = AX_REG; regno <= DI_REG; regno++)
5437 if (fixed_regs[regno])
5438 globals++;
5439
5440 local_regparm
5441 = globals < local_regparm ? local_regparm - globals : 0;
5442
5443 if (local_regparm > regparm)
5444 regparm = local_regparm;
5445 }
5446 }
5447
5448 return regparm;
5449 }
5450
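/* As an illustration (hypothetical 32-bit user code, not from these
   sources): given

     int __attribute__ ((regparm (2))) add2 (int a, int b);

   ix86_function_regparm returns 2, so A and B are passed in %eax and
   %edx.  A fastcall function likewise yields 2 and a thiscall function
   yields 1, while an optimized local (static) function without any
   attribute may be promoted by the code above to use up to REGPARM_MAX
   argument registers.  */
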
5451 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5452 DFmode (2) arguments in SSE registers for a function with the
5453 indicated TYPE and DECL. DECL may be NULL when calling function
5454 indirectly or considering a libcall. Otherwise return 0. */
5455
5456 static int
5457 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5458 {
5459 gcc_assert (!TARGET_64BIT);
5460
5461 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5462 by the sseregparm attribute. */
5463 if (TARGET_SSEREGPARM
5464 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5465 {
5466 if (!TARGET_SSE)
5467 {
5468 if (warn)
5469 {
5470 if (decl)
5471 error ("calling %qD with attribute sseregparm without "
5472 "SSE/SSE2 enabled", decl);
5473 else
5474 error ("calling %qT with attribute sseregparm without "
5475 "SSE/SSE2 enabled", type);
5476 }
5477 return 0;
5478 }
5479
5480 return 2;
5481 }
5482
5483 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5484 (and DFmode for SSE2) arguments in SSE registers. */
5485 if (decl && TARGET_SSE_MATH && optimize
5486 && !(profile_flag && !flag_fentry))
5487 {
5488 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5489 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5490 if (i && i->local && i->can_change_signature)
5491 return TARGET_SSE2 ? 2 : 1;
5492 }
5493
5494 return 0;
5495 }
5496
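/* As an illustration (hypothetical 32-bit user code, not from these
   sources): given

     float __attribute__ ((sseregparm)) dotf (float a, float b);

   this function returns 2 when SSE is enabled, so A and B are passed
   in %xmm registers instead of on the stack; without -msse the call is
   diagnosed and 0 is returned.  */
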
5497 /* Return true if EAX is live at the start of the function. Used by
5498 ix86_expand_prologue to determine if we need special help before
5499 calling allocate_stack_worker. */
5500
5501 static bool
5502 ix86_eax_live_at_start_p (void)
5503 {
5504 /* Cheat. Don't bother working forward from ix86_function_regparm
5505 to the function type to whether an actual argument is located in
5506 eax. Instead just look at cfg info, which is still close enough
5507 to correct at this point. This gives false positives for broken
5508 functions that might use uninitialized data that happens to be
5509 allocated in eax, but who cares? */
5510 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5511 }
5512
5513 static bool
5514 ix86_keep_aggregate_return_pointer (tree fntype)
5515 {
5516 tree attr;
5517
5518 if (!TARGET_64BIT)
5519 {
5520 attr = lookup_attribute ("callee_pop_aggregate_return",
5521 TYPE_ATTRIBUTES (fntype));
5522 if (attr)
5523 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5524
5525 /* For 32-bit MS-ABI the default is to keep aggregate
5526 return pointer. */
5527 if (ix86_function_type_abi (fntype) == MS_ABI)
5528 return true;
5529 }
5530 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5531 }
5532
5533 /* Value is the number of bytes of arguments automatically
5534 popped when returning from a subroutine call.
5535 FUNDECL is the declaration node of the function (as a tree),
5536 FUNTYPE is the data type of the function (as a tree),
5537 or for a library call it is an identifier node for the subroutine name.
5538 SIZE is the number of bytes of arguments passed on the stack.
5539
5540 On the 80386, the RTD insn may be used to pop them if the number
5541 of args is fixed, but if the number is variable then the caller
5542 must pop them all. RTD can't be used for library calls now
5543 because the library is compiled with the Unix compiler.
5544 Use of RTD is a selectable option, since it is incompatible with
5545 standard Unix calling sequences. If the option is not selected,
5546 the caller must always pop the args.
5547
5548 The attribute stdcall is equivalent to RTD on a per module basis. */
5549
5550 static int
5551 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5552 {
5553 unsigned int ccvt;
5554
5555 /* None of the 64-bit ABIs pop arguments. */
5556 if (TARGET_64BIT)
5557 return 0;
5558
5559 ccvt = ix86_get_callcvt (funtype);
5560
5561 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5562 | IX86_CALLCVT_THISCALL)) != 0
5563 && ! stdarg_p (funtype))
5564 return size;
5565
5566 /* Lose any fake structure return argument if it is passed on the stack. */
5567 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5568 && !ix86_keep_aggregate_return_pointer (funtype))
5569 {
5570 int nregs = ix86_function_regparm (funtype, fundecl);
5571 if (nregs == 0)
5572 return GET_MODE_SIZE (Pmode);
5573 }
5574
5575 return 0;
5576 }
5577
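/* As an illustration (hypothetical 32-bit declarations, not from these
   sources):

     void __attribute__ ((stdcall)) f (int a, int b);   returns 8
     void g (int a, int b);                             returns 0
     void __attribute__ ((stdcall)) h (int a, ...);     returns 0

   so F's callee pops its own 8 bytes of arguments (ret 8), while for G
   and H the caller adjusts the stack pointer after the call.  */
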
5578 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5579
5580 static bool
5581 ix86_legitimate_combined_insn (rtx insn)
5582 {
5583 /* Check operand constraints in case hard registers were propagated
5584 into insn pattern. This check prevents combine pass from
5585 generating insn patterns with invalid hard register operands.
5586 These invalid insns can eventually confuse reload to error out
5587 with a spill failure. See also PRs 46829 and 46843. */
5588 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5589 {
5590 int i;
5591
5592 extract_insn (insn);
5593 preprocess_constraints ();
5594
5595 for (i = 0; i < recog_data.n_operands; i++)
5596 {
5597 rtx op = recog_data.operand[i];
5598 enum machine_mode mode = GET_MODE (op);
5599 struct operand_alternative *op_alt;
5600 int offset = 0;
5601 bool win;
5602 int j;
5603
5604 /* A unary operator may be accepted by the predicate, but it
5605 is irrelevant for matching constraints. */
5606 if (UNARY_P (op))
5607 op = XEXP (op, 0);
5608
5609 if (GET_CODE (op) == SUBREG)
5610 {
5611 if (REG_P (SUBREG_REG (op))
5612 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5613 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5614 GET_MODE (SUBREG_REG (op)),
5615 SUBREG_BYTE (op),
5616 GET_MODE (op));
5617 op = SUBREG_REG (op);
5618 }
5619
5620 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5621 continue;
5622
5623 op_alt = recog_op_alt[i];
5624
5625 /* Operand has no constraints, anything is OK. */
5626 win = !recog_data.n_alternatives;
5627
5628 for (j = 0; j < recog_data.n_alternatives; j++)
5629 {
5630 if (op_alt[j].anything_ok
5631 || (op_alt[j].matches != -1
5632 && operands_match_p
5633 (recog_data.operand[i],
5634 recog_data.operand[op_alt[j].matches]))
5635 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5636 {
5637 win = true;
5638 break;
5639 }
5640 }
5641
5642 if (!win)
5643 return false;
5644 }
5645 }
5646
5647 return true;
5648 }
5649 \f
5650 /* Argument support functions. */
5651
5652 /* Return true when REGNO may be used to pass function parameters. */
5653 bool
5654 ix86_function_arg_regno_p (int regno)
5655 {
5656 int i;
5657 const int *parm_regs;
5658
5659 if (!TARGET_64BIT)
5660 {
5661 if (TARGET_MACHO)
5662 return (regno < REGPARM_MAX
5663 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5664 else
5665 return (regno < REGPARM_MAX
5666 || (TARGET_MMX && MMX_REGNO_P (regno)
5667 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5668 || (TARGET_SSE && SSE_REGNO_P (regno)
5669 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5670 }
5671
5672 if (TARGET_MACHO)
5673 {
5674 if (SSE_REGNO_P (regno) && TARGET_SSE)
5675 return true;
5676 }
5677 else
5678 {
5679 if (TARGET_SSE && SSE_REGNO_P (regno)
5680 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5681 return true;
5682 }
5683
5684 /* TODO: The function should depend on the current function's ABI, but
5685 builtins.c would need updating then. Therefore we use the
5686 default ABI. */
5687
5688 /* RAX is used as hidden argument to va_arg functions. */
5689 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5690 return true;
5691
5692 if (ix86_abi == MS_ABI)
5693 parm_regs = x86_64_ms_abi_int_parameter_registers;
5694 else
5695 parm_regs = x86_64_int_parameter_registers;
5696 for (i = 0; i < (ix86_abi == MS_ABI
5697 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5698 if (regno == parm_regs[i])
5699 return true;
5700 return false;
5701 }
5702
5703 /* Return true if we do not know how to pass TYPE solely in registers. */
5704
5705 static bool
5706 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5707 {
5708 if (must_pass_in_stack_var_size_or_pad (mode, type))
5709 return true;
5710
5711 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5712 The layout_type routine is crafty and tries to trick us into passing
5713 currently unsupported vector types on the stack by using TImode. */
5714 return (!TARGET_64BIT && mode == TImode
5715 && type && TREE_CODE (type) != VECTOR_TYPE);
5716 }
5717
5718 /* Return the size, in bytes, of the area reserved for arguments passed
5719 in registers for the function represented by FNDECL, depending on the
5720 ABI format used. */
5721 int
5722 ix86_reg_parm_stack_space (const_tree fndecl)
5723 {
5724 enum calling_abi call_abi = SYSV_ABI;
5725 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5726 call_abi = ix86_function_abi (fndecl);
5727 else
5728 call_abi = ix86_function_type_abi (fndecl);
5729 if (TARGET_64BIT && call_abi == MS_ABI)
5730 return 32;
5731 return 0;
5732 }
5733
5734 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5735 call ABI used. */
5736 enum calling_abi
5737 ix86_function_type_abi (const_tree fntype)
5738 {
5739 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5740 {
5741 enum calling_abi abi = ix86_abi;
5742 if (abi == SYSV_ABI)
5743 {
5744 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5745 abi = MS_ABI;
5746 }
5747 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5748 abi = SYSV_ABI;
5749 return abi;
5750 }
5751 return ix86_abi;
5752 }
5753
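/* As an illustration (hypothetical declarations, not from these
   sources):

     void __attribute__ ((ms_abi)) w64_callback (void *);
     void __attribute__ ((sysv_abi)) unix_helper (int);

   On a target whose default ABI is SYSV_ABI the first resolves to
   MS_ABI and the second stays SYSV_ABI; on a native 64-bit Windows
   target only the explicit sysv_abi attribute changes the result.  */
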
5754 static bool
5755 ix86_function_ms_hook_prologue (const_tree fn)
5756 {
5757 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5758 {
5759 if (decl_function_context (fn) != NULL_TREE)
5760 error_at (DECL_SOURCE_LOCATION (fn),
5761 "ms_hook_prologue is not compatible with nested function");
5762 else
5763 return true;
5764 }
5765 return false;
5766 }
5767
5768 static enum calling_abi
5769 ix86_function_abi (const_tree fndecl)
5770 {
5771 if (! fndecl)
5772 return ix86_abi;
5773 return ix86_function_type_abi (TREE_TYPE (fndecl));
5774 }
5775
5776 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5777 call ABI used. */
5778 enum calling_abi
5779 ix86_cfun_abi (void)
5780 {
5781 if (! cfun)
5782 return ix86_abi;
5783 return cfun->machine->call_abi;
5784 }
5785
5786 /* Write the extra assembler code needed to declare a function properly. */
5787
5788 void
5789 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5790 tree decl)
5791 {
5792 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5793
5794 if (is_ms_hook)
5795 {
5796 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5797 unsigned int filler_cc = 0xcccccccc;
5798
5799 for (i = 0; i < filler_count; i += 4)
5800 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5801 }
5802
5803 #ifdef SUBTARGET_ASM_UNWIND_INIT
5804 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5805 #endif
5806
5807 ASM_OUTPUT_LABEL (asm_out_file, fname);
5808
5809 /* Output magic byte marker, if hot-patch attribute is set. */
5810 if (is_ms_hook)
5811 {
5812 if (TARGET_64BIT)
5813 {
5814 /* leaq [%rsp + 0], %rsp */
5815 asm_fprintf (asm_out_file, ASM_BYTE
5816 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5817 }
5818 else
5819 {
5820 /* movl.s %edi, %edi
5821 push %ebp
5822 movl.s %esp, %ebp */
5823 asm_fprintf (asm_out_file, ASM_BYTE
5824 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5825 }
5826 }
5827 }
5828
5829 /* regclass.c */
5830 extern void init_regs (void);
5831
5832 /* Implementation of call abi switching target hook. Specific to FNDECL
5833 the specific call register sets are set. See also
5834 ix86_conditional_register_usage for more details. */
5835 void
5836 ix86_call_abi_override (const_tree fndecl)
5837 {
5838 if (fndecl == NULL_TREE)
5839 cfun->machine->call_abi = ix86_abi;
5840 else
5841 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5842 }
5843
5844 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5845 Avoid expensive re-initialization via init_regs each time we switch function
5846 context, since it is needed only during RTL expansion. */
5847 static void
5848 ix86_maybe_switch_abi (void)
5849 {
5850 if (TARGET_64BIT &&
5851 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5852 reinit_regs ();
5853 }
5854
5855 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5856 for a call to a function whose data type is FNTYPE.
5857 For a library call, FNTYPE is 0. */
5858
5859 void
5860 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5861 tree fntype, /* tree ptr for function decl */
5862 rtx libname, /* SYMBOL_REF of library name or 0 */
5863 tree fndecl,
5864 int caller)
5865 {
5866 struct cgraph_local_info *i;
5867 tree fnret_type;
5868
5869 memset (cum, 0, sizeof (*cum));
5870
5871 /* Initialize for the current callee. */
5872 if (caller)
5873 {
5874 cfun->machine->callee_pass_avx256_p = false;
5875 cfun->machine->callee_return_avx256_p = false;
5876 }
5877
5878 if (fndecl)
5879 {
5880 i = cgraph_local_info (fndecl);
5881 cum->call_abi = ix86_function_abi (fndecl);
5882 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5883 }
5884 else
5885 {
5886 i = NULL;
5887 cum->call_abi = ix86_function_type_abi (fntype);
5888 if (fntype)
5889 fnret_type = TREE_TYPE (fntype);
5890 else
5891 fnret_type = NULL;
5892 }
5893
5894 if (TARGET_VZEROUPPER && fnret_type)
5895 {
5896 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5897 false);
5898 if (function_pass_avx256_p (fnret_value))
5899 {
5900 /* The return value of this function uses 256bit AVX modes. */
5901 if (caller)
5902 cfun->machine->callee_return_avx256_p = true;
5903 else
5904 cfun->machine->caller_return_avx256_p = true;
5905 }
5906 }
5907
5908 cum->caller = caller;
5909
5910 /* Set up the number of registers to use for passing arguments. */
5911
5912 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5913 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5914 "or subtarget optimization implying it");
5915 cum->nregs = ix86_regparm;
5916 if (TARGET_64BIT)
5917 {
5918 cum->nregs = (cum->call_abi == SYSV_ABI
5919 ? X86_64_REGPARM_MAX
5920 : X86_64_MS_REGPARM_MAX);
5921 }
5922 if (TARGET_SSE)
5923 {
5924 cum->sse_nregs = SSE_REGPARM_MAX;
5925 if (TARGET_64BIT)
5926 {
5927 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5928 ? X86_64_SSE_REGPARM_MAX
5929 : X86_64_MS_SSE_REGPARM_MAX);
5930 }
5931 }
5932 if (TARGET_MMX)
5933 cum->mmx_nregs = MMX_REGPARM_MAX;
5934 cum->warn_avx = true;
5935 cum->warn_sse = true;
5936 cum->warn_mmx = true;
5937
5938 /* Because the type might mismatch between the caller and the callee, we
5939 need to use the actual type of the function for local calls.
5940 FIXME: cgraph_analyze can be told to actually record whether the function
5941 uses va_start, so for local functions maybe_vaarg can be made more
5942 aggressive, helping K&R code.
5943 FIXME: once the type system is fixed, we won't need this code anymore. */
5944 if (i && i->local && i->can_change_signature)
5945 fntype = TREE_TYPE (fndecl);
5946 cum->maybe_vaarg = (fntype
5947 ? (!prototype_p (fntype) || stdarg_p (fntype))
5948 : !libname);
5949
5950 if (!TARGET_64BIT)
5951 {
5952 /* If there are variable arguments, then we won't pass anything
5953 in registers in 32-bit mode. */
5954 if (stdarg_p (fntype))
5955 {
5956 cum->nregs = 0;
5957 cum->sse_nregs = 0;
5958 cum->mmx_nregs = 0;
5959 cum->warn_avx = 0;
5960 cum->warn_sse = 0;
5961 cum->warn_mmx = 0;
5962 return;
5963 }
5964
5965 /* Use ecx and edx registers if function has fastcall attribute,
5966 else look for regparm information. */
5967 if (fntype)
5968 {
5969 unsigned int ccvt = ix86_get_callcvt (fntype);
5970 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5971 {
5972 cum->nregs = 1;
5973 cum->fastcall = 1; /* Same first register as in fastcall. */
5974 }
5975 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5976 {
5977 cum->nregs = 2;
5978 cum->fastcall = 1;
5979 }
5980 else
5981 cum->nregs = ix86_function_regparm (fntype, fndecl);
5982 }
5983
5984 /* Set up the number of SSE registers used for passing SFmode
5985 and DFmode arguments. Warn for mismatching ABI. */
5986 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5987 }
5988 }
5989
5990 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5991 But in the case of vector types, it is some vector mode.
5992
5993 When we have only some of our vector isa extensions enabled, then there
5994 are some modes for which vector_mode_supported_p is false. For these
5995 modes, the generic vector support in gcc will choose some non-vector mode
5996 in order to implement the type. By computing the natural mode, we'll
5997 select the proper ABI location for the operand and not depend on whatever
5998 the middle-end decides to do with these vector types.
5999
6000 The middle-end can't deal with vector types > 16 bytes. In this
6001 case, we return the original mode and warn about the ABI change if
6002 CUM isn't NULL. */
6003
6004 static enum machine_mode
6005 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6006 {
6007 enum machine_mode mode = TYPE_MODE (type);
6008
6009 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6010 {
6011 HOST_WIDE_INT size = int_size_in_bytes (type);
6012 if ((size == 8 || size == 16 || size == 32)
6013 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6014 && TYPE_VECTOR_SUBPARTS (type) > 1)
6015 {
6016 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6017
6018 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6019 mode = MIN_MODE_VECTOR_FLOAT;
6020 else
6021 mode = MIN_MODE_VECTOR_INT;
6022
6023 /* Get the mode which has this inner mode and number of units. */
6024 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6025 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6026 && GET_MODE_INNER (mode) == innermode)
6027 {
6028 if (size == 32 && !TARGET_AVX)
6029 {
6030 static bool warnedavx;
6031
6032 if (cum
6033 && !warnedavx
6034 && cum->warn_avx)
6035 {
6036 warnedavx = true;
6037 warning (0, "AVX vector argument without AVX "
6038 "enabled changes the ABI");
6039 }
6040 return TYPE_MODE (type);
6041 }
6042 else if ((size == 8 || size == 16) && !TARGET_SSE)
6043 {
6044 static bool warnedsse;
6045
6046 if (cum
6047 && !warnedsse
6048 && cum->warn_sse)
6049 {
6050 warnedsse = true;
6051 warning (0, "SSE vector argument without SSE "
6052 "enabled changes the ABI");
6053 }
6054 return mode;
6055 }
6056 else
6057 return mode;
6058 }
6059
6060 gcc_unreachable ();
6061 }
6062 }
6063
6064 return mode;
6065 }
6066
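/* As an illustration (a hypothetical generic vector type, not from
   these sources):

     typedef int v8si __attribute__ ((vector_size (32)));

   Compiled without -mavx, TYPE_MODE of v8si is not a vector mode, the
   loop above finds V8SImode, and because AVX is disabled the original
   mode is returned; if CUM requests it, the "AVX vector argument
   without AVX enabled changes the ABI" warning is issued once.  */
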
6067 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6068 this may not agree with the mode that the type system has chosen for the
6069 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6070 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6071
6072 static rtx
6073 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6074 unsigned int regno)
6075 {
6076 rtx tmp;
6077
6078 if (orig_mode != BLKmode)
6079 tmp = gen_rtx_REG (orig_mode, regno);
6080 else
6081 {
6082 tmp = gen_rtx_REG (mode, regno);
6083 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6084 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6085 }
6086
6087 return tmp;
6088 }
6089
6090 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6091 The goal of this code is to classify each 8-byte chunk of an incoming
6092 argument by register class and assign registers accordingly. */
6093
6094 /* Return the union class of CLASS1 and CLASS2.
6095 See the x86-64 PS ABI for details. */
6096
6097 static enum x86_64_reg_class
6098 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6099 {
6100 /* Rule #1: If both classes are equal, this is the resulting class. */
6101 if (class1 == class2)
6102 return class1;
6103
6104 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6105 the other class. */
6106 if (class1 == X86_64_NO_CLASS)
6107 return class2;
6108 if (class2 == X86_64_NO_CLASS)
6109 return class1;
6110
6111 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6112 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6113 return X86_64_MEMORY_CLASS;
6114
6115 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6116 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6117 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6118 return X86_64_INTEGERSI_CLASS;
6119 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6120 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6121 return X86_64_INTEGER_CLASS;
6122
6123 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6124 MEMORY is used. */
6125 if (class1 == X86_64_X87_CLASS
6126 || class1 == X86_64_X87UP_CLASS
6127 || class1 == X86_64_COMPLEX_X87_CLASS
6128 || class2 == X86_64_X87_CLASS
6129 || class2 == X86_64_X87UP_CLASS
6130 || class2 == X86_64_COMPLEX_X87_CLASS)
6131 return X86_64_MEMORY_CLASS;
6132
6133 /* Rule #6: Otherwise class SSE is used. */
6134 return X86_64_SSE_CLASS;
6135 }
6136
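/* As an illustration (a hypothetical union, not from these sources):

     union u { float f; int i; };

   classifies its single 8-byte chunk as
   merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS)
   == X86_64_INTEGERSI_CLASS by rule #4, so the union is passed in an
   integer register.  Had one member been a long double, rule #5 would
   have forced the whole union into memory instead.  */
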
6137 /* Classify the argument of type TYPE and mode MODE.
6138 CLASSES will be filled by the register class used to pass each word
6139 of the operand. The number of words is returned. In case the parameter
6140 should be passed in memory, 0 is returned. As a special case for zero
6141 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6142
6143 BIT_OFFSET is used internally for handling records and specifies the
6144 offset in bits modulo 256 to avoid overflow cases.
6145
6146 See the x86-64 PS ABI for details.
6147 */
6148
6149 static int
6150 classify_argument (enum machine_mode mode, const_tree type,
6151 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6152 {
6153 HOST_WIDE_INT bytes =
6154 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6155 int words
6156 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6157
6158 /* Variable sized entities are always passed/returned in memory. */
6159 if (bytes < 0)
6160 return 0;
6161
6162 if (mode != VOIDmode
6163 && targetm.calls.must_pass_in_stack (mode, type))
6164 return 0;
6165
6166 if (type && AGGREGATE_TYPE_P (type))
6167 {
6168 int i;
6169 tree field;
6170 enum x86_64_reg_class subclasses[MAX_CLASSES];
6171
6172 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6173 if (bytes > 32)
6174 return 0;
6175
6176 for (i = 0; i < words; i++)
6177 classes[i] = X86_64_NO_CLASS;
6178
6179 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6180 signal the memory class, so handle this as a special case. */
6181 if (!words)
6182 {
6183 classes[0] = X86_64_NO_CLASS;
6184 return 1;
6185 }
6186
6187 /* Classify each field of record and merge classes. */
6188 switch (TREE_CODE (type))
6189 {
6190 case RECORD_TYPE:
6191 /* And now merge the fields of structure. */
6192 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6193 {
6194 if (TREE_CODE (field) == FIELD_DECL)
6195 {
6196 int num;
6197
6198 if (TREE_TYPE (field) == error_mark_node)
6199 continue;
6200
6201 /* Bitfields are always classified as integer. Handle them
6202 early, since later code would consider them to be
6203 misaligned integers. */
6204 if (DECL_BIT_FIELD (field))
6205 {
6206 for (i = (int_bit_position (field)
6207 + (bit_offset % 64)) / 8 / 8;
6208 i < ((int_bit_position (field) + (bit_offset % 64))
6209 + tree_low_cst (DECL_SIZE (field), 0)
6210 + 63) / 8 / 8; i++)
6211 classes[i] =
6212 merge_classes (X86_64_INTEGER_CLASS,
6213 classes[i]);
6214 }
6215 else
6216 {
6217 int pos;
6218
6219 type = TREE_TYPE (field);
6220
6221 /* Flexible array member is ignored. */
6222 if (TYPE_MODE (type) == BLKmode
6223 && TREE_CODE (type) == ARRAY_TYPE
6224 && TYPE_SIZE (type) == NULL_TREE
6225 && TYPE_DOMAIN (type) != NULL_TREE
6226 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6227 == NULL_TREE))
6228 {
6229 static bool warned;
6230
6231 if (!warned && warn_psabi)
6232 {
6233 warned = true;
6234 inform (input_location,
6235 "the ABI of passing struct with"
6236 " a flexible array member has"
6237 " changed in GCC 4.4");
6238 }
6239 continue;
6240 }
6241 num = classify_argument (TYPE_MODE (type), type,
6242 subclasses,
6243 (int_bit_position (field)
6244 + bit_offset) % 256);
6245 if (!num)
6246 return 0;
6247 pos = (int_bit_position (field)
6248 + (bit_offset % 64)) / 8 / 8;
6249 for (i = 0; i < num && (i + pos) < words; i++)
6250 classes[i + pos] =
6251 merge_classes (subclasses[i], classes[i + pos]);
6252 }
6253 }
6254 }
6255 break;
6256
6257 case ARRAY_TYPE:
6258 /* Arrays are handled as small records. */
6259 {
6260 int num;
6261 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6262 TREE_TYPE (type), subclasses, bit_offset);
6263 if (!num)
6264 return 0;
6265
6266 /* The partial classes are now full classes. */
6267 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6268 subclasses[0] = X86_64_SSE_CLASS;
6269 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6270 && !((bit_offset % 64) == 0 && bytes == 4))
6271 subclasses[0] = X86_64_INTEGER_CLASS;
6272
6273 for (i = 0; i < words; i++)
6274 classes[i] = subclasses[i % num];
6275
6276 break;
6277 }
6278 case UNION_TYPE:
6279 case QUAL_UNION_TYPE:
6280 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6281
6282 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6283 {
6284 if (TREE_CODE (field) == FIELD_DECL)
6285 {
6286 int num;
6287
6288 if (TREE_TYPE (field) == error_mark_node)
6289 continue;
6290
6291 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6292 TREE_TYPE (field), subclasses,
6293 bit_offset);
6294 if (!num)
6295 return 0;
6296 for (i = 0; i < num; i++)
6297 classes[i] = merge_classes (subclasses[i], classes[i]);
6298 }
6299 }
6300 break;
6301
6302 default:
6303 gcc_unreachable ();
6304 }
6305
6306 if (words > 2)
6307 {
6308 /* When size > 16 bytes, if the first one isn't
6309 X86_64_SSE_CLASS or any other ones aren't
6310 X86_64_SSEUP_CLASS, everything should be passed in
6311 memory. */
6312 if (classes[0] != X86_64_SSE_CLASS)
6313 return 0;
6314
6315 for (i = 1; i < words; i++)
6316 if (classes[i] != X86_64_SSEUP_CLASS)
6317 return 0;
6318 }
6319
6320 /* Final merger cleanup. */
6321 for (i = 0; i < words; i++)
6322 {
6323 /* If one class is MEMORY, everything should be passed in
6324 memory. */
6325 if (classes[i] == X86_64_MEMORY_CLASS)
6326 return 0;
6327
6328 /* X86_64_SSEUP_CLASS should always be preceded by
6329 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6330 if (classes[i] == X86_64_SSEUP_CLASS
6331 && classes[i - 1] != X86_64_SSE_CLASS
6332 && classes[i - 1] != X86_64_SSEUP_CLASS)
6333 {
6334 /* The first one should never be X86_64_SSEUP_CLASS. */
6335 gcc_assert (i != 0);
6336 classes[i] = X86_64_SSE_CLASS;
6337 }
6338
6339 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6340 everything should be passed in memory. */
6341 if (classes[i] == X86_64_X87UP_CLASS
6342 && (classes[i - 1] != X86_64_X87_CLASS))
6343 {
6344 static bool warned;
6345
6346 /* The first one should never be X86_64_X87UP_CLASS. */
6347 gcc_assert (i != 0);
6348 if (!warned && warn_psabi)
6349 {
6350 warned = true;
6351 inform (input_location,
6352 "the ABI of passing union with long double"
6353 " has changed in GCC 4.4");
6354 }
6355 return 0;
6356 }
6357 }
6358 return words;
6359 }
6360
6361 /* Compute the alignment needed. We align all types to their natural
6362 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6363 if (mode != VOIDmode && mode != BLKmode)
6364 {
6365 int mode_alignment = GET_MODE_BITSIZE (mode);
6366
6367 if (mode == XFmode)
6368 mode_alignment = 128;
6369 else if (mode == XCmode)
6370 mode_alignment = 256;
6371 if (COMPLEX_MODE_P (mode))
6372 mode_alignment /= 2;
6373 /* Misaligned fields are always returned in memory. */
6374 if (bit_offset % mode_alignment)
6375 return 0;
6376 }
6377
6378 /* For V1xx modes, just use the base mode. */
6379 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6380 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6381 mode = GET_MODE_INNER (mode);
6382
6383 /* Classification of atomic types. */
6384 switch (mode)
6385 {
6386 case SDmode:
6387 case DDmode:
6388 classes[0] = X86_64_SSE_CLASS;
6389 return 1;
6390 case TDmode:
6391 classes[0] = X86_64_SSE_CLASS;
6392 classes[1] = X86_64_SSEUP_CLASS;
6393 return 2;
6394 case DImode:
6395 case SImode:
6396 case HImode:
6397 case QImode:
6398 case CSImode:
6399 case CHImode:
6400 case CQImode:
6401 {
6402 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6403
6404 if (size <= 32)
6405 {
6406 classes[0] = X86_64_INTEGERSI_CLASS;
6407 return 1;
6408 }
6409 else if (size <= 64)
6410 {
6411 classes[0] = X86_64_INTEGER_CLASS;
6412 return 1;
6413 }
6414 else if (size <= 64+32)
6415 {
6416 classes[0] = X86_64_INTEGER_CLASS;
6417 classes[1] = X86_64_INTEGERSI_CLASS;
6418 return 2;
6419 }
6420 else if (size <= 64+64)
6421 {
6422 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6423 return 2;
6424 }
6425 else
6426 gcc_unreachable ();
6427 }
6428 case CDImode:
6429 case TImode:
6430 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6431 return 2;
6432 case COImode:
6433 case OImode:
6434 /* OImode shouldn't be used directly. */
6435 gcc_unreachable ();
6436 case CTImode:
6437 return 0;
6438 case SFmode:
6439 if (!(bit_offset % 64))
6440 classes[0] = X86_64_SSESF_CLASS;
6441 else
6442 classes[0] = X86_64_SSE_CLASS;
6443 return 1;
6444 case DFmode:
6445 classes[0] = X86_64_SSEDF_CLASS;
6446 return 1;
6447 case XFmode:
6448 classes[0] = X86_64_X87_CLASS;
6449 classes[1] = X86_64_X87UP_CLASS;
6450 return 2;
6451 case TFmode:
6452 classes[0] = X86_64_SSE_CLASS;
6453 classes[1] = X86_64_SSEUP_CLASS;
6454 return 2;
6455 case SCmode:
6456 classes[0] = X86_64_SSE_CLASS;
6457 if (!(bit_offset % 64))
6458 return 1;
6459 else
6460 {
6461 static bool warned;
6462
6463 if (!warned && warn_psabi)
6464 {
6465 warned = true;
6466 inform (input_location,
6467 "the ABI of passing structure with complex float"
6468 " member has changed in GCC 4.4");
6469 }
6470 classes[1] = X86_64_SSESF_CLASS;
6471 return 2;
6472 }
6473 case DCmode:
6474 classes[0] = X86_64_SSEDF_CLASS;
6475 classes[1] = X86_64_SSEDF_CLASS;
6476 return 2;
6477 case XCmode:
6478 classes[0] = X86_64_COMPLEX_X87_CLASS;
6479 return 1;
6480 case TCmode:
6481 /* This mode is larger than 16 bytes. */
6482 return 0;
6483 case V8SFmode:
6484 case V8SImode:
6485 case V32QImode:
6486 case V16HImode:
6487 case V4DFmode:
6488 case V4DImode:
6489 classes[0] = X86_64_SSE_CLASS;
6490 classes[1] = X86_64_SSEUP_CLASS;
6491 classes[2] = X86_64_SSEUP_CLASS;
6492 classes[3] = X86_64_SSEUP_CLASS;
6493 return 4;
6494 case V4SFmode:
6495 case V4SImode:
6496 case V16QImode:
6497 case V8HImode:
6498 case V2DFmode:
6499 case V2DImode:
6500 classes[0] = X86_64_SSE_CLASS;
6501 classes[1] = X86_64_SSEUP_CLASS;
6502 return 2;
6503 case V1TImode:
6504 case V1DImode:
6505 case V2SFmode:
6506 case V2SImode:
6507 case V4HImode:
6508 case V8QImode:
6509 classes[0] = X86_64_SSE_CLASS;
6510 return 1;
6511 case BLKmode:
6512 case VOIDmode:
6513 return 0;
6514 default:
6515 gcc_assert (VECTOR_MODE_P (mode));
6516
6517 if (bytes > 16)
6518 return 0;
6519
6520 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6521
6522 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6523 classes[0] = X86_64_INTEGERSI_CLASS;
6524 else
6525 classes[0] = X86_64_INTEGER_CLASS;
6526 classes[1] = X86_64_INTEGER_CLASS;
6527 return 1 + (bytes > 8);
6528 }
6529 }
6530
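/* As an illustration (a hypothetical aggregate, not from these
   sources): on x86-64,

     struct s { double d; int i; int j; };

   is 16 bytes, so classify_argument returns 2 with

     classes[0] = X86_64_SSEDF_CLASS    (the double)
     classes[1] = X86_64_INTEGER_CLASS  (the two ints, merged)

   and the struct travels in one SSE and one integer register, whereas
   any aggregate larger than 32 bytes is rejected up front and passed
   in memory.  */
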
6531 /* Examine the argument and set the number of registers required in each
6532 class. Return 0 iff the parameter should be passed in memory. */
6533 static int
6534 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6535 int *int_nregs, int *sse_nregs)
6536 {
6537 enum x86_64_reg_class regclass[MAX_CLASSES];
6538 int n = classify_argument (mode, type, regclass, 0);
6539
6540 *int_nregs = 0;
6541 *sse_nregs = 0;
6542 if (!n)
6543 return 0;
6544 for (n--; n >= 0; n--)
6545 switch (regclass[n])
6546 {
6547 case X86_64_INTEGER_CLASS:
6548 case X86_64_INTEGERSI_CLASS:
6549 (*int_nregs)++;
6550 break;
6551 case X86_64_SSE_CLASS:
6552 case X86_64_SSESF_CLASS:
6553 case X86_64_SSEDF_CLASS:
6554 (*sse_nregs)++;
6555 break;
6556 case X86_64_NO_CLASS:
6557 case X86_64_SSEUP_CLASS:
6558 break;
6559 case X86_64_X87_CLASS:
6560 case X86_64_X87UP_CLASS:
6561 if (!in_return)
6562 return 0;
6563 break;
6564 case X86_64_COMPLEX_X87_CLASS:
6565 return in_return ? 2 : 0;
6566 case X86_64_MEMORY_CLASS:
6567 gcc_unreachable ();
6568 }
6569 return 1;
6570 }
6571
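/* As an illustration, continuing the hypothetical struct used for
   classify_argument above: for

     struct s { double d; int i; int j; };

   examine_argument sets *INT_NREGS = 1 and *SSE_NREGS = 1 and returns
   1, while a type containing a long double yields 0 unless IN_RETURN
   is set, since the x87 classes are usable only for return values.  */
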
6572 /* Construct a container for the argument used by the GCC interface. See
6573 FUNCTION_ARG for the detailed description. */
6574
6575 static rtx
6576 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6577 const_tree type, int in_return, int nintregs, int nsseregs,
6578 const int *intreg, int sse_regno)
6579 {
6580 /* The following variables hold the static issued_error state. */
6581 static bool issued_sse_arg_error;
6582 static bool issued_sse_ret_error;
6583 static bool issued_x87_ret_error;
6584
6585 enum machine_mode tmpmode;
6586 int bytes =
6587 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6588 enum x86_64_reg_class regclass[MAX_CLASSES];
6589 int n;
6590 int i;
6591 int nexps = 0;
6592 int needed_sseregs, needed_intregs;
6593 rtx exp[MAX_CLASSES];
6594 rtx ret;
6595
6596 n = classify_argument (mode, type, regclass, 0);
6597 if (!n)
6598 return NULL;
6599 if (!examine_argument (mode, type, in_return, &needed_intregs,
6600 &needed_sseregs))
6601 return NULL;
6602 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6603 return NULL;
6604
6605 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6606 some less clueful developer tries to use floating-point anyway. */
6607 if (needed_sseregs && !TARGET_SSE)
6608 {
6609 if (in_return)
6610 {
6611 if (!issued_sse_ret_error)
6612 {
6613 error ("SSE register return with SSE disabled");
6614 issued_sse_ret_error = true;
6615 }
6616 }
6617 else if (!issued_sse_arg_error)
6618 {
6619 error ("SSE register argument with SSE disabled");
6620 issued_sse_arg_error = true;
6621 }
6622 return NULL;
6623 }
6624
6625 /* Likewise, error if the ABI requires us to return values in the
6626 x87 registers and the user specified -mno-80387. */
6627 if (!TARGET_80387 && in_return)
6628 for (i = 0; i < n; i++)
6629 if (regclass[i] == X86_64_X87_CLASS
6630 || regclass[i] == X86_64_X87UP_CLASS
6631 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6632 {
6633 if (!issued_x87_ret_error)
6634 {
6635 error ("x87 register return with x87 disabled");
6636 issued_x87_ret_error = true;
6637 }
6638 return NULL;
6639 }
6640
6641 /* First construct the simple cases. Avoid SCmode, since we want to use
6642 a single register to pass this type. */
6643 if (n == 1 && mode != SCmode)
6644 switch (regclass[0])
6645 {
6646 case X86_64_INTEGER_CLASS:
6647 case X86_64_INTEGERSI_CLASS:
6648 return gen_rtx_REG (mode, intreg[0]);
6649 case X86_64_SSE_CLASS:
6650 case X86_64_SSESF_CLASS:
6651 case X86_64_SSEDF_CLASS:
6652 if (mode != BLKmode)
6653 return gen_reg_or_parallel (mode, orig_mode,
6654 SSE_REGNO (sse_regno));
6655 break;
6656 case X86_64_X87_CLASS:
6657 case X86_64_COMPLEX_X87_CLASS:
6658 return gen_rtx_REG (mode, FIRST_STACK_REG);
6659 case X86_64_NO_CLASS:
6660 /* Zero sized array, struct or class. */
6661 return NULL;
6662 default:
6663 gcc_unreachable ();
6664 }
6665 if (n == 2
6666 && regclass[0] == X86_64_SSE_CLASS
6667 && regclass[1] == X86_64_SSEUP_CLASS
6668 && mode != BLKmode)
6669 return gen_reg_or_parallel (mode, orig_mode,
6670 SSE_REGNO (sse_regno));
6671 if (n == 4
6672 && regclass[0] == X86_64_SSE_CLASS
6673 && regclass[1] == X86_64_SSEUP_CLASS
6674 && regclass[2] == X86_64_SSEUP_CLASS
6675 && regclass[3] == X86_64_SSEUP_CLASS
6676 && mode != BLKmode)
6677 return gen_reg_or_parallel (mode, orig_mode,
6678 SSE_REGNO (sse_regno));
6679 if (n == 2
6680 && regclass[0] == X86_64_X87_CLASS
6681 && regclass[1] == X86_64_X87UP_CLASS)
6682 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6683
6684 if (n == 2
6685 && regclass[0] == X86_64_INTEGER_CLASS
6686 && regclass[1] == X86_64_INTEGER_CLASS
6687 && (mode == CDImode || mode == TImode || mode == TFmode)
6688 && intreg[0] + 1 == intreg[1])
6689 return gen_rtx_REG (mode, intreg[0]);
6690
6691 /* Otherwise figure out the entries of the PARALLEL. */
6692 for (i = 0; i < n; i++)
6693 {
6694 int pos;
6695
6696 switch (regclass[i])
6697 {
6698 case X86_64_NO_CLASS:
6699 break;
6700 case X86_64_INTEGER_CLASS:
6701 case X86_64_INTEGERSI_CLASS:
6702 /* Merge TImodes on aligned occasions here too. */
6703 if (i * 8 + 8 > bytes)
6704 tmpmode
6705 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6706 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6707 tmpmode = SImode;
6708 else
6709 tmpmode = DImode;
6710 /* We've requested a chunk (e.g. 3 bytes) for which no
6711 integer mode exists. Use DImode. */
6712 if (tmpmode == BLKmode)
6713 tmpmode = DImode;
6714 exp [nexps++]
6715 = gen_rtx_EXPR_LIST (VOIDmode,
6716 gen_rtx_REG (tmpmode, *intreg),
6717 GEN_INT (i*8));
6718 intreg++;
6719 break;
6720 case X86_64_SSESF_CLASS:
6721 exp [nexps++]
6722 = gen_rtx_EXPR_LIST (VOIDmode,
6723 gen_rtx_REG (SFmode,
6724 SSE_REGNO (sse_regno)),
6725 GEN_INT (i*8));
6726 sse_regno++;
6727 break;
6728 case X86_64_SSEDF_CLASS:
6729 exp [nexps++]
6730 = gen_rtx_EXPR_LIST (VOIDmode,
6731 gen_rtx_REG (DFmode,
6732 SSE_REGNO (sse_regno)),
6733 GEN_INT (i*8));
6734 sse_regno++;
6735 break;
6736 case X86_64_SSE_CLASS:
6737 pos = i;
6738 switch (n)
6739 {
6740 case 1:
6741 tmpmode = DImode;
6742 break;
6743 case 2:
6744 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6745 {
6746 tmpmode = TImode;
6747 i++;
6748 }
6749 else
6750 tmpmode = DImode;
6751 break;
6752 case 4:
6753 gcc_assert (i == 0
6754 && regclass[1] == X86_64_SSEUP_CLASS
6755 && regclass[2] == X86_64_SSEUP_CLASS
6756 && regclass[3] == X86_64_SSEUP_CLASS);
6757 tmpmode = OImode;
6758 i += 3;
6759 break;
6760 default:
6761 gcc_unreachable ();
6762 }
6763 exp [nexps++]
6764 = gen_rtx_EXPR_LIST (VOIDmode,
6765 gen_rtx_REG (tmpmode,
6766 SSE_REGNO (sse_regno)),
6767 GEN_INT (pos*8));
6768 sse_regno++;
6769 break;
6770 default:
6771 gcc_unreachable ();
6772 }
6773 }
6774
6775 /* Empty aligned struct, union or class. */
6776 if (nexps == 0)
6777 return NULL;
6778
6779 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6780 for (i = 0; i < nexps; i++)
6781 XVECEXP (ret, 0, i) = exp [i];
6782 return ret;
6783 }
6784
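/* Illustrative sketch (using the same hypothetical struct s { double d;
   long l; } as above, and assuming the first parameter registers are
   still free): construct_container returns roughly

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in the next SSE register at byte
   offset 0 and the second in the next integer register at offset 8.  */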
6785 /* Update the data in CUM to advance over an argument of mode MODE
6786 and data type TYPE. (TYPE is null for libcalls where that information
6787 may not be available.) */
6788
6789 static void
6790 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6791 const_tree type, HOST_WIDE_INT bytes,
6792 HOST_WIDE_INT words)
6793 {
6794 switch (mode)
6795 {
6796 default:
6797 break;
6798
6799 case BLKmode:
6800 if (bytes < 0)
6801 break;
6802 /* FALLTHRU */
6803
6804 case DImode:
6805 case SImode:
6806 case HImode:
6807 case QImode:
6808 cum->words += words;
6809 cum->nregs -= words;
6810 cum->regno += words;
6811
6812 if (cum->nregs <= 0)
6813 {
6814 cum->nregs = 0;
6815 cum->regno = 0;
6816 }
6817 break;
6818
6819 case OImode:
6820 /* OImode shouldn't be used directly. */
6821 gcc_unreachable ();
6822
6823 case DFmode:
6824 if (cum->float_in_sse < 2)
6825 break;
6826 case SFmode:
6827 if (cum->float_in_sse < 1)
6828 break;
6829 /* FALLTHRU */
6830
6831 case V8SFmode:
6832 case V8SImode:
6833 case V32QImode:
6834 case V16HImode:
6835 case V4DFmode:
6836 case V4DImode:
6837 case TImode:
6838 case V16QImode:
6839 case V8HImode:
6840 case V4SImode:
6841 case V2DImode:
6842 case V4SFmode:
6843 case V2DFmode:
6844 if (!type || !AGGREGATE_TYPE_P (type))
6845 {
6846 cum->sse_words += words;
6847 cum->sse_nregs -= 1;
6848 cum->sse_regno += 1;
6849 if (cum->sse_nregs <= 0)
6850 {
6851 cum->sse_nregs = 0;
6852 cum->sse_regno = 0;
6853 }
6854 }
6855 break;
6856
6857 case V8QImode:
6858 case V4HImode:
6859 case V2SImode:
6860 case V2SFmode:
6861 case V1TImode:
6862 case V1DImode:
6863 if (!type || !AGGREGATE_TYPE_P (type))
6864 {
6865 cum->mmx_words += words;
6866 cum->mmx_nregs -= 1;
6867 cum->mmx_regno += 1;
6868 if (cum->mmx_nregs <= 0)
6869 {
6870 cum->mmx_nregs = 0;
6871 cum->mmx_regno = 0;
6872 }
6873 }
6874 break;
6875 }
6876 }
6877
6878 static void
6879 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6880 const_tree type, HOST_WIDE_INT words, bool named)
6881 {
6882 int int_nregs, sse_nregs;
6883
6884 /* Unnamed 256bit vector mode parameters are passed on stack. */
6885 if (!named && VALID_AVX256_REG_MODE (mode))
6886 return;
6887
6888 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6889 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6890 {
6891 cum->nregs -= int_nregs;
6892 cum->sse_nregs -= sse_nregs;
6893 cum->regno += int_nregs;
6894 cum->sse_regno += sse_nregs;
6895 }
6896 else
6897 {
6898 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6899 cum->words = (cum->words + align - 1) & ~(align - 1);
6900 cum->words += words;
6901 }
6902 }
6903
6904 static void
6905 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6906 HOST_WIDE_INT words)
6907 {
6908 /* Anything other than 1, 2, 4 or 8 bytes should have been passed indirectly. */
6909 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6910
6911 cum->words += words;
6912 if (cum->nregs > 0)
6913 {
6914 cum->nregs -= 1;
6915 cum->regno += 1;
6916 }
6917 }
6918
6919 /* Update the data in CUM to advance over an argument of mode MODE and
6920 data type TYPE. (TYPE is null for libcalls where that information
6921 may not be available.) */
6922
6923 static void
6924 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6925 const_tree type, bool named)
6926 {
6927 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6928 HOST_WIDE_INT bytes, words;
6929
6930 if (mode == BLKmode)
6931 bytes = int_size_in_bytes (type);
6932 else
6933 bytes = GET_MODE_SIZE (mode);
6934 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6935
6936 if (type)
6937 mode = type_natural_mode (type, NULL);
6938
6939 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6940 function_arg_advance_ms_64 (cum, bytes, words);
6941 else if (TARGET_64BIT)
6942 function_arg_advance_64 (cum, mode, type, words, named);
6943 else
6944 function_arg_advance_32 (cum, mode, type, bytes, words);
6945 }
6946
6947 /* Define where to put the arguments to a function.
6948 Value is zero to push the argument on the stack,
6949 or a hard register in which to store the argument.
6950
6951 MODE is the argument's machine mode.
6952 TYPE is the data type of the argument (as a tree).
6953 This is null for libcalls where that information may
6954 not be available.
6955 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6956 the preceding args and about the function being called.
6957 NAMED is nonzero if this argument is a named parameter
6958 (otherwise it is an extra parameter matching an ellipsis). */
6959
6960 static rtx
6961 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6962 enum machine_mode orig_mode, const_tree type,
6963 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6964 {
6965 static bool warnedsse, warnedmmx;
6966
6967 /* Avoid the AL settings for the Unix64 ABI. */
6968 if (mode == VOIDmode)
6969 return constm1_rtx;
6970
6971 switch (mode)
6972 {
6973 default:
6974 break;
6975
6976 case BLKmode:
6977 if (bytes < 0)
6978 break;
6979 /* FALLTHRU */
6980 case DImode:
6981 case SImode:
6982 case HImode:
6983 case QImode:
6984 if (words <= cum->nregs)
6985 {
6986 int regno = cum->regno;
6987
6988 /* Fastcall allocates the first two DWORD (SImode) or
6989 smaller arguments to ECX and EDX if the argument isn't an
6990 aggregate type. */
6991 if (cum->fastcall)
6992 {
6993 if (mode == BLKmode
6994 || mode == DImode
6995 || (type && AGGREGATE_TYPE_P (type)))
6996 break;
6997
6998 /* ECX, not EAX, is the first allocated register. */
6999 if (regno == AX_REG)
7000 regno = CX_REG;
7001 }
7002 return gen_rtx_REG (mode, regno);
7003 }
7004 break;
7005
7006 case DFmode:
7007 if (cum->float_in_sse < 2)
7008 break;
7009 case SFmode:
7010 if (cum->float_in_sse < 1)
7011 break;
7012 /* FALLTHRU */
7013 case TImode:
7014 /* In 32bit, we pass TImode in xmm registers. */
7015 case V16QImode:
7016 case V8HImode:
7017 case V4SImode:
7018 case V2DImode:
7019 case V4SFmode:
7020 case V2DFmode:
7021 if (!type || !AGGREGATE_TYPE_P (type))
7022 {
7023 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7024 {
7025 warnedsse = true;
7026 warning (0, "SSE vector argument without SSE enabled "
7027 "changes the ABI");
7028 }
7029 if (cum->sse_nregs)
7030 return gen_reg_or_parallel (mode, orig_mode,
7031 cum->sse_regno + FIRST_SSE_REG);
7032 }
7033 break;
7034
7035 case OImode:
7036 /* OImode shouldn't be used directly. */
7037 gcc_unreachable ();
7038
7039 case V8SFmode:
7040 case V8SImode:
7041 case V32QImode:
7042 case V16HImode:
7043 case V4DFmode:
7044 case V4DImode:
7045 if (!type || !AGGREGATE_TYPE_P (type))
7046 {
7047 if (cum->sse_nregs)
7048 return gen_reg_or_parallel (mode, orig_mode,
7049 cum->sse_regno + FIRST_SSE_REG);
7050 }
7051 break;
7052
7053 case V8QImode:
7054 case V4HImode:
7055 case V2SImode:
7056 case V2SFmode:
7057 case V1TImode:
7058 case V1DImode:
7059 if (!type || !AGGREGATE_TYPE_P (type))
7060 {
7061 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7062 {
7063 warnedmmx = true;
7064 warning (0, "MMX vector argument without MMX enabled "
7065 "changes the ABI");
7066 }
7067 if (cum->mmx_nregs)
7068 return gen_reg_or_parallel (mode, orig_mode,
7069 cum->mmx_regno + FIRST_MMX_REG);
7070 }
7071 break;
7072 }
7073
7074 return NULL_RTX;
7075 }
7076
7077 static rtx
7078 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7079 enum machine_mode orig_mode, const_tree type, bool named)
7080 {
7081 /* Handle a hidden AL argument containing number of registers
7082 for varargs x86-64 functions. */
7083 if (mode == VOIDmode)
7084 return GEN_INT (cum->maybe_vaarg
7085 ? (cum->sse_nregs < 0
7086 ? X86_64_SSE_REGPARM_MAX
7087 : cum->sse_regno)
7088 : -1);
7089
7090 switch (mode)
7091 {
7092 default:
7093 break;
7094
7095 case V8SFmode:
7096 case V8SImode:
7097 case V32QImode:
7098 case V16HImode:
7099 case V4DFmode:
7100 case V4DImode:
7101 /* Unnamed 256bit vector mode parameters are passed on stack. */
7102 if (!named)
7103 return NULL;
7104 break;
7105 }
7106
7107 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7108 cum->sse_nregs,
7109 &x86_64_int_parameter_registers [cum->regno],
7110 cum->sse_regno);
7111 }
7112
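/* For illustration: the VOIDmode case above is what makes a SysV
   x86-64 variadic call load the number of vector registers used into
   %al.  A call such as

       printf ("%f\n", 3.14);   // one double passed in %xmm0

   is therefore preceded by something like "movl $1, %eax", so the
   callee's prologue knows whether it must spill the XMM save area.  */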
7113 static rtx
7114 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7115 enum machine_mode orig_mode, bool named,
7116 HOST_WIDE_INT bytes)
7117 {
7118 unsigned int regno;
7119
7120 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7121 We use the value -2 to specify that the current function call is MS ABI. */
7122 if (mode == VOIDmode)
7123 return GEN_INT (-2);
7124
7125 /* If we've run out of registers, it goes on the stack. */
7126 if (cum->nregs == 0)
7127 return NULL_RTX;
7128
7129 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7130
7131 /* Only floating point modes are passed in anything but integer regs. */
7132 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7133 {
7134 if (named)
7135 regno = cum->regno + FIRST_SSE_REG;
7136 else
7137 {
7138 rtx t1, t2;
7139
7140 /* Unnamed floating parameters are passed in both the
7141 SSE and integer registers. */
7142 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7143 t2 = gen_rtx_REG (mode, regno);
7144 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7145 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7146 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7147 }
7148 }
7149 /* Handle aggregate types passed in a register. */
7150 if (orig_mode == BLKmode)
7151 {
7152 if (bytes > 0 && bytes <= 8)
7153 mode = (bytes > 4 ? DImode : SImode);
7154 if (mode == BLKmode)
7155 mode = DImode;
7156 }
7157
7158 return gen_reg_or_parallel (mode, orig_mode, regno);
7159 }
7160
7161 /* Return where to put the arguments to a function.
7162 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7163
7164 MODE is the argument's machine mode. TYPE is the data type of the
7165 argument. It is null for libcalls where that information may not be
7166 available. CUM gives information about the preceding args and about
7167 the function being called. NAMED is nonzero if this argument is a
7168 named parameter (otherwise it is an extra parameter matching an
7169 ellipsis). */
7170
7171 static rtx
7172 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7173 const_tree type, bool named)
7174 {
7175 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7176 enum machine_mode mode = omode;
7177 HOST_WIDE_INT bytes, words;
7178 rtx arg;
7179
7180 if (mode == BLKmode)
7181 bytes = int_size_in_bytes (type);
7182 else
7183 bytes = GET_MODE_SIZE (mode);
7184 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7185
7186 /* To simplify the code below, represent vector types with a vector mode
7187 even if MMX/SSE are not active. */
7188 if (type && TREE_CODE (type) == VECTOR_TYPE)
7189 mode = type_natural_mode (type, cum);
7190
7191 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7192 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7193 else if (TARGET_64BIT)
7194 arg = function_arg_64 (cum, mode, omode, type, named);
7195 else
7196 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7197
7198 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7199 {
7200 /* This argument uses 256bit AVX modes. */
7201 if (cum->caller)
7202 cfun->machine->callee_pass_avx256_p = true;
7203 else
7204 cfun->machine->caller_pass_avx256_p = true;
7205 }
7206
7207 return arg;
7208 }
7209
7210 /* A C expression that indicates when an argument must be passed by
7211 reference. If nonzero for an argument, a copy of that argument is
7212 made in memory and a pointer to the argument is passed instead of
7213 the argument itself. The pointer is passed in whatever way is
7214 appropriate for passing a pointer to that type. */
7215
7216 static bool
7217 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7218 enum machine_mode mode ATTRIBUTE_UNUSED,
7219 const_tree type, bool named ATTRIBUTE_UNUSED)
7220 {
7221 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7222
7223 /* See Windows x64 Software Convention. */
7224 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7225 {
7226 int msize = (int) GET_MODE_SIZE (mode);
7227 if (type)
7228 {
7229 /* Arrays are passed by reference. */
7230 if (TREE_CODE (type) == ARRAY_TYPE)
7231 return true;
7232
7233 if (AGGREGATE_TYPE_P (type))
7234 {
7235 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7236 are passed by reference. */
7237 msize = int_size_in_bytes (type);
7238 }
7239 }
7240
7241 /* __m128 is passed by reference. */
7242 switch (msize) {
7243 case 1: case 2: case 4: case 8:
7244 break;
7245 default:
7246 return true;
7247 }
7248 }
7249 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7250 return 1;
7251
7252 return 0;
7253 }
7254
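/* For illustration of the Win64 rule above (types are hypothetical):

       struct s8  { int a, b; };      // 8 bytes  -> passed by value in a register
       struct s12 { int a, b, c; };   // 12 bytes -> a hidden pointer is passed
       __m128 v;                      // 16 bytes -> passed by reference

   Only aggregates of exactly 1, 2, 4 or 8 bytes travel by value.  */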
7255 /* Return true when TYPE should be 128bit aligned for 32bit argument
7256 passing ABI. XXX: This function is obsolete and is only used for
7257 checking psABI compatibility with previous versions of GCC. */
7258
7259 static bool
7260 ix86_compat_aligned_value_p (const_tree type)
7261 {
7262 enum machine_mode mode = TYPE_MODE (type);
7263 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7264 || mode == TDmode
7265 || mode == TFmode
7266 || mode == TCmode)
7267 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7268 return true;
7269 if (TYPE_ALIGN (type) < 128)
7270 return false;
7271
7272 if (AGGREGATE_TYPE_P (type))
7273 {
7274 /* Walk the aggregates recursively. */
7275 switch (TREE_CODE (type))
7276 {
7277 case RECORD_TYPE:
7278 case UNION_TYPE:
7279 case QUAL_UNION_TYPE:
7280 {
7281 tree field;
7282
7283 /* Walk all the structure fields. */
7284 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7285 {
7286 if (TREE_CODE (field) == FIELD_DECL
7287 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7288 return true;
7289 }
7290 break;
7291 }
7292
7293 case ARRAY_TYPE:
7294 /* Just in case some languages pass arrays by value. */
7295 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7296 return true;
7297 break;
7298
7299 default:
7300 gcc_unreachable ();
7301 }
7302 }
7303 return false;
7304 }
7305
7306 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7307 XXX: This function is obsolete and is only used for checking psABI
7308 compatibility with previous versions of GCC. */
7309
7310 static unsigned int
7311 ix86_compat_function_arg_boundary (enum machine_mode mode,
7312 const_tree type, unsigned int align)
7313 {
7314 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7315 natural boundaries. */
7316 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7317 {
7318 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7319 make an exception for SSE modes since these require 128bit
7320 alignment.
7321
7322 The handling here differs from field_alignment. ICC aligns MMX
7323 arguments to 4 byte boundaries, while structure fields are aligned
7324 to 8 byte boundaries. */
7325 if (!type)
7326 {
7327 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7328 align = PARM_BOUNDARY;
7329 }
7330 else
7331 {
7332 if (!ix86_compat_aligned_value_p (type))
7333 align = PARM_BOUNDARY;
7334 }
7335 }
7336 if (align > BIGGEST_ALIGNMENT)
7337 align = BIGGEST_ALIGNMENT;
7338 return align;
7339 }
7340
7341 /* Return true when TYPE should be 128bit aligned for 32bit argument
7342 passing ABI. */
7343
7344 static bool
7345 ix86_contains_aligned_value_p (const_tree type)
7346 {
7347 enum machine_mode mode = TYPE_MODE (type);
7348
7349 if (mode == XFmode || mode == XCmode)
7350 return false;
7351
7352 if (TYPE_ALIGN (type) < 128)
7353 return false;
7354
7355 if (AGGREGATE_TYPE_P (type))
7356 {
7357 /* Walk the aggregates recursively. */
7358 switch (TREE_CODE (type))
7359 {
7360 case RECORD_TYPE:
7361 case UNION_TYPE:
7362 case QUAL_UNION_TYPE:
7363 {
7364 tree field;
7365
7366 /* Walk all the structure fields. */
7367 for (field = TYPE_FIELDS (type);
7368 field;
7369 field = DECL_CHAIN (field))
7370 {
7371 if (TREE_CODE (field) == FIELD_DECL
7372 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7373 return true;
7374 }
7375 break;
7376 }
7377
7378 case ARRAY_TYPE:
7379 /* Just in case some languages pass arrays by value. */
7380 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7381 return true;
7382 break;
7383
7384 default:
7385 gcc_unreachable ();
7386 }
7387 }
7388 else
7389 return TYPE_ALIGN (type) >= 128;
7390
7391 return false;
7392 }
7393
7394 /* Gives the alignment boundary, in bits, of an argument with the
7395 specified mode and type. */
7396
7397 static unsigned int
7398 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7399 {
7400 unsigned int align;
7401 if (type)
7402 {
7403 /* The main variant of the type is what is used for the call, so
7404 convert TYPE to its main variant. */
7405 type = TYPE_MAIN_VARIANT (type);
7406 align = TYPE_ALIGN (type);
7407 }
7408 else
7409 align = GET_MODE_ALIGNMENT (mode);
7410 if (align < PARM_BOUNDARY)
7411 align = PARM_BOUNDARY;
7412 else
7413 {
7414 static bool warned;
7415 unsigned int saved_align = align;
7416
7417 if (!TARGET_64BIT)
7418 {
7419 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7420 if (!type)
7421 {
7422 if (mode == XFmode || mode == XCmode)
7423 align = PARM_BOUNDARY;
7424 }
7425 else if (!ix86_contains_aligned_value_p (type))
7426 align = PARM_BOUNDARY;
7427
7428 if (align < 128)
7429 align = PARM_BOUNDARY;
7430 }
7431
7432 if (warn_psabi
7433 && !warned
7434 && align != ix86_compat_function_arg_boundary (mode, type,
7435 saved_align))
7436 {
7437 warned = true;
7438 inform (input_location,
7439 "The ABI for passing parameters with %d-byte"
7440 " alignment has changed in GCC 4.6",
7441 align / BITS_PER_UNIT);
7442 }
7443 }
7444
7445 return align;
7446 }
7447
7448 /* Return true if N is a possible register number of function value. */
7449
7450 static bool
7451 ix86_function_value_regno_p (const unsigned int regno)
7452 {
7453 switch (regno)
7454 {
7455 case AX_REG:
7456 return true;
7457
7458 case FIRST_FLOAT_REG:
7459 /* TODO: The function should depend on current function ABI but
7460 builtins.c would need updating then. Therefore we use the
7461 default ABI. */
7462 if (TARGET_64BIT && ix86_abi == MS_ABI)
7463 return false;
7464 return TARGET_FLOAT_RETURNS_IN_80387;
7465
7466 case FIRST_SSE_REG:
7467 return TARGET_SSE;
7468
7469 case FIRST_MMX_REG:
7470 if (TARGET_MACHO || TARGET_64BIT)
7471 return false;
7472 return TARGET_MMX;
7473 }
7474
7475 return false;
7476 }
7477
7478 /* Define how to find the value returned by a function.
7479 VALTYPE is the data type of the value (as a tree).
7480 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7481 otherwise, FUNC is 0. */
7482
7483 static rtx
7484 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7485 const_tree fntype, const_tree fn)
7486 {
7487 unsigned int regno;
7488
7489 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7490 we normally prevent this case when mmx is not available. However
7491 some ABIs may require the result to be returned like DImode. */
7492 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7493 regno = FIRST_MMX_REG;
7494
7495 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7496 we prevent this case when sse is not available. However some ABIs
7497 may require the result to be returned like integer TImode. */
7498 else if (mode == TImode
7499 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7500 regno = FIRST_SSE_REG;
7501
7502 /* 32-byte vector modes in %ymm0. */
7503 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7504 regno = FIRST_SSE_REG;
7505
7506 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7507 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7508 regno = FIRST_FLOAT_REG;
7509 else
7510 /* Most things go in %eax. */
7511 regno = AX_REG;
7512
7513 /* Override FP return register with %xmm0 for local functions when
7514 SSE math is enabled or for functions with sseregparm attribute. */
7515 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7516 {
7517 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7518 if ((sse_level >= 1 && mode == SFmode)
7519 || (sse_level == 2 && mode == DFmode))
7520 regno = FIRST_SSE_REG;
7521 }
7522
7523 /* OImode shouldn't be used directly. */
7524 gcc_assert (mode != OImode);
7525
7526 return gen_rtx_REG (orig_mode, regno);
7527 }
7528
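/* Illustrative summary of the register choices made above for the
   32-bit ABI (declarations are hypothetical):

       int     f (void);   // returned in %eax
       double  g (void);   // returned in %st(0) by default
       __m64   h (void);   // returned in %mm0
       __m128  i (void);   // returned in %xmm0
       __m256  j (void);   // returned in %ymm0
*/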
7529 static rtx
7530 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7531 const_tree valtype)
7532 {
7533 rtx ret;
7534
7535 /* Handle libcalls, which don't provide a type node. */
7536 if (valtype == NULL)
7537 {
7538 unsigned int regno;
7539
7540 switch (mode)
7541 {
7542 case SFmode:
7543 case SCmode:
7544 case DFmode:
7545 case DCmode:
7546 case TFmode:
7547 case SDmode:
7548 case DDmode:
7549 case TDmode:
7550 regno = FIRST_SSE_REG;
7551 break;
7552 case XFmode:
7553 case XCmode:
7554 regno = FIRST_FLOAT_REG;
7555 break;
7556 case TCmode:
7557 return NULL;
7558 default:
7559 regno = AX_REG;
7560 }
7561
7562 return gen_rtx_REG (mode, regno);
7563 }
7564 else if (POINTER_TYPE_P (valtype))
7565 {
7566 /* Pointers are always returned in word_mode. */
7567 mode = word_mode;
7568 }
7569
7570 ret = construct_container (mode, orig_mode, valtype, 1,
7571 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7572 x86_64_int_return_registers, 0);
7573
7574 /* For zero-sized structures, construct_container returns NULL, but we
7575 need to keep the rest of the compiler happy by returning a meaningful value. */
7576 if (!ret)
7577 ret = gen_rtx_REG (orig_mode, AX_REG);
7578
7579 return ret;
7580 }
7581
7582 static rtx
7583 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7584 {
7585 unsigned int regno = AX_REG;
7586
7587 if (TARGET_SSE)
7588 {
7589 switch (GET_MODE_SIZE (mode))
7590 {
7591 case 16:
7592 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7593 && !COMPLEX_MODE_P (mode))
7594 regno = FIRST_SSE_REG;
7595 break;
7596 case 8:
7597 case 4:
7598 if (mode == SFmode || mode == DFmode)
7599 regno = FIRST_SSE_REG;
7600 break;
7601 default:
7602 break;
7603 }
7604 }
7605 return gen_rtx_REG (orig_mode, regno);
7606 }
7607
7608 static rtx
7609 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7610 enum machine_mode orig_mode, enum machine_mode mode)
7611 {
7612 const_tree fn, fntype;
7613
7614 fn = NULL_TREE;
7615 if (fntype_or_decl && DECL_P (fntype_or_decl))
7616 fn = fntype_or_decl;
7617 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7618
7619 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7620 return function_value_ms_64 (orig_mode, mode);
7621 else if (TARGET_64BIT)
7622 return function_value_64 (orig_mode, mode, valtype);
7623 else
7624 return function_value_32 (orig_mode, mode, fntype, fn);
7625 }
7626
7627 static rtx
7628 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7629 bool outgoing ATTRIBUTE_UNUSED)
7630 {
7631 enum machine_mode mode, orig_mode;
7632
7633 orig_mode = TYPE_MODE (valtype);
7634 mode = type_natural_mode (valtype, NULL);
7635 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7636 }
7637
7638 /* Pointer function arguments and return values are promoted to
7639 word_mode. */
7640
7641 static enum machine_mode
7642 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7643 int *punsignedp, const_tree fntype,
7644 int for_return)
7645 {
7646 if (type != NULL_TREE && POINTER_TYPE_P (type))
7647 {
7648 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7649 return word_mode;
7650 }
7651 return default_promote_function_mode (type, mode, punsignedp, fntype,
7652 for_return);
7653 }
7654
7655 /* Return true if a structure, union or array with MODE containing FIELD
7656 should be accessed using BLKmode. */
7657
7658 static bool
7659 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7660 {
7661 /* Union with XFmode must be in BLKmode. */
7662 return (mode == XFmode
7663 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7664 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7665 }
7666
7667 rtx
7668 ix86_libcall_value (enum machine_mode mode)
7669 {
7670 return ix86_function_value_1 (NULL, NULL, mode, mode);
7671 }
7672
7673 /* Return true iff type is returned in memory. */
7674
7675 static bool ATTRIBUTE_UNUSED
7676 return_in_memory_32 (const_tree type, enum machine_mode mode)
7677 {
7678 HOST_WIDE_INT size;
7679
7680 if (mode == BLKmode)
7681 return true;
7682
7683 size = int_size_in_bytes (type);
7684
7685 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7686 return false;
7687
7688 if (VECTOR_MODE_P (mode) || mode == TImode)
7689 {
7690 /* User-created vectors small enough to fit in EAX. */
7691 if (size < 8)
7692 return false;
7693
7694 /* MMX/3dNow values are returned in MM0,
7695 except when it doesn't exist or the ABI prescribes otherwise. */
7696 if (size == 8)
7697 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7698
7699 /* SSE values are returned in XMM0, except when it doesn't exist. */
7700 if (size == 16)
7701 return !TARGET_SSE;
7702
7703 /* AVX values are returned in YMM0, except when it doesn't exist. */
7704 if (size == 32)
7705 return !TARGET_AVX;
7706 }
7707
7708 if (mode == XFmode)
7709 return false;
7710
7711 if (size > 12)
7712 return true;
7713
7714 /* OImode shouldn't be used directly. */
7715 gcc_assert (mode != OImode);
7716
7717 return false;
7718 }
7719
7720 static bool ATTRIBUTE_UNUSED
7721 return_in_memory_64 (const_tree type, enum machine_mode mode)
7722 {
7723 int needed_intregs, needed_sseregs;
7724 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7725 }
7726
7727 static bool ATTRIBUTE_UNUSED
7728 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7729 {
7730 HOST_WIDE_INT size = int_size_in_bytes (type);
7731
7732 /* __m128 is returned in xmm0. */
7733 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7734 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7735 return false;
7736
7737 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7738 return size != 1 && size != 2 && size != 4 && size != 8;
7739 }
7740
7741 static bool
7742 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7743 {
7744 #ifdef SUBTARGET_RETURN_IN_MEMORY
7745 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7746 #else
7747 const enum machine_mode mode = type_natural_mode (type, NULL);
7748
7749 if (TARGET_64BIT)
7750 {
7751 if (ix86_function_type_abi (fntype) == MS_ABI)
7752 return return_in_memory_ms_64 (type, mode);
7753 else
7754 return return_in_memory_64 (type, mode);
7755 }
7756 else
7757 return return_in_memory_32 (type, mode);
7758 #endif
7759 }
7760
7761 /* When returning SSE vector types, we have a choice of either
7762 (1) being abi incompatible with a -march switch, or
7763 (2) generating an error.
7764 Given no good solution, I think the safest thing is one warning.
7765 The user won't be able to use -Werror, but....
7766
7767 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7768 called in response to actually generating a caller or callee that
7769 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7770 via aggregate_value_p for general type probing from tree-ssa. */
7771
7772 static rtx
7773 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7774 {
7775 static bool warnedsse, warnedmmx;
7776
7777 if (!TARGET_64BIT && type)
7778 {
7779 /* Look at the return type of the function, not the function type. */
7780 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7781
7782 if (!TARGET_SSE && !warnedsse)
7783 {
7784 if (mode == TImode
7785 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7786 {
7787 warnedsse = true;
7788 warning (0, "SSE vector return without SSE enabled "
7789 "changes the ABI");
7790 }
7791 }
7792
7793 if (!TARGET_MMX && !warnedmmx)
7794 {
7795 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7796 {
7797 warnedmmx = true;
7798 warning (0, "MMX vector return without MMX enabled "
7799 "changes the ABI");
7800 }
7801 }
7802 }
7803
7804 return NULL;
7805 }
7806
7807 \f
7808 /* Create the va_list data type. */
7809
7810 /* Returns the calling-convention-specific va_list data type.
7811 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7812
7813 static tree
7814 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7815 {
7816 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7817
7818 /* For i386 we use plain pointer to argument area. */
7819 if (!TARGET_64BIT || abi == MS_ABI)
7820 return build_pointer_type (char_type_node);
7821
7822 record = lang_hooks.types.make_type (RECORD_TYPE);
7823 type_decl = build_decl (BUILTINS_LOCATION,
7824 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7825
7826 f_gpr = build_decl (BUILTINS_LOCATION,
7827 FIELD_DECL, get_identifier ("gp_offset"),
7828 unsigned_type_node);
7829 f_fpr = build_decl (BUILTINS_LOCATION,
7830 FIELD_DECL, get_identifier ("fp_offset"),
7831 unsigned_type_node);
7832 f_ovf = build_decl (BUILTINS_LOCATION,
7833 FIELD_DECL, get_identifier ("overflow_arg_area"),
7834 ptr_type_node);
7835 f_sav = build_decl (BUILTINS_LOCATION,
7836 FIELD_DECL, get_identifier ("reg_save_area"),
7837 ptr_type_node);
7838
7839 va_list_gpr_counter_field = f_gpr;
7840 va_list_fpr_counter_field = f_fpr;
7841
7842 DECL_FIELD_CONTEXT (f_gpr) = record;
7843 DECL_FIELD_CONTEXT (f_fpr) = record;
7844 DECL_FIELD_CONTEXT (f_ovf) = record;
7845 DECL_FIELD_CONTEXT (f_sav) = record;
7846
7847 TYPE_STUB_DECL (record) = type_decl;
7848 TYPE_NAME (record) = type_decl;
7849 TYPE_FIELDS (record) = f_gpr;
7850 DECL_CHAIN (f_gpr) = f_fpr;
7851 DECL_CHAIN (f_fpr) = f_ovf;
7852 DECL_CHAIN (f_ovf) = f_sav;
7853
7854 layout_type (record);
7855
7856 /* The correct type is an array type of one element. */
7857 return build_array_type (record, build_index_type (size_zero_node));
7858 }
7859
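/* The record built above is the user-visible SysV x86-64 va_list as
   specified by the psABI; shown here for reference only:

       typedef struct __va_list_tag {
         unsigned int gp_offset;     // byte offset of next GPR slot in reg_save_area
         unsigned int fp_offset;     // byte offset of next SSE slot in reg_save_area
         void *overflow_arg_area;    // next stack-passed argument
         void *reg_save_area;        // start of the register save area
       } __builtin_va_list[1];
*/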
7860 /* Setup the builtin va_list data type and for 64-bit the additional
7861 calling convention specific va_list data types. */
7862
7863 static tree
7864 ix86_build_builtin_va_list (void)
7865 {
7866 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7867
7868 /* Initialize abi specific va_list builtin types. */
7869 if (TARGET_64BIT)
7870 {
7871 tree t;
7872 if (ix86_abi == MS_ABI)
7873 {
7874 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7875 if (TREE_CODE (t) != RECORD_TYPE)
7876 t = build_variant_type_copy (t);
7877 sysv_va_list_type_node = t;
7878 }
7879 else
7880 {
7881 t = ret;
7882 if (TREE_CODE (t) != RECORD_TYPE)
7883 t = build_variant_type_copy (t);
7884 sysv_va_list_type_node = t;
7885 }
7886 if (ix86_abi != MS_ABI)
7887 {
7888 t = ix86_build_builtin_va_list_abi (MS_ABI);
7889 if (TREE_CODE (t) != RECORD_TYPE)
7890 t = build_variant_type_copy (t);
7891 ms_va_list_type_node = t;
7892 }
7893 else
7894 {
7895 t = ret;
7896 if (TREE_CODE (t) != RECORD_TYPE)
7897 t = build_variant_type_copy (t);
7898 ms_va_list_type_node = t;
7899 }
7900 }
7901
7902 return ret;
7903 }
7904
7905 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7906
7907 static void
7908 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7909 {
7910 rtx save_area, mem;
7911 alias_set_type set;
7912 int i, max;
7913
7914 /* GPR size of varargs save area. */
7915 if (cfun->va_list_gpr_size)
7916 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7917 else
7918 ix86_varargs_gpr_size = 0;
7919
7920 /* FPR size of varargs save area. We don't need it if we don't pass
7921 anything in SSE registers. */
7922 if (TARGET_SSE && cfun->va_list_fpr_size)
7923 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7924 else
7925 ix86_varargs_fpr_size = 0;
7926
7927 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7928 return;
7929
7930 save_area = frame_pointer_rtx;
7931 set = get_varargs_alias_set ();
7932
7933 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7934 if (max > X86_64_REGPARM_MAX)
7935 max = X86_64_REGPARM_MAX;
7936
7937 for (i = cum->regno; i < max; i++)
7938 {
7939 mem = gen_rtx_MEM (word_mode,
7940 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7941 MEM_NOTRAP_P (mem) = 1;
7942 set_mem_alias_set (mem, set);
7943 emit_move_insn (mem,
7944 gen_rtx_REG (word_mode,
7945 x86_64_int_parameter_registers[i]));
7946 }
7947
7948 if (ix86_varargs_fpr_size)
7949 {
7950 enum machine_mode smode;
7951 rtx label, test;
7952
7953 /* Now emit code to save SSE registers. The AX parameter contains the
7954 number of SSE parameter registers used to call this function, though all we
7955 actually check here is the zero/non-zero status. */
7956
7957 label = gen_label_rtx ();
7958 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7959 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7960 label));
7961
7962 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7963 we used movdqa (i.e. TImode) instead? Perhaps even better would
7964 be if we could determine the real mode of the data, via a hook
7965 into pass_stdarg. Ignore all that for now. */
7966 smode = V4SFmode;
7967 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7968 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7969
7970 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7971 if (max > X86_64_SSE_REGPARM_MAX)
7972 max = X86_64_SSE_REGPARM_MAX;
7973
7974 for (i = cum->sse_regno; i < max; ++i)
7975 {
7976 mem = plus_constant (Pmode, save_area,
7977 i * 16 + ix86_varargs_gpr_size);
7978 mem = gen_rtx_MEM (smode, mem);
7979 MEM_NOTRAP_P (mem) = 1;
7980 set_mem_alias_set (mem, set);
7981 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7982
7983 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7984 }
7985
7986 emit_label (label);
7987 }
7988 }
7989
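/* Layout of the register save area filled in above (offsets relative
   to the start of the area; shown for reference only):

       0 .. 47     %rdi, %rsi, %rdx, %rcx, %r8, %r9   (8 bytes each)
       48 .. 175   %xmm0 .. %xmm7                     (16 bytes each,
                                                       stored only when %al != 0)

   which is why ix86_va_start below biases fp_offset by
   8 * X86_64_REGPARM_MAX == 48.  */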
7990 static void
7991 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7992 {
7993 alias_set_type set = get_varargs_alias_set ();
7994 int i;
7995
7996 /* Reset to zero, as there might be a sysv vaarg used
7997 before. */
7998 ix86_varargs_gpr_size = 0;
7999 ix86_varargs_fpr_size = 0;
8000
8001 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8002 {
8003 rtx reg, mem;
8004
8005 mem = gen_rtx_MEM (Pmode,
8006 plus_constant (Pmode, virtual_incoming_args_rtx,
8007 i * UNITS_PER_WORD));
8008 MEM_NOTRAP_P (mem) = 1;
8009 set_mem_alias_set (mem, set);
8010
8011 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8012 emit_move_insn (mem, reg);
8013 }
8014 }
8015
8016 static void
8017 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8018 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8019 int no_rtl)
8020 {
8021 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8022 CUMULATIVE_ARGS next_cum;
8023 tree fntype;
8024
8025 /* This argument doesn't appear to be used anymore, which is good,
8026 because the old code here didn't suppress rtl generation. */
8027 gcc_assert (!no_rtl);
8028
8029 if (!TARGET_64BIT)
8030 return;
8031
8032 fntype = TREE_TYPE (current_function_decl);
8033
8034 /* For varargs, we do not want to skip the dummy va_dcl argument.
8035 For stdargs, we do want to skip the last named argument. */
8036 next_cum = *cum;
8037 if (stdarg_p (fntype))
8038 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8039 true);
8040
8041 if (cum->call_abi == MS_ABI)
8042 setup_incoming_varargs_ms_64 (&next_cum);
8043 else
8044 setup_incoming_varargs_64 (&next_cum);
8045 }
8046
8047 /* Checks if TYPE is of kind va_list char *. */
8048
8049 static bool
8050 is_va_list_char_pointer (tree type)
8051 {
8052 tree canonic;
8053
8054 /* For 32-bit it is always true. */
8055 if (!TARGET_64BIT)
8056 return true;
8057 canonic = ix86_canonical_va_list_type (type);
8058 return (canonic == ms_va_list_type_node
8059 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8060 }
8061
8062 /* Implement va_start. */
8063
8064 static void
8065 ix86_va_start (tree valist, rtx nextarg)
8066 {
8067 HOST_WIDE_INT words, n_gpr, n_fpr;
8068 tree f_gpr, f_fpr, f_ovf, f_sav;
8069 tree gpr, fpr, ovf, sav, t;
8070 tree type;
8071 rtx ovf_rtx;
8072
8073 if (flag_split_stack
8074 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8075 {
8076 unsigned int scratch_regno;
8077
8078 /* When we are splitting the stack, we can't refer to the stack
8079 arguments using internal_arg_pointer, because they may be on
8080 the old stack. The split stack prologue will arrange to
8081 leave a pointer to the old stack arguments in a scratch
8082 register, which we here copy to a pseudo-register. The split
8083 stack prologue can't set the pseudo-register directly because
8084 it (the prologue) runs before any registers have been saved. */
8085
8086 scratch_regno = split_stack_prologue_scratch_regno ();
8087 if (scratch_regno != INVALID_REGNUM)
8088 {
8089 rtx reg, seq;
8090
8091 reg = gen_reg_rtx (Pmode);
8092 cfun->machine->split_stack_varargs_pointer = reg;
8093
8094 start_sequence ();
8095 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8096 seq = get_insns ();
8097 end_sequence ();
8098
8099 push_topmost_sequence ();
8100 emit_insn_after (seq, entry_of_function ());
8101 pop_topmost_sequence ();
8102 }
8103 }
8104
8105 /* Only 64bit target needs something special. */
8106 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8107 {
8108 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8109 std_expand_builtin_va_start (valist, nextarg);
8110 else
8111 {
8112 rtx va_r, next;
8113
8114 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8115 next = expand_binop (ptr_mode, add_optab,
8116 cfun->machine->split_stack_varargs_pointer,
8117 crtl->args.arg_offset_rtx,
8118 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8119 convert_move (va_r, next, 0);
8120 }
8121 return;
8122 }
8123
8124 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8125 f_fpr = DECL_CHAIN (f_gpr);
8126 f_ovf = DECL_CHAIN (f_fpr);
8127 f_sav = DECL_CHAIN (f_ovf);
8128
8129 valist = build_simple_mem_ref (valist);
8130 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8131 /* The following should be folded into the MEM_REF offset. */
8132 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8133 f_gpr, NULL_TREE);
8134 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8135 f_fpr, NULL_TREE);
8136 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8137 f_ovf, NULL_TREE);
8138 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8139 f_sav, NULL_TREE);
8140
8141 /* Count number of gp and fp argument registers used. */
8142 words = crtl->args.info.words;
8143 n_gpr = crtl->args.info.regno;
8144 n_fpr = crtl->args.info.sse_regno;
8145
8146 if (cfun->va_list_gpr_size)
8147 {
8148 type = TREE_TYPE (gpr);
8149 t = build2 (MODIFY_EXPR, type,
8150 gpr, build_int_cst (type, n_gpr * 8));
8151 TREE_SIDE_EFFECTS (t) = 1;
8152 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8153 }
8154
8155 if (TARGET_SSE && cfun->va_list_fpr_size)
8156 {
8157 type = TREE_TYPE (fpr);
8158 t = build2 (MODIFY_EXPR, type, fpr,
8159 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8160 TREE_SIDE_EFFECTS (t) = 1;
8161 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8162 }
8163
8164 /* Find the overflow area. */
8165 type = TREE_TYPE (ovf);
8166 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8167 ovf_rtx = crtl->args.internal_arg_pointer;
8168 else
8169 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8170 t = make_tree (type, ovf_rtx);
8171 if (words != 0)
8172 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8173 t = build2 (MODIFY_EXPR, type, ovf, t);
8174 TREE_SIDE_EFFECTS (t) = 1;
8175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8176
8177 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8178 {
8179 /* Find the register save area.
8180 The function prologue saves it right above the stack frame. */
8181 type = TREE_TYPE (sav);
8182 t = make_tree (type, frame_pointer_rtx);
8183 if (!ix86_varargs_gpr_size)
8184 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8185 t = build2 (MODIFY_EXPR, type, sav, t);
8186 TREE_SIDE_EFFECTS (t) = 1;
8187 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8188 }
8189 }
8190
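/* For illustration, for a hypothetical prototype

       int f (int a, const char *fmt, ...)

   the expansion above stores gp_offset = 2 * 8 = 16 (the two named
   arguments already consumed %edi and %rsi), fp_offset = 0 * 16 + 48,
   overflow_arg_area = the first stack-passed argument, and
   reg_save_area = the block spilled by the prologue.  */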
8191 /* Implement va_arg. */
8192
8193 static tree
8194 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8195 gimple_seq *post_p)
8196 {
8197 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8198 tree f_gpr, f_fpr, f_ovf, f_sav;
8199 tree gpr, fpr, ovf, sav, t;
8200 int size, rsize;
8201 tree lab_false, lab_over = NULL_TREE;
8202 tree addr, t2;
8203 rtx container;
8204 int indirect_p = 0;
8205 tree ptrtype;
8206 enum machine_mode nat_mode;
8207 unsigned int arg_boundary;
8208
8209 /* Only 64bit target needs something special. */
8210 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8211 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8212
8213 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8214 f_fpr = DECL_CHAIN (f_gpr);
8215 f_ovf = DECL_CHAIN (f_fpr);
8216 f_sav = DECL_CHAIN (f_ovf);
8217
8218 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8219 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8220 valist = build_va_arg_indirect_ref (valist);
8221 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8222 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8223 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8224
8225 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8226 if (indirect_p)
8227 type = build_pointer_type (type);
8228 size = int_size_in_bytes (type);
8229 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8230
8231 nat_mode = type_natural_mode (type, NULL);
8232 switch (nat_mode)
8233 {
8234 case V8SFmode:
8235 case V8SImode:
8236 case V32QImode:
8237 case V16HImode:
8238 case V4DFmode:
8239 case V4DImode:
8240 /* Unnamed 256bit vector mode parameters are passed on stack. */
8241 if (!TARGET_64BIT_MS_ABI)
8242 {
8243 container = NULL;
8244 break;
8245 }
8246
8247 default:
8248 container = construct_container (nat_mode, TYPE_MODE (type),
8249 type, 0, X86_64_REGPARM_MAX,
8250 X86_64_SSE_REGPARM_MAX, intreg,
8251 0);
8252 break;
8253 }
8254
8255 /* Pull the value out of the saved registers. */
8256
8257 addr = create_tmp_var (ptr_type_node, "addr");
8258
8259 if (container)
8260 {
8261 int needed_intregs, needed_sseregs;
8262 bool need_temp;
8263 tree int_addr, sse_addr;
8264
8265 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8266 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8267
8268 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8269
8270 need_temp = (!REG_P (container)
8271 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8272 || TYPE_ALIGN (type) > 128));
8273
8274 /* In case we are passing a structure, verify that it is a consecutive
8275 block in the register save area. If not, we need to do moves. */
8276 if (!need_temp && !REG_P (container))
8277 {
8278 /* Verify that all registers are strictly consecutive. */
8279 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8280 {
8281 int i;
8282
8283 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8284 {
8285 rtx slot = XVECEXP (container, 0, i);
8286 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8287 || INTVAL (XEXP (slot, 1)) != i * 16)
8288 need_temp = 1;
8289 }
8290 }
8291 else
8292 {
8293 int i;
8294
8295 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8296 {
8297 rtx slot = XVECEXP (container, 0, i);
8298 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8299 || INTVAL (XEXP (slot, 1)) != i * 8)
8300 need_temp = 1;
8301 }
8302 }
8303 }
8304 if (!need_temp)
8305 {
8306 int_addr = addr;
8307 sse_addr = addr;
8308 }
8309 else
8310 {
8311 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8312 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8313 }
8314
8315 /* First ensure that we fit completely in registers. */
8316 if (needed_intregs)
8317 {
8318 t = build_int_cst (TREE_TYPE (gpr),
8319 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8320 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8321 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8322 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8323 gimplify_and_add (t, pre_p);
8324 }
8325 if (needed_sseregs)
8326 {
8327 t = build_int_cst (TREE_TYPE (fpr),
8328 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8329 + X86_64_REGPARM_MAX * 8);
8330 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8331 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8332 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8333 gimplify_and_add (t, pre_p);
8334 }
8335
8336 /* Compute index to start of area used for integer regs. */
8337 if (needed_intregs)
8338 {
8339 /* int_addr = gpr + sav; */
8340 t = fold_build_pointer_plus (sav, gpr);
8341 gimplify_assign (int_addr, t, pre_p);
8342 }
8343 if (needed_sseregs)
8344 {
8345 /* sse_addr = fpr + sav; */
8346 t = fold_build_pointer_plus (sav, fpr);
8347 gimplify_assign (sse_addr, t, pre_p);
8348 }
8349 if (need_temp)
8350 {
8351 int i, prev_size = 0;
8352 tree temp = create_tmp_var (type, "va_arg_tmp");
8353
8354 /* addr = &temp; */
8355 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8356 gimplify_assign (addr, t, pre_p);
8357
8358 for (i = 0; i < XVECLEN (container, 0); i++)
8359 {
8360 rtx slot = XVECEXP (container, 0, i);
8361 rtx reg = XEXP (slot, 0);
8362 enum machine_mode mode = GET_MODE (reg);
8363 tree piece_type;
8364 tree addr_type;
8365 tree daddr_type;
8366 tree src_addr, src;
8367 int src_offset;
8368 tree dest_addr, dest;
8369 int cur_size = GET_MODE_SIZE (mode);
8370
8371 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8372 prev_size = INTVAL (XEXP (slot, 1));
8373 if (prev_size + cur_size > size)
8374 {
8375 cur_size = size - prev_size;
8376 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8377 if (mode == BLKmode)
8378 mode = QImode;
8379 }
8380 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8381 if (mode == GET_MODE (reg))
8382 addr_type = build_pointer_type (piece_type);
8383 else
8384 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8385 true);
8386 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8387 true);
8388
8389 if (SSE_REGNO_P (REGNO (reg)))
8390 {
8391 src_addr = sse_addr;
8392 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8393 }
8394 else
8395 {
8396 src_addr = int_addr;
8397 src_offset = REGNO (reg) * 8;
8398 }
8399 src_addr = fold_convert (addr_type, src_addr);
8400 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8401
8402 dest_addr = fold_convert (daddr_type, addr);
8403 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8404 if (cur_size == GET_MODE_SIZE (mode))
8405 {
8406 src = build_va_arg_indirect_ref (src_addr);
8407 dest = build_va_arg_indirect_ref (dest_addr);
8408
8409 gimplify_assign (dest, src, pre_p);
8410 }
8411 else
8412 {
8413 tree copy
8414 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8415 3, dest_addr, src_addr,
8416 size_int (cur_size));
8417 gimplify_and_add (copy, pre_p);
8418 }
8419 prev_size += cur_size;
8420 }
8421 }
8422
8423 if (needed_intregs)
8424 {
8425 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8426 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8427 gimplify_assign (gpr, t, pre_p);
8428 }
8429
8430 if (needed_sseregs)
8431 {
8432 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8433 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8434 gimplify_assign (fpr, t, pre_p);
8435 }
8436
8437 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8438
8439 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8440 }
8441
8442 /* ... otherwise out of the overflow area. */
8443
8444 /* When the caller aligns a parameter on the stack, any alignment
8445 beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
8446 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8447 caller. */
8448 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8449 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8450 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8451
8452 /* Care for on-stack alignment if needed. */
8453 if (arg_boundary <= 64 || size == 0)
8454 t = ovf;
8455 else
8456 {
8457 HOST_WIDE_INT align = arg_boundary / 8;
8458 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8459 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8460 build_int_cst (TREE_TYPE (t), -align));
8461 }
8462
8463 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8464 gimplify_assign (addr, t, pre_p);
8465
8466 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8467 gimplify_assign (unshare_expr (ovf), t, pre_p);
8468
8469 if (container)
8470 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8471
8472 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8473 addr = fold_convert (ptrtype, addr);
8474
8475 if (indirect_p)
8476 addr = build_va_arg_indirect_ref (addr);
8477 return build_va_arg_indirect_ref (addr);
8478 }
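/* A rough sketch of what the gimplified sequence above amounts to for
   va_arg (ap, int) (pseudo-C, for reference only):

       if (ap->gp_offset >= 48)          // all 6 GPR slots already used
         goto overflow;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto done;
     overflow:
       addr = ap->overflow_arg_area;     // aligned first if the type needs it
       ap->overflow_arg_area = addr + 8;
     done:
       result = *(int *) addr;
*/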
8479 \f
8480 /* Return true if OPNUM's MEM should be matched
8481 in movabs* patterns. */
8482
8483 bool
8484 ix86_check_movabs (rtx insn, int opnum)
8485 {
8486 rtx set, mem;
8487
8488 set = PATTERN (insn);
8489 if (GET_CODE (set) == PARALLEL)
8490 set = XVECEXP (set, 0, 0);
8491 gcc_assert (GET_CODE (set) == SET);
8492 mem = XEXP (set, opnum);
8493 while (GET_CODE (mem) == SUBREG)
8494 mem = SUBREG_REG (mem);
8495 gcc_assert (MEM_P (mem));
8496 return volatile_ok || !MEM_VOLATILE_P (mem);
8497 }
8498 \f
8499 /* Initialize the table of extra 80387 mathematical constants. */
8500
8501 static void
8502 init_ext_80387_constants (void)
8503 {
8504 static const char * cst[5] =
8505 {
8506 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8507 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8508 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8509 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8510 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8511 };
8512 int i;
8513
8514 for (i = 0; i < 5; i++)
8515 {
8516 real_from_string (&ext_80387_constants_table[i], cst[i]);
8517 /* Ensure each constant is rounded to XFmode precision. */
8518 real_convert (&ext_80387_constants_table[i],
8519 XFmode, &ext_80387_constants_table[i]);
8520 }
8521
8522 ext_80387_constants_init = 1;
8523 }
8524
8525 /* Return non-zero if the constant is something that
8526 can be loaded with a special instruction. */
8527
8528 int
8529 standard_80387_constant_p (rtx x)
8530 {
8531 enum machine_mode mode = GET_MODE (x);
8532
8533 REAL_VALUE_TYPE r;
8534
8535 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8536 return -1;
8537
8538 if (x == CONST0_RTX (mode))
8539 return 1;
8540 if (x == CONST1_RTX (mode))
8541 return 2;
8542
8543 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8544
8545 /* For XFmode constants, try to find a special 80387 instruction when
8546 optimizing for size or on those CPUs that benefit from them. */
8547 if (mode == XFmode
8548 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8549 {
8550 int i;
8551
8552 if (! ext_80387_constants_init)
8553 init_ext_80387_constants ();
8554
8555 for (i = 0; i < 5; i++)
8556 if (real_identical (&r, &ext_80387_constants_table[i]))
8557 return i + 3;
8558 }
8559
8560 /* A load of the constant -0.0 or -1.0 will be split as an
8561 fldz;fchs or fld1;fchs sequence. */
8562 if (real_isnegzero (&r))
8563 return 8;
8564 if (real_identical (&r, &dconstm1))
8565 return 9;
8566
8567 return 0;
8568 }
8569
8570 /* Return the opcode of the special instruction to be used to load
8571 the constant X. */
8572
8573 const char *
8574 standard_80387_constant_opcode (rtx x)
8575 {
8576 switch (standard_80387_constant_p (x))
8577 {
8578 case 1:
8579 return "fldz";
8580 case 2:
8581 return "fld1";
8582 case 3:
8583 return "fldlg2";
8584 case 4:
8585 return "fldln2";
8586 case 5:
8587 return "fldl2e";
8588 case 6:
8589 return "fldl2t";
8590 case 7:
8591 return "fldpi";
8592 case 8:
8593 case 9:
8594 return "#";
8595 default:
8596 gcc_unreachable ();
8597 }
8598 }
8599
8600 /* Return the CONST_DOUBLE representing the 80387 constant that is
8601 loaded by the specified special instruction. The argument IDX
8602 matches the return value from standard_80387_constant_p. */
8603
8604 rtx
8605 standard_80387_constant_rtx (int idx)
8606 {
8607 int i;
8608
8609 if (! ext_80387_constants_init)
8610 init_ext_80387_constants ();
8611
8612 switch (idx)
8613 {
8614 case 3:
8615 case 4:
8616 case 5:
8617 case 6:
8618 case 7:
8619 i = idx - 3;
8620 break;
8621
8622 default:
8623 gcc_unreachable ();
8624 }
8625
8626 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8627 XFmode);
8628 }
8629
8630 /* Return 1 if X is all 0s and 2 if X is all 1s
8631 in a supported SSE/AVX vector mode. */
8632
8633 int
8634 standard_sse_constant_p (rtx x)
8635 {
8636 enum machine_mode mode = GET_MODE (x);
8637
8638 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8639 return 1;
8640 if (vector_all_ones_operand (x, mode))
8641 switch (mode)
8642 {
8643 case V16QImode:
8644 case V8HImode:
8645 case V4SImode:
8646 case V2DImode:
8647 if (TARGET_SSE2)
8648 return 2;
8649 case V32QImode:
8650 case V16HImode:
8651 case V8SImode:
8652 case V4DImode:
8653 if (TARGET_AVX2)
8654 return 2;
8655 default:
8656 break;
8657 }
8658
8659 return 0;
8660 }
8661
8662 /* Return the opcode of the special instruction to be used to load
8663 the constant X. */
8664
8665 const char *
8666 standard_sse_constant_opcode (rtx insn, rtx x)
8667 {
8668 switch (standard_sse_constant_p (x))
8669 {
8670 case 1:
8671 switch (get_attr_mode (insn))
8672 {
8673 case MODE_TI:
8674 return "%vpxor\t%0, %d0";
8675 case MODE_V2DF:
8676 return "%vxorpd\t%0, %d0";
8677 case MODE_V4SF:
8678 return "%vxorps\t%0, %d0";
8679
8680 case MODE_OI:
8681 return "vpxor\t%x0, %x0, %x0";
8682 case MODE_V4DF:
8683 return "vxorpd\t%x0, %x0, %x0";
8684 case MODE_V8SF:
8685 return "vxorps\t%x0, %x0, %x0";
8686
8687 default:
8688 break;
8689 }
8690
8691 case 2:
8692 if (TARGET_AVX)
8693 return "vpcmpeqd\t%0, %0, %0";
8694 else
8695 return "pcmpeqd\t%0, %0";
8696
8697 default:
8698 break;
8699 }
8700 gcc_unreachable ();
8701 }
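/* For example (illustrative only), the templates above typically come out
   as one of:

       pxor     %xmm0, %xmm0             # all-zeros, SSE
       vxorps   %xmm0, %xmm0, %xmm0      # all-zeros, AVX; the 128-bit
                                         # form also clears the upper bits
       pcmpeqd  %xmm0, %xmm0             # all-ones

   so neither constant needs a constant-pool load.  */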
8702
8703 /* Return true if OP contains a symbol reference. */
8704
8705 bool
8706 symbolic_reference_mentioned_p (rtx op)
8707 {
8708 const char *fmt;
8709 int i;
8710
8711 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8712 return true;
8713
8714 fmt = GET_RTX_FORMAT (GET_CODE (op));
8715 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8716 {
8717 if (fmt[i] == 'E')
8718 {
8719 int j;
8720
8721 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8722 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8723 return true;
8724 }
8725
8726 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8727 return true;
8728 }
8729
8730 return false;
8731 }
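/* As a reminder of the rtx format codes used above: 'e' marks a single
   rtx operand and 'E' a vector of rtx's, so the walk recurses through
   every sub-expression of OP.  */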
8732
8733 /* Return true if it is appropriate to emit `ret' instructions in the
8734 body of a function. Do this only if the epilogue is simple, needing a
8735 couple of insns. Prior to reloading, we can't tell how many registers
8736 must be saved, so return false then. Return false if there is no frame
8737 marker to de-allocate. */
8738
8739 bool
8740 ix86_can_use_return_insn_p (void)
8741 {
8742 struct ix86_frame frame;
8743
8744 if (! reload_completed || frame_pointer_needed)
8745 return 0;
8746
8747 /* Don't allow more than 32k pop, since that's all we can do
8748 with one instruction. */
8749 if (crtl->args.pops_args && crtl->args.size >= 32768)
8750 return 0;
8751
8752 ix86_compute_frame_layout (&frame);
8753 return (frame.stack_pointer_offset == UNITS_PER_WORD
8754 && (frame.nregs + frame.nsseregs) == 0);
8755 }
8756 \f
8757 /* Value should be nonzero if functions must have frame pointers.
8758 Zero means the frame pointer need not be set up (and parms may
8759 be accessed via the stack pointer) in functions that seem suitable. */
8760
8761 static bool
8762 ix86_frame_pointer_required (void)
8763 {
8764 /* If we accessed previous frames, then the generated code expects
8765 to be able to access the saved ebp value in our frame. */
8766 if (cfun->machine->accesses_prev_frame)
8767 return true;
8768
8769 /* Several x86 OSes need a frame pointer for other reasons,
8770 usually pertaining to setjmp. */
8771 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8772 return true;
8773
8774 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8775 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8776 return true;
8777
8778 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
8779 stack allocation is 4GB. */
8780 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8781 return true;
8782
8783 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8784 turns off the frame pointer by default. Turn it back on now if
8785 we've not got a leaf function. */
8786 if (TARGET_OMIT_LEAF_FRAME_POINTER
8787 && (!crtl->is_leaf
8788 || ix86_current_function_calls_tls_descriptor))
8789 return true;
8790
8791 if (crtl->profile && !flag_fentry)
8792 return true;
8793
8794 return false;
8795 }
8796
8797 /* Record that the current function accesses previous call frames. */
8798
8799 void
8800 ix86_setup_frame_addresses (void)
8801 {
8802 cfun->machine->accesses_prev_frame = 1;
8803 }
8804 \f
8805 #ifndef USE_HIDDEN_LINKONCE
8806 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8807 # define USE_HIDDEN_LINKONCE 1
8808 # else
8809 # define USE_HIDDEN_LINKONCE 0
8810 # endif
8811 #endif
8812
8813 static int pic_labels_used;
8814
8815 /* Fills in the label name that should be used for a pc thunk for
8816 the given register. */
8817
8818 static void
8819 get_pc_thunk_name (char name[32], unsigned int regno)
8820 {
8821 gcc_assert (!TARGET_64BIT);
8822
8823 if (USE_HIDDEN_LINKONCE)
8824 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8825 else
8826 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8827 }
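/* For example, with %ebx as the PIC register the hidden linkonce thunk
   is named "__x86.get_pc_thunk.bx" (reg_names[BX_REG] is "bx").  */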
8828
8829
8830 /* This function generates the pc thunks used for -fpic: each thunk loads
8831 its register with the return address of the caller and then returns. */
8832
8833 static void
8834 ix86_code_end (void)
8835 {
8836 rtx xops[2];
8837 int regno;
8838
8839 for (regno = AX_REG; regno <= SP_REG; regno++)
8840 {
8841 char name[32];
8842 tree decl;
8843
8844 if (!(pic_labels_used & (1 << regno)))
8845 continue;
8846
8847 get_pc_thunk_name (name, regno);
8848
8849 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8850 get_identifier (name),
8851 build_function_type_list (void_type_node, NULL_TREE));
8852 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8853 NULL_TREE, void_type_node);
8854 TREE_PUBLIC (decl) = 1;
8855 TREE_STATIC (decl) = 1;
8856 DECL_IGNORED_P (decl) = 1;
8857
8858 #if TARGET_MACHO
8859 if (TARGET_MACHO)
8860 {
8861 switch_to_section (darwin_sections[text_coal_section]);
8862 fputs ("\t.weak_definition\t", asm_out_file);
8863 assemble_name (asm_out_file, name);
8864 fputs ("\n\t.private_extern\t", asm_out_file);
8865 assemble_name (asm_out_file, name);
8866 putc ('\n', asm_out_file);
8867 ASM_OUTPUT_LABEL (asm_out_file, name);
8868 DECL_WEAK (decl) = 1;
8869 }
8870 else
8871 #endif
8872 if (USE_HIDDEN_LINKONCE)
8873 {
8874 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8875
8876 targetm.asm_out.unique_section (decl, 0);
8877 switch_to_section (get_named_section (decl, NULL, 0));
8878
8879 targetm.asm_out.globalize_label (asm_out_file, name);
8880 fputs ("\t.hidden\t", asm_out_file);
8881 assemble_name (asm_out_file, name);
8882 putc ('\n', asm_out_file);
8883 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8884 }
8885 else
8886 {
8887 switch_to_section (text_section);
8888 ASM_OUTPUT_LABEL (asm_out_file, name);
8889 }
8890
8891 DECL_INITIAL (decl) = make_node (BLOCK);
8892 current_function_decl = decl;
8893 init_function_start (decl);
8894 first_function_block_is_cold = false;
8895 /* Make sure unwind info is emitted for the thunk if needed. */
8896 final_start_function (emit_barrier (), asm_out_file, 1);
8897
8898 /* Pad stack IP move with 4 instructions (two NOPs count
8899 as one instruction). */
8900 if (TARGET_PAD_SHORT_FUNCTION)
8901 {
8902 int i = 8;
8903
8904 while (i--)
8905 fputs ("\tnop\n", asm_out_file);
8906 }
8907
8908 xops[0] = gen_rtx_REG (Pmode, regno);
8909 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8910 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8911 fputs ("\tret\n", asm_out_file);
8912 final_end_function ();
8913 init_insn_lengths ();
8914 free_after_compilation (cfun);
8915 set_cfun (NULL);
8916 current_function_decl = NULL;
8917 }
8918
8919 if (flag_split_stack)
8920 file_end_indicate_split_stack ();
8921 }
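/* Illustratively (32-bit, thunk for %ebx), each thunk emitted above is
   roughly just:

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   optionally preceded by the nop padding when TARGET_PAD_SHORT_FUNCTION
   is in effect.  */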
8922
8923 /* Emit code for the SET_GOT patterns. */
8924
8925 const char *
8926 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8927 {
8928 rtx xops[3];
8929
8930 xops[0] = dest;
8931
8932 if (TARGET_VXWORKS_RTP && flag_pic)
8933 {
8934 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8935 xops[2] = gen_rtx_MEM (Pmode,
8936 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8937 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8938
8939 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8940 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8941 an unadorned address. */
8942 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8943 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8944 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8945 return "";
8946 }
8947
8948 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8949
8950 if (!flag_pic)
8951 {
8952 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8953
8954 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8955
8956 #if TARGET_MACHO
8957 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8958 is what will be referenced by the Mach-O PIC subsystem. */
8959 if (!label)
8960 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8961 #endif
8962
8963 targetm.asm_out.internal_label (asm_out_file, "L",
8964 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8965 }
8966 else
8967 {
8968 char name[32];
8969 get_pc_thunk_name (name, REGNO (dest));
8970 pic_labels_used |= 1 << REGNO (dest);
8971
8972 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8973 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8974 output_asm_insn ("call\t%X2", xops);
8975 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8976 is what will be referenced by the Mach-O PIC subsystem. */
8977 #if TARGET_MACHO
8978 if (!label)
8979 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8980 else
8981 targetm.asm_out.internal_label (asm_out_file, "L",
8982 CODE_LABEL_NUMBER (label));
8983 #endif
8984 }
8985
8986 if (!TARGET_MACHO)
8987 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8988
8989 return "";
8990 }
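/* Roughly, the non-Darwin -fpic case above produces the classic ia32 GOT
   setup sequence, e.g. for %ebx:

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   where the assembler resolves the GOT-relative addend.  */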
8991
8992 /* Generate an "push" pattern for input ARG. */
8993
8994 static rtx
8995 gen_push (rtx arg)
8996 {
8997 struct machine_function *m = cfun->machine;
8998
8999 if (m->fs.cfa_reg == stack_pointer_rtx)
9000 m->fs.cfa_offset += UNITS_PER_WORD;
9001 m->fs.sp_offset += UNITS_PER_WORD;
9002
9003 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9004 arg = gen_rtx_REG (word_mode, REGNO (arg));
9005
9006 return gen_rtx_SET (VOIDmode,
9007 gen_rtx_MEM (word_mode,
9008 gen_rtx_PRE_DEC (Pmode,
9009 stack_pointer_rtx)),
9010 arg);
9011 }
9012
9013 /* Generate an "pop" pattern for input ARG. */
9014
9015 static rtx
9016 gen_pop (rtx arg)
9017 {
9018 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9019 arg = gen_rtx_REG (word_mode, REGNO (arg));
9020
9021 return gen_rtx_SET (VOIDmode,
9022 arg,
9023 gen_rtx_MEM (word_mode,
9024 gen_rtx_POST_INC (Pmode,
9025 stack_pointer_rtx)));
9026 }
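/* As an illustration, on ia32 (word_mode == Pmode == SImode) the two
   helpers above build RTL of the shape

       (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI src))    -- push
       (set (reg:SI dst) (mem:SI (post_inc:SI (reg:SI sp))))   -- pop

   Note that neither helper emits the insn itself; callers wrap the
   result in emit_insn.  */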
9027
9028 /* Return the regno of an unused call-clobbered register available for
9029 the entire function, or INVALID_REGNUM if there is none. */
9030
9031 static unsigned int
9032 ix86_select_alt_pic_regnum (void)
9033 {
9034 if (crtl->is_leaf
9035 && !crtl->profile
9036 && !ix86_current_function_calls_tls_descriptor)
9037 {
9038 int i, drap;
9039 /* Can't use the same register for both PIC and DRAP. */
9040 if (crtl->drap_reg)
9041 drap = REGNO (crtl->drap_reg);
9042 else
9043 drap = -1;
9044 for (i = 2; i >= 0; --i)
9045 if (i != drap && !df_regs_ever_live_p (i))
9046 return i;
9047 }
9048
9049 return INVALID_REGNUM;
9050 }
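/* A note on the loop above: with the i386 hard register numbering
   (ax = 0, dx = 1, cx = 2), the candidates are tried in the order
   %ecx, %edx, %eax.  */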
9051
9052 /* Return TRUE if we need to save REGNO. */
9053
9054 static bool
9055 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9056 {
9057 if (pic_offset_table_rtx
9058 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9059 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9060 || crtl->profile
9061 || crtl->calls_eh_return
9062 || crtl->uses_const_pool))
9063 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9064
9065 if (crtl->calls_eh_return && maybe_eh_return)
9066 {
9067 unsigned i;
9068 for (i = 0; ; i++)
9069 {
9070 unsigned test = EH_RETURN_DATA_REGNO (i);
9071 if (test == INVALID_REGNUM)
9072 break;
9073 if (test == regno)
9074 return true;
9075 }
9076 }
9077
9078 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9079 return true;
9080
9081 return (df_regs_ever_live_p (regno)
9082 && !call_used_regs[regno]
9083 && !fixed_regs[regno]
9084 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9085 }
9086
9087 /* Return number of saved general purpose registers. */
9088
9089 static int
9090 ix86_nsaved_regs (void)
9091 {
9092 int nregs = 0;
9093 int regno;
9094
9095 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9096 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9097 nregs ++;
9098 return nregs;
9099 }
9100
9101 /* Return number of saved SSE registers. */
9102
9103 static int
9104 ix86_nsaved_sseregs (void)
9105 {
9106 int nregs = 0;
9107 int regno;
9108
9109 if (!TARGET_64BIT_MS_ABI)
9110 return 0;
9111 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9112 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9113 nregs ++;
9114 return nregs;
9115 }
9116
9117 /* Given FROM and TO register numbers, say whether this elimination is
9118 allowed. If stack alignment is needed, we can only replace argument
9119 pointer with hard frame pointer, or replace frame pointer with stack
9120 pointer. Otherwise, frame pointer elimination is automatically
9121 handled and all other eliminations are valid. */
9122
9123 static bool
9124 ix86_can_eliminate (const int from, const int to)
9125 {
9126 if (stack_realign_fp)
9127 return ((from == ARG_POINTER_REGNUM
9128 && to == HARD_FRAME_POINTER_REGNUM)
9129 || (from == FRAME_POINTER_REGNUM
9130 && to == STACK_POINTER_REGNUM));
9131 else
9132 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9133 }
9134
9135 /* Return the offset between two registers, one to be eliminated, and the other
9136 its replacement, at the start of a routine. */
9137
9138 HOST_WIDE_INT
9139 ix86_initial_elimination_offset (int from, int to)
9140 {
9141 struct ix86_frame frame;
9142 ix86_compute_frame_layout (&frame);
9143
9144 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9145 return frame.hard_frame_pointer_offset;
9146 else if (from == FRAME_POINTER_REGNUM
9147 && to == HARD_FRAME_POINTER_REGNUM)
9148 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9149 else
9150 {
9151 gcc_assert (to == STACK_POINTER_REGNUM);
9152
9153 if (from == ARG_POINTER_REGNUM)
9154 return frame.stack_pointer_offset;
9155
9156 gcc_assert (from == FRAME_POINTER_REGNUM);
9157 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9158 }
9159 }
9160
9161 /* In a dynamically-aligned function, we can't know the offset from
9162 stack pointer to frame pointer, so we must ensure that setjmp
9163 eliminates fp against the hard fp (%ebp) rather than trying to
9164 index from %esp up to the top of the frame across a gap that is
9165 of unknown (at compile-time) size. */
9166 static rtx
9167 ix86_builtin_setjmp_frame_value (void)
9168 {
9169 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9170 }
9171
9172 /* When using -fsplit-stack, the allocation routines set a field in
9173 the TCB to the bottom of the stack plus this much space, measured
9174 in bytes. */
9175
9176 #define SPLIT_STACK_AVAILABLE 256
9177
9178 /* Fill structure ix86_frame about frame of currently computed function. */
9179
9180 static void
9181 ix86_compute_frame_layout (struct ix86_frame *frame)
9182 {
9183 unsigned HOST_WIDE_INT stack_alignment_needed;
9184 HOST_WIDE_INT offset;
9185 unsigned HOST_WIDE_INT preferred_alignment;
9186 HOST_WIDE_INT size = get_frame_size ();
9187 HOST_WIDE_INT to_allocate;
9188
9189 frame->nregs = ix86_nsaved_regs ();
9190 frame->nsseregs = ix86_nsaved_sseregs ();
9191
9192 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9193 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9194
9195 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9196 except in function prologues and in leaf functions. */
9197 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9198 && (!crtl->is_leaf || cfun->calls_alloca != 0
9199 || ix86_current_function_calls_tls_descriptor))
9200 {
9201 preferred_alignment = 16;
9202 stack_alignment_needed = 16;
9203 crtl->preferred_stack_boundary = 128;
9204 crtl->stack_alignment_needed = 128;
9205 }
9206
9207 gcc_assert (!size || stack_alignment_needed);
9208 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9209 gcc_assert (preferred_alignment <= stack_alignment_needed);
9210
9211 /* For SEH we have to limit the amount of code movement into the prologue.
9212 At present we do this via a BLOCKAGE, at which point there's very little
9213 scheduling that can be done, which means that there's very little point
9214 in doing anything except PUSHs. */
9215 if (TARGET_SEH)
9216 cfun->machine->use_fast_prologue_epilogue = false;
9217
9218 /* During reload iteration the number of registers saved can change.
9219 Recompute the value as needed. Do not recompute when the number of
9220 registers didn't change, as reload makes multiple calls to this function
9221 and does not expect the decision to change within a single iteration. */
9222 else if (!optimize_function_for_size_p (cfun)
9223 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9224 {
9225 int count = frame->nregs;
9226 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9227
9228 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9229
9230 /* The fast prologue uses move instead of push to save registers. This
9231 is significantly longer, but also executes faster as modern hardware
9232 can execute the moves in parallel, but can't do that for push/pop.
9233
9234 Be careful about choosing which prologue to emit: when the function
9235 takes many instructions to execute we may as well use the slow version,
9236 and likewise when the function is known to be outside a hot spot (which
9237 is known only with profile feedback). Weight the size of the function
9238 by the number of registers to save, as it is cheap to use one or two
9239 push instructions but very slow to use many of them. */
9240 if (count)
9241 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9242 if (node->frequency < NODE_FREQUENCY_NORMAL
9243 || (flag_branch_probabilities
9244 && node->frequency < NODE_FREQUENCY_HOT))
9245 cfun->machine->use_fast_prologue_epilogue = false;
9246 else
9247 cfun->machine->use_fast_prologue_epilogue
9248 = !expensive_function_p (count);
9249 }
9250
9251 frame->save_regs_using_mov
9252 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9253 /* If static stack checking is enabled and done with probes,
9254 the registers need to be saved before allocating the frame. */
9255 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9256
9257 /* Skip return address. */
9258 offset = UNITS_PER_WORD;
9259
9260 /* Skip pushed static chain. */
9261 if (ix86_static_chain_on_stack)
9262 offset += UNITS_PER_WORD;
9263
9264 /* Skip saved base pointer. */
9265 if (frame_pointer_needed)
9266 offset += UNITS_PER_WORD;
9267 frame->hfp_save_offset = offset;
9268
9269 /* The traditional frame pointer location is at the top of the frame. */
9270 frame->hard_frame_pointer_offset = offset;
9271
9272 /* Register save area */
9273 offset += frame->nregs * UNITS_PER_WORD;
9274 frame->reg_save_offset = offset;
9275
9276 /* On SEH target, registers are pushed just before the frame pointer
9277 location. */
9278 if (TARGET_SEH)
9279 frame->hard_frame_pointer_offset = offset;
9280
9281 /* Align and set SSE register save area. */
9282 if (frame->nsseregs)
9283 {
9284 /* The only ABI that has saved SSE registers (Win64) also has a
9285 16-byte aligned default stack, and thus we don't need to be
9286 within the re-aligned local stack frame to save them. */
9287 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9288 offset = (offset + 16 - 1) & -16;
9289 offset += frame->nsseregs * 16;
9290 }
9291 frame->sse_reg_save_offset = offset;
9292
9293 /* The re-aligned stack starts here. Values before this point are not
9294 directly comparable with values below this point. In order to make
9295 sure that no value happens to be the same before and after, force
9296 the alignment computation below to add a non-zero value. */
9297 if (stack_realign_fp)
9298 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9299
9300 /* Va-arg area */
9301 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9302 offset += frame->va_arg_size;
9303
9304 /* Align start of frame for local function. */
9305 if (stack_realign_fp
9306 || offset != frame->sse_reg_save_offset
9307 || size != 0
9308 || !crtl->is_leaf
9309 || cfun->calls_alloca
9310 || ix86_current_function_calls_tls_descriptor)
9311 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9312
9313 /* Frame pointer points here. */
9314 frame->frame_pointer_offset = offset;
9315
9316 offset += size;
9317
9318 /* Add the outgoing arguments area. It can be skipped if we eliminated
9319 all the function calls as dead code.
9320 Skipping is however impossible when the function calls alloca, as the
9321 alloca expander assumes that the last crtl->outgoing_args_size bytes
9322 of the stack frame are unused. */
9323 if (ACCUMULATE_OUTGOING_ARGS
9324 && (!crtl->is_leaf || cfun->calls_alloca
9325 || ix86_current_function_calls_tls_descriptor))
9326 {
9327 offset += crtl->outgoing_args_size;
9328 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9329 }
9330 else
9331 frame->outgoing_arguments_size = 0;
9332
9333 /* Align stack boundary. Only needed if we're calling another function
9334 or using alloca. */
9335 if (!crtl->is_leaf || cfun->calls_alloca
9336 || ix86_current_function_calls_tls_descriptor)
9337 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9338
9339 /* We've reached the end of the stack frame. */
9340 frame->stack_pointer_offset = offset;
9341
9342 /* Size the prologue needs to allocate. */
9343 to_allocate = offset - frame->sse_reg_save_offset;
9344
9345 if ((!to_allocate && frame->nregs <= 1)
9346 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9347 frame->save_regs_using_mov = false;
9348
9349 if (ix86_using_red_zone ()
9350 && crtl->sp_is_unchanging
9351 && crtl->is_leaf
9352 && !ix86_current_function_calls_tls_descriptor)
9353 {
9354 frame->red_zone_size = to_allocate;
9355 if (frame->save_regs_using_mov)
9356 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9357 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9358 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9359 }
9360 else
9361 frame->red_zone_size = 0;
9362 frame->stack_pointer_offset -= frame->red_zone_size;
9363
9364 /* The SEH frame pointer location is near the bottom of the frame.
9365 This is enforced by the fact that the difference between the
9366 stack pointer and the frame pointer is limited to 240 bytes in
9367 the unwind data structure. */
9368 if (TARGET_SEH)
9369 {
9370 HOST_WIDE_INT diff;
9371
9372 /* If we can leave the frame pointer where it is, do so. This also
9373 returns the establisher frame for __builtin_frame_address (0). */
9374 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9375 if (diff <= SEH_MAX_FRAME_SIZE
9376 && (diff > 240 || (diff & 15) != 0)
9377 && !crtl->accesses_prior_frames)
9378 {
9379 /* Ideally we'd determine what portion of the local stack frame
9380 (within the constraint of the lowest 240) is most heavily used.
9381 But without that complication, simply bias the frame pointer
9382 by 128 bytes so as to maximize the amount of the local stack
9383 frame that is addressable with 8-bit offsets. */
9384 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9385 }
9386 }
9387 }
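/* A rough picture of the layout computed above, going from the CFA
   (the stack pointer at function entry) toward lower addresses:

       return address
       pushed static chain, if any
       saved frame pointer, if any
       GP register save area
       SSE register save area (16-byte aligned), if any
       va-arg register save area, if any
       local variables
       outgoing argument area, if any
       red zone, below the final stack pointer

   The *_offset fields filled in above record the boundaries of these
   regions as distances from the CFA; SEH and stack re-alignment adjust
   the frame pointer placement as noted in the code.  */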
9388
9389 /* This is semi-inlined memory_address_length, but simplified
9390 since we know that we're always dealing with reg+offset, and
9391 to avoid having to create and discard all that rtl. */
9392
9393 static inline int
9394 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9395 {
9396 int len = 4;
9397
9398 if (offset == 0)
9399 {
9400 /* EBP and R13 cannot be encoded without an offset. */
9401 len = (regno == BP_REG || regno == R13_REG);
9402 }
9403 else if (IN_RANGE (offset, -128, 127))
9404 len = 1;
9405
9406 /* ESP and R12 must be encoded with a SIB byte. */
9407 if (regno == SP_REG || regno == R12_REG)
9408 len++;
9409
9410 return len;
9411 }
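/* A few illustrative values for the count above: 8(%ebp) -> 1 (disp8),
   0(%ebp) -> 1 (ebp always needs a displacement), 0(%esp) -> 1 (SIB
   byte), 1024(%esp) -> 5 (disp32 plus SIB); i.e. it models only the
   extra displacement/SIB bytes beyond the basic ModRM encoding.  */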
9412
9413 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9414 The valid base registers are taken from CFUN->MACHINE->FS. */
9415
9416 static rtx
9417 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9418 {
9419 const struct machine_function *m = cfun->machine;
9420 rtx base_reg = NULL;
9421 HOST_WIDE_INT base_offset = 0;
9422
9423 if (m->use_fast_prologue_epilogue)
9424 {
9425 /* Choose the base register most likely to allow the most scheduling
9426 opportunities. Generally FP is valid throughout the function,
9427 while DRAP must be reloaded within the epilogue. But choose either
9428 over the SP due to increased encoding size. */
9429
9430 if (m->fs.fp_valid)
9431 {
9432 base_reg = hard_frame_pointer_rtx;
9433 base_offset = m->fs.fp_offset - cfa_offset;
9434 }
9435 else if (m->fs.drap_valid)
9436 {
9437 base_reg = crtl->drap_reg;
9438 base_offset = 0 - cfa_offset;
9439 }
9440 else if (m->fs.sp_valid)
9441 {
9442 base_reg = stack_pointer_rtx;
9443 base_offset = m->fs.sp_offset - cfa_offset;
9444 }
9445 }
9446 else
9447 {
9448 HOST_WIDE_INT toffset;
9449 int len = 16, tlen;
9450
9451 /* Choose the base register with the smallest address encoding.
9452 With a tie, choose FP > DRAP > SP. */
9453 if (m->fs.sp_valid)
9454 {
9455 base_reg = stack_pointer_rtx;
9456 base_offset = m->fs.sp_offset - cfa_offset;
9457 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9458 }
9459 if (m->fs.drap_valid)
9460 {
9461 toffset = 0 - cfa_offset;
9462 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9463 if (tlen <= len)
9464 {
9465 base_reg = crtl->drap_reg;
9466 base_offset = toffset;
9467 len = tlen;
9468 }
9469 }
9470 if (m->fs.fp_valid)
9471 {
9472 toffset = m->fs.fp_offset - cfa_offset;
9473 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9474 if (tlen <= len)
9475 {
9476 base_reg = hard_frame_pointer_rtx;
9477 base_offset = toffset;
9478 len = tlen;
9479 }
9480 }
9481 }
9482 gcc_assert (base_reg != NULL);
9483
9484 return plus_constant (Pmode, base_reg, base_offset);
9485 }
9486
9487 /* Emit code to save registers in the prologue. */
9488
9489 static void
9490 ix86_emit_save_regs (void)
9491 {
9492 unsigned int regno;
9493 rtx insn;
9494
9495 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9496 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9497 {
9498 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9499 RTX_FRAME_RELATED_P (insn) = 1;
9500 }
9501 }
9502
9503 /* Emit a single register save at CFA - CFA_OFFSET. */
9504
9505 static void
9506 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9507 HOST_WIDE_INT cfa_offset)
9508 {
9509 struct machine_function *m = cfun->machine;
9510 rtx reg = gen_rtx_REG (mode, regno);
9511 rtx mem, addr, base, insn;
9512
9513 addr = choose_baseaddr (cfa_offset);
9514 mem = gen_frame_mem (mode, addr);
9515
9516 /* For SSE saves, we need to indicate the 128-bit alignment. */
9517 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9518
9519 insn = emit_move_insn (mem, reg);
9520 RTX_FRAME_RELATED_P (insn) = 1;
9521
9522 base = addr;
9523 if (GET_CODE (base) == PLUS)
9524 base = XEXP (base, 0);
9525 gcc_checking_assert (REG_P (base));
9526
9527 /* When saving registers into a re-aligned local stack frame, avoid
9528 any tricky guessing by dwarf2out. */
9529 if (m->fs.realigned)
9530 {
9531 gcc_checking_assert (stack_realign_drap);
9532
9533 if (regno == REGNO (crtl->drap_reg))
9534 {
9535 /* A bit of a hack. We force the DRAP register to be saved in
9536 the re-aligned stack frame, which provides us with a copy
9537 of the CFA that will last past the prologue. Install it. */
9538 gcc_checking_assert (cfun->machine->fs.fp_valid);
9539 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9540 cfun->machine->fs.fp_offset - cfa_offset);
9541 mem = gen_rtx_MEM (mode, addr);
9542 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9543 }
9544 else
9545 {
9546 /* The frame pointer is a stable reference within the
9547 aligned frame. Use it. */
9548 gcc_checking_assert (cfun->machine->fs.fp_valid);
9549 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9550 cfun->machine->fs.fp_offset - cfa_offset);
9551 mem = gen_rtx_MEM (mode, addr);
9552 add_reg_note (insn, REG_CFA_EXPRESSION,
9553 gen_rtx_SET (VOIDmode, mem, reg));
9554 }
9555 }
9556
9557 /* The memory may not be relative to the current CFA register,
9558 which means that we may need to generate a new pattern for
9559 use by the unwind info. */
9560 else if (base != m->fs.cfa_reg)
9561 {
9562 addr = plus_constant (Pmode, m->fs.cfa_reg,
9563 m->fs.cfa_offset - cfa_offset);
9564 mem = gen_rtx_MEM (mode, addr);
9565 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9566 }
9567 }
9568
9569 /* Emit code to save registers using MOV insns.
9570 First register is stored at CFA - CFA_OFFSET. */
9571 static void
9572 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9573 {
9574 unsigned int regno;
9575
9576 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9577 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9578 {
9579 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9580 cfa_offset -= UNITS_PER_WORD;
9581 }
9582 }
9583
9584 /* Emit code to save SSE registers using MOV insns.
9585 First register is stored at CFA - CFA_OFFSET. */
9586 static void
9587 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9588 {
9589 unsigned int regno;
9590
9591 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9592 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9593 {
9594 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9595 cfa_offset -= 16;
9596 }
9597 }
9598
9599 static GTY(()) rtx queued_cfa_restores;
9600
9601 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9602 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9603 Don't add the note if the previously saved value will be left untouched
9604 within the stack red zone until return, as unwinders can find the same
9605 value in the register and on the stack. */
9606
9607 static void
9608 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9609 {
9610 if (!crtl->shrink_wrapped
9611 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9612 return;
9613
9614 if (insn)
9615 {
9616 add_reg_note (insn, REG_CFA_RESTORE, reg);
9617 RTX_FRAME_RELATED_P (insn) = 1;
9618 }
9619 else
9620 queued_cfa_restores
9621 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9622 }
9623
9624 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9625
9626 static void
9627 ix86_add_queued_cfa_restore_notes (rtx insn)
9628 {
9629 rtx last;
9630 if (!queued_cfa_restores)
9631 return;
9632 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9633 ;
9634 XEXP (last, 1) = REG_NOTES (insn);
9635 REG_NOTES (insn) = queued_cfa_restores;
9636 queued_cfa_restores = NULL_RTX;
9637 RTX_FRAME_RELATED_P (insn) = 1;
9638 }
9639
9640 /* Expand prologue or epilogue stack adjustment.
9641 The pattern exists to put a dependency on all ebp-based memory accesses.
9642 STYLE should be negative if instructions should be marked as frame related,
9643 zero if %r11 register is live and cannot be freely used and positive
9644 otherwise. */
9645
9646 static void
9647 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9648 int style, bool set_cfa)
9649 {
9650 struct machine_function *m = cfun->machine;
9651 rtx insn;
9652 bool add_frame_related_expr = false;
9653
9654 if (Pmode == SImode)
9655 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9656 else if (x86_64_immediate_operand (offset, DImode))
9657 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9658 else
9659 {
9660 rtx tmp;
9661 /* r11 is used by indirect sibcall return as well, set before the
9662 epilogue and used after the epilogue. */
9663 if (style)
9664 tmp = gen_rtx_REG (DImode, R11_REG);
9665 else
9666 {
9667 gcc_assert (src != hard_frame_pointer_rtx
9668 && dest != hard_frame_pointer_rtx);
9669 tmp = hard_frame_pointer_rtx;
9670 }
9671 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9672 if (style < 0)
9673 add_frame_related_expr = true;
9674
9675 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9676 }
9677
9678 insn = emit_insn (insn);
9679 if (style >= 0)
9680 ix86_add_queued_cfa_restore_notes (insn);
9681
9682 if (set_cfa)
9683 {
9684 rtx r;
9685
9686 gcc_assert (m->fs.cfa_reg == src);
9687 m->fs.cfa_offset += INTVAL (offset);
9688 m->fs.cfa_reg = dest;
9689
9690 r = gen_rtx_PLUS (Pmode, src, offset);
9691 r = gen_rtx_SET (VOIDmode, dest, r);
9692 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9693 RTX_FRAME_RELATED_P (insn) = 1;
9694 }
9695 else if (style < 0)
9696 {
9697 RTX_FRAME_RELATED_P (insn) = 1;
9698 if (add_frame_related_expr)
9699 {
9700 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9701 r = gen_rtx_SET (VOIDmode, dest, r);
9702 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9703 }
9704 }
9705
9706 if (dest == stack_pointer_rtx)
9707 {
9708 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9709 bool valid = m->fs.sp_valid;
9710
9711 if (src == hard_frame_pointer_rtx)
9712 {
9713 valid = m->fs.fp_valid;
9714 ooffset = m->fs.fp_offset;
9715 }
9716 else if (src == crtl->drap_reg)
9717 {
9718 valid = m->fs.drap_valid;
9719 ooffset = 0;
9720 }
9721 else
9722 {
9723 /* Else there are two possibilities: SP itself, which we set
9724 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9725 handled by hand along the eh_return path. */
9726 gcc_checking_assert (src == stack_pointer_rtx
9727 || offset == const0_rtx);
9728 }
9729
9730 m->fs.sp_offset = ooffset - INTVAL (offset);
9731 m->fs.sp_valid = valid;
9732 }
9733 }
9734
9735 /* Find an available register to be used as the dynamic realign argument
9736 pointer register. Such a register will be written in the prologue and
9737 used at the beginning of the body, so it must not be
9738 1. a parameter passing register.
9739 2. the GOT pointer.
9740 We reuse the static-chain register if it is available. Otherwise, we
9741 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9742 shorter encoding.
9743
9744 Return: the regno of the chosen register. */
9745
9746 static unsigned int
9747 find_drap_reg (void)
9748 {
9749 tree decl = cfun->decl;
9750
9751 if (TARGET_64BIT)
9752 {
9753 /* Use R13 for a nested function or a function that needs a static
9754 chain. Since a function with tail calls may use any caller-saved
9755 register in its epilogue, DRAP must not use a caller-saved
9756 register in that case. */
9757 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9758 return R13_REG;
9759
9760 return R10_REG;
9761 }
9762 else
9763 {
9764 /* Use DI for a nested function or a function that needs a static
9765 chain. Since a function with tail calls may use any caller-saved
9766 register in its epilogue, DRAP must not use a caller-saved
9767 register in that case. */
9768 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9769 return DI_REG;
9770
9771 /* Reuse static chain register if it isn't used for parameter
9772 passing. */
9773 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9774 {
9775 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9776 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9777 return CX_REG;
9778 }
9779 return DI_REG;
9780 }
9781 }
9782
9783 /* Return minimum incoming stack alignment. */
9784
9785 static unsigned int
9786 ix86_minimum_incoming_stack_boundary (bool sibcall)
9787 {
9788 unsigned int incoming_stack_boundary;
9789
9790 /* Prefer the one specified at command line. */
9791 if (ix86_user_incoming_stack_boundary)
9792 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9793 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9794 if -mstackrealign is used, this is not a sibcall check, and the
9795 estimated stack alignment is 128 bits. */
9796 else if (!sibcall
9797 && !TARGET_64BIT
9798 && ix86_force_align_arg_pointer
9799 && crtl->stack_alignment_estimated == 128)
9800 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9801 else
9802 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9803
9804 /* Incoming stack alignment can be changed on individual functions
9805 via force_align_arg_pointer attribute. We use the smallest
9806 incoming stack boundary. */
9807 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9808 && lookup_attribute (ix86_force_align_arg_pointer_string,
9809 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9810 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9811
9812 /* The incoming stack frame has to be aligned at least at
9813 parm_stack_boundary. */
9814 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9815 incoming_stack_boundary = crtl->parm_stack_boundary;
9816
9817 /* Stack at entrance of main is aligned by runtime. We use the
9818 smallest incoming stack boundary. */
9819 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9820 && DECL_NAME (current_function_decl)
9821 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9822 && DECL_FILE_SCOPE_P (current_function_decl))
9823 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9824
9825 return incoming_stack_boundary;
9826 }
9827
9828 /* Update incoming stack boundary and estimated stack alignment. */
9829
9830 static void
9831 ix86_update_stack_boundary (void)
9832 {
9833 ix86_incoming_stack_boundary
9834 = ix86_minimum_incoming_stack_boundary (false);
9835
9836 /* x86_64 varargs need 16-byte stack alignment for the register save
9837 area. */
9838 if (TARGET_64BIT
9839 && cfun->stdarg
9840 && crtl->stack_alignment_estimated < 128)
9841 crtl->stack_alignment_estimated = 128;
9842 }
9843
9844 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9845 needed or an rtx for DRAP otherwise. */
9846
9847 static rtx
9848 ix86_get_drap_rtx (void)
9849 {
9850 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9851 crtl->need_drap = true;
9852
9853 if (stack_realign_drap)
9854 {
9855 /* Assign DRAP to vDRAP and return vDRAP. */
9856 unsigned int regno = find_drap_reg ();
9857 rtx drap_vreg;
9858 rtx arg_ptr;
9859 rtx seq, insn;
9860
9861 arg_ptr = gen_rtx_REG (Pmode, regno);
9862 crtl->drap_reg = arg_ptr;
9863
9864 start_sequence ();
9865 drap_vreg = copy_to_reg (arg_ptr);
9866 seq = get_insns ();
9867 end_sequence ();
9868
9869 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9870 if (!optimize)
9871 {
9872 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9873 RTX_FRAME_RELATED_P (insn) = 1;
9874 }
9875 return drap_vreg;
9876 }
9877 else
9878 return NULL;
9879 }
9880
9881 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9882
9883 static rtx
9884 ix86_internal_arg_pointer (void)
9885 {
9886 return virtual_incoming_args_rtx;
9887 }
9888
9889 struct scratch_reg {
9890 rtx reg;
9891 bool saved;
9892 };
9893
9894 /* Return a short-lived scratch register for use on function entry.
9895 In 32-bit mode, it is valid only after the registers are saved
9896 in the prologue. This register must be released by means of
9897 release_scratch_register_on_entry once it is dead. */
9898
9899 static void
9900 get_scratch_register_on_entry (struct scratch_reg *sr)
9901 {
9902 int regno;
9903
9904 sr->saved = false;
9905
9906 if (TARGET_64BIT)
9907 {
9908 /* We always use R11 in 64-bit mode. */
9909 regno = R11_REG;
9910 }
9911 else
9912 {
9913 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9914 bool fastcall_p
9915 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9916 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9917 int regparm = ix86_function_regparm (fntype, decl);
9918 int drap_regno
9919 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9920
9921 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9922 for the static chain register. */
9923 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9924 && drap_regno != AX_REG)
9925 regno = AX_REG;
9926 else if (regparm < 2 && drap_regno != DX_REG)
9927 regno = DX_REG;
9928 /* ecx is the static chain register. */
9929 else if (regparm < 3 && !fastcall_p && !static_chain_p
9930 && drap_regno != CX_REG)
9931 regno = CX_REG;
9932 else if (ix86_save_reg (BX_REG, true))
9933 regno = BX_REG;
9934 /* esi is the static chain register. */
9935 else if (!(regparm == 3 && static_chain_p)
9936 && ix86_save_reg (SI_REG, true))
9937 regno = SI_REG;
9938 else if (ix86_save_reg (DI_REG, true))
9939 regno = DI_REG;
9940 else
9941 {
9942 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9943 sr->saved = true;
9944 }
9945 }
9946
9947 sr->reg = gen_rtx_REG (Pmode, regno);
9948 if (sr->saved)
9949 {
9950 rtx insn = emit_insn (gen_push (sr->reg));
9951 RTX_FRAME_RELATED_P (insn) = 1;
9952 }
9953 }
9954
9955 /* Release a scratch register obtained from the preceding function. */
9956
9957 static void
9958 release_scratch_register_on_entry (struct scratch_reg *sr)
9959 {
9960 if (sr->saved)
9961 {
9962 rtx x, insn = emit_insn (gen_pop (sr->reg));
9963
9964 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9965 RTX_FRAME_RELATED_P (insn) = 1;
9966 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9967 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9968 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9969 }
9970 }
9971
9972 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
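/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12 this
   makes PROBE_INTERVAL 4096 bytes, i.e. one probe per page.  */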
9973
9974 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9975
9976 static void
9977 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9978 {
9979 /* We skip the probe for the first interval + a small dope of 4 words and
9980 probe that many bytes past the specified size to maintain a protection
9981 area at the bottom of the stack. */
9982 const int dope = 4 * UNITS_PER_WORD;
9983 rtx size_rtx = GEN_INT (size), last;
9984
9985 /* See if we have a constant small number of probes to generate. If so,
9986 that's the easy case. The run-time loop is made up of 11 insns in the
9987 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9988 for n # of intervals. */
9989 if (size <= 5 * PROBE_INTERVAL)
9990 {
9991 HOST_WIDE_INT i, adjust;
9992 bool first_probe = true;
9993
9994 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9995 values of N from 1 until it exceeds SIZE. If only one probe is
9996 needed, this will not generate any code. Then adjust and probe
9997 to PROBE_INTERVAL + SIZE. */
9998 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9999 {
10000 if (first_probe)
10001 {
10002 adjust = 2 * PROBE_INTERVAL + dope;
10003 first_probe = false;
10004 }
10005 else
10006 adjust = PROBE_INTERVAL;
10007
10008 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10009 plus_constant (Pmode, stack_pointer_rtx,
10010 -adjust)));
10011 emit_stack_probe (stack_pointer_rtx);
10012 }
10013
10014 if (first_probe)
10015 adjust = size + PROBE_INTERVAL + dope;
10016 else
10017 adjust = size + PROBE_INTERVAL - i;
10018
10019 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10020 plus_constant (Pmode, stack_pointer_rtx,
10021 -adjust)));
10022 emit_stack_probe (stack_pointer_rtx);
10023
10024 /* Adjust back to account for the additional first interval. */
10025 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10026 plus_constant (Pmode, stack_pointer_rtx,
10027 PROBE_INTERVAL + dope)));
10028 }
10029
10030 /* Otherwise, do the same as above, but in a loop. Note that we must be
10031 extra careful with variables wrapping around because we might be at
10032 the very top (or the very bottom) of the address space and we have
10033 to be able to handle this case properly; in particular, we use an
10034 equality test for the loop condition. */
10035 else
10036 {
10037 HOST_WIDE_INT rounded_size;
10038 struct scratch_reg sr;
10039
10040 get_scratch_register_on_entry (&sr);
10041
10042
10043 /* Step 1: round SIZE to the previous multiple of the interval. */
10044
10045 rounded_size = size & -PROBE_INTERVAL;
10046
10047
10048 /* Step 2: compute initial and final value of the loop counter. */
10049
10050 /* SP = SP_0 + PROBE_INTERVAL. */
10051 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10052 plus_constant (Pmode, stack_pointer_rtx,
10053 - (PROBE_INTERVAL + dope))));
10054
10055 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10056 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10057 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10058 gen_rtx_PLUS (Pmode, sr.reg,
10059 stack_pointer_rtx)));
10060
10061
10062 /* Step 3: the loop
10063
10064 while (SP != LAST_ADDR)
10065 {
10066 SP = SP + PROBE_INTERVAL
10067 probe at SP
10068 }
10069
10070 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10071 values of N from 1 until it is equal to ROUNDED_SIZE. */
10072
10073 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10074
10075
10076 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10077 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10078
10079 if (size != rounded_size)
10080 {
10081 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10082 plus_constant (Pmode, stack_pointer_rtx,
10083 rounded_size - size)));
10084 emit_stack_probe (stack_pointer_rtx);
10085 }
10086
10087 /* Adjust back to account for the additional first interval. */
10088 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10089 plus_constant (Pmode, stack_pointer_rtx,
10090 PROBE_INTERVAL + dope)));
10091
10092 release_scratch_register_on_entry (&sr);
10093 }
10094
10095 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10096
10097 /* Even if the stack pointer isn't the CFA register, we need to correctly
10098 describe the adjustments made to it, in particular differentiate the
10099 frame-related ones from the frame-unrelated ones. */
10100 if (size > 0)
10101 {
10102 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10103 XVECEXP (expr, 0, 0)
10104 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10105 plus_constant (Pmode, stack_pointer_rtx, -size));
10106 XVECEXP (expr, 0, 1)
10107 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10108 plus_constant (Pmode, stack_pointer_rtx,
10109 PROBE_INTERVAL + dope + size));
10110 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10111 RTX_FRAME_RELATED_P (last) = 1;
10112
10113 cfun->machine->fs.sp_offset += size;
10114 }
10115
10116 /* Make sure nothing is scheduled before we are done. */
10117 emit_insn (gen_blockage ());
10118 }
10119
10120 /* Adjust the stack pointer up to REG while probing it. */
10121
10122 const char *
10123 output_adjust_stack_and_probe (rtx reg)
10124 {
10125 static int labelno = 0;
10126 char loop_lab[32], end_lab[32];
10127 rtx xops[2];
10128
10129 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10130 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10131
10132 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10133
10134 /* Jump to END_LAB if SP == LAST_ADDR. */
10135 xops[0] = stack_pointer_rtx;
10136 xops[1] = reg;
10137 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10138 fputs ("\tje\t", asm_out_file);
10139 assemble_name_raw (asm_out_file, end_lab);
10140 fputc ('\n', asm_out_file);
10141
10142 /* SP = SP + PROBE_INTERVAL. */
10143 xops[1] = GEN_INT (PROBE_INTERVAL);
10144 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10145
10146 /* Probe at SP. */
10147 xops[1] = const0_rtx;
10148 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10149
10150 fprintf (asm_out_file, "\tjmp\t");
10151 assemble_name_raw (asm_out_file, loop_lab);
10152 fputc ('\n', asm_out_file);
10153
10154 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10155
10156 return "";
10157 }
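/* For illustration (32-bit case, label names and scratch register
   arbitrary), the loop emitted above looks roughly like:

       .LPSRL0:
               cmpl    %ecx, %esp
               je      .LPSRE0
               subl    $4096, %esp
               orl     $0, (%esp)
               jmp     .LPSRL0
       .LPSRE0:

   so the stack pointer is moved down one probe interval at a time and
   each newly exposed page is touched with a harmless "or".  */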
10158
10159 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10160 inclusive. These are offsets from the current stack pointer. */
10161
10162 static void
10163 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10164 {
10165 /* See if we have a constant small number of probes to generate. If so,
10166 that's the easy case. The run-time loop is made up of 7 insns in the
10167 generic case while the compile-time loop is made up of n insns for n #
10168 of intervals. */
10169 if (size <= 7 * PROBE_INTERVAL)
10170 {
10171 HOST_WIDE_INT i;
10172
10173 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10174 it exceeds SIZE. If only one probe is needed, this will not
10175 generate any code. Then probe at FIRST + SIZE. */
10176 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10177 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10178 -(first + i)));
10179
10180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10181 -(first + size)));
10182 }
10183
10184 /* Otherwise, do the same as above, but in a loop. Note that we must be
10185 extra careful with variables wrapping around because we might be at
10186 the very top (or the very bottom) of the address space and we have
10187 to be able to handle this case properly; in particular, we use an
10188 equality test for the loop condition. */
10189 else
10190 {
10191 HOST_WIDE_INT rounded_size, last;
10192 struct scratch_reg sr;
10193
10194 get_scratch_register_on_entry (&sr);
10195
10196
10197 /* Step 1: round SIZE to the previous multiple of the interval. */
10198
10199 rounded_size = size & -PROBE_INTERVAL;
10200
10201
10202 /* Step 2: compute initial and final value of the loop counter. */
10203
10204 /* TEST_OFFSET = FIRST. */
10205 emit_move_insn (sr.reg, GEN_INT (-first));
10206
10207 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10208 last = first + rounded_size;
10209
10210
10211 /* Step 3: the loop
10212
10213 while (TEST_ADDR != LAST_ADDR)
10214 {
10215 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10216 probe at TEST_ADDR
10217 }
10218
10219 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10220 until it is equal to ROUNDED_SIZE. */
10221
10222 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10223
10224
10225 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10226 that SIZE is equal to ROUNDED_SIZE. */
10227
10228 if (size != rounded_size)
10229 emit_stack_probe (plus_constant (Pmode,
10230 gen_rtx_PLUS (Pmode,
10231 stack_pointer_rtx,
10232 sr.reg),
10233 rounded_size - size));
10234
10235 release_scratch_register_on_entry (&sr);
10236 }
10237
10238 /* Make sure nothing is scheduled before we are done. */
10239 emit_insn (gen_blockage ());
10240 }
10241
10242 /* Probe a range of stack addresses from REG to END, inclusive. These are
10243 offsets from the current stack pointer. */
10244
10245 const char *
10246 output_probe_stack_range (rtx reg, rtx end)
10247 {
10248 static int labelno = 0;
10249 char loop_lab[32], end_lab[32];
10250 rtx xops[3];
10251
10252 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10253 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10254
10255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10256
10257 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10258 xops[0] = reg;
10259 xops[1] = end;
10260 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10261 fputs ("\tje\t", asm_out_file);
10262 assemble_name_raw (asm_out_file, end_lab);
10263 fputc ('\n', asm_out_file);
10264
10265 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10266 xops[1] = GEN_INT (PROBE_INTERVAL);
10267 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10268
10269 /* Probe at TEST_ADDR. */
10270 xops[0] = stack_pointer_rtx;
10271 xops[1] = reg;
10272 xops[2] = const0_rtx;
10273 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10274
10275 fprintf (asm_out_file, "\tjmp\t");
10276 assemble_name_raw (asm_out_file, loop_lab);
10277 fputc ('\n', asm_out_file);
10278
10279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10280
10281 return "";
10282 }
10283
10284 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10285 to be generated in correct form. */
10286 static void
10287 ix86_finalize_stack_realign_flags (void)
10288 {
10289 /* Check if stack realignment is really needed after reload, and
10290 store the result in cfun. */
10291 unsigned int incoming_stack_boundary
10292 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10293 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10294 unsigned int stack_realign = (incoming_stack_boundary
10295 < (crtl->is_leaf
10296 ? crtl->max_used_stack_slot_alignment
10297 : crtl->stack_alignment_needed));
10298
10299 if (crtl->stack_realign_finalized)
10300 {
10301 /* After stack_realign_needed is finalized, we can no longer
10302 change it. */
10303 gcc_assert (crtl->stack_realign_needed == stack_realign);
10304 return;
10305 }
10306
10307 /* If the only reason for frame_pointer_needed is that we conservatively
10308 assumed stack realignment might be needed, but in the end nothing that
10309 needed the stack alignment had been spilled, clear frame_pointer_needed
10310 and say we don't need stack realignment. */
10311 if (stack_realign
10312 && !crtl->need_drap
10313 && frame_pointer_needed
10314 && crtl->is_leaf
10315 && flag_omit_frame_pointer
10316 && crtl->sp_is_unchanging
10317 && !ix86_current_function_calls_tls_descriptor
10318 && !crtl->accesses_prior_frames
10319 && !cfun->calls_alloca
10320 && !crtl->calls_eh_return
10321 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10322 && !ix86_frame_pointer_required ()
10323 && get_frame_size () == 0
10324 && ix86_nsaved_sseregs () == 0
10325 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10326 {
10327 HARD_REG_SET set_up_by_prologue, prologue_used;
10328 basic_block bb;
10329
10330 CLEAR_HARD_REG_SET (prologue_used);
10331 CLEAR_HARD_REG_SET (set_up_by_prologue);
10332 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10333 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10334 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10335 HARD_FRAME_POINTER_REGNUM);
10336 FOR_EACH_BB (bb)
10337 {
10338 rtx insn;
10339 FOR_BB_INSNS (bb, insn)
10340 if (NONDEBUG_INSN_P (insn)
10341 && requires_stack_frame_p (insn, prologue_used,
10342 set_up_by_prologue))
10343 {
10344 crtl->stack_realign_needed = stack_realign;
10345 crtl->stack_realign_finalized = true;
10346 return;
10347 }
10348 }
10349
10350 frame_pointer_needed = false;
10351 stack_realign = false;
10352 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10353 crtl->stack_alignment_needed = incoming_stack_boundary;
10354 crtl->stack_alignment_estimated = incoming_stack_boundary;
10355 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10356 crtl->preferred_stack_boundary = incoming_stack_boundary;
10357 df_finish_pass (true);
10358 df_scan_alloc (NULL);
10359 df_scan_blocks ();
10360 df_compute_regs_ever_live (true);
10361 df_analyze ();
10362 }
10363
10364 crtl->stack_realign_needed = stack_realign;
10365 crtl->stack_realign_finalized = true;
10366 }
10367
10368 /* Expand the prologue into a bunch of separate insns. */
10369
10370 void
10371 ix86_expand_prologue (void)
10372 {
10373 struct machine_function *m = cfun->machine;
10374 rtx insn, t;
10375 bool pic_reg_used;
10376 struct ix86_frame frame;
10377 HOST_WIDE_INT allocate;
10378 bool int_registers_saved;
10379 bool sse_registers_saved;
10380
10381 ix86_finalize_stack_realign_flags ();
10382
10383 /* DRAP should not coexist with stack_realign_fp */
10384 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10385
10386 memset (&m->fs, 0, sizeof (m->fs));
10387
10388 /* Initialize CFA state for before the prologue. */
10389 m->fs.cfa_reg = stack_pointer_rtx;
10390 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10391
10392 /* Track SP offset to the CFA. We continue tracking this after we've
10393 swapped the CFA register away from SP. In the case of re-alignment
10394 this is fudged; we're interested to offsets within the local frame. */
10395 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10396 m->fs.sp_valid = true;
10397
10398 ix86_compute_frame_layout (&frame);
10399
10400 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10401 {
10402 /* We should have already generated an error for any use of
10403 ms_hook on a nested function. */
10404 gcc_checking_assert (!ix86_static_chain_on_stack);
10405
10406 /* Check if profiling is active and we are to use the profiling-before-
10407 prologue variant. If so, issue a sorry. */
10408 if (crtl->profile && flag_fentry != 0)
10409 sorry ("ms_hook_prologue attribute isn%'t compatible "
10410 "with -mfentry for 32-bit");
10411
10412 /* In ix86_asm_output_function_label we emitted:
10413 8b ff movl.s %edi,%edi
10414 55 push %ebp
10415 8b ec movl.s %esp,%ebp
10416
10417 This matches the hookable function prologue in Win32 API
10418 functions in Microsoft Windows XP Service Pack 2 and newer.
10419 Wine uses this to enable Windows apps to hook the Win32 API
10420 functions provided by Wine.
10421
10422 What that means is that we've already set up the frame pointer. */
10423
10424 if (frame_pointer_needed
10425 && !(crtl->drap_reg && crtl->stack_realign_needed))
10426 {
10427 rtx push, mov;
10428
10429 /* We've decided to use the frame pointer already set up.
10430 Describe this to the unwinder by pretending that both
10431 push and mov insns happen right here.
10432
10433 Putting the unwind info here at the end of the ms_hook
10434 is done so that we can make absolutely certain we get
10435 the required byte sequence at the start of the function,
10436 rather than relying on an assembler that can produce
10437 the exact encoding required.
10438
10439 However it does mean (in the unpatched case) that we have
10440 a 1 insn window where the asynchronous unwind info is
10441 incorrect. However, if we placed the unwind info at
10442 its correct location we would have incorrect unwind info
10443 in the patched case. Which is probably all moot since
10444 I don't expect Wine generates dwarf2 unwind info for the
10445 system libraries that use this feature. */
10446
10447 insn = emit_insn (gen_blockage ());
10448
10449 push = gen_push (hard_frame_pointer_rtx);
10450 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10451 stack_pointer_rtx);
10452 RTX_FRAME_RELATED_P (push) = 1;
10453 RTX_FRAME_RELATED_P (mov) = 1;
10454
10455 RTX_FRAME_RELATED_P (insn) = 1;
10456 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10457 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10458
10459 /* Note that gen_push incremented m->fs.cfa_offset, even
10460 though we didn't emit the push insn here. */
10461 m->fs.cfa_reg = hard_frame_pointer_rtx;
10462 m->fs.fp_offset = m->fs.cfa_offset;
10463 m->fs.fp_valid = true;
10464 }
10465 else
10466 {
10467 /* The frame pointer is not needed so pop %ebp again.
10468 This leaves us with a pristine state. */
10469 emit_insn (gen_pop (hard_frame_pointer_rtx));
10470 }
10471 }
10472
10473 /* The first insn of a function that accepts its static chain on the
10474 stack is to push the register that would be filled in by a direct
10475 call. This insn will be skipped by the trampoline. */
10476 else if (ix86_static_chain_on_stack)
10477 {
10478 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10479 emit_insn (gen_blockage ());
10480
10481 /* We don't want to interpret this push insn as a register save,
10482 only as a stack adjustment. The real copy of the register as
10483 a save will be done later, if needed. */
10484 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10485 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10488 }
10489
10490 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10491 DRAP is needed and stack realignment is really needed after reload. */
10492 if (stack_realign_drap)
10493 {
10494 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10495
10496 /* Only need to push parameter pointer reg if it is caller saved. */
10497 if (!call_used_regs[REGNO (crtl->drap_reg)])
10498 {
10499 /* Push arg pointer reg */
10500 insn = emit_insn (gen_push (crtl->drap_reg));
10501 RTX_FRAME_RELATED_P (insn) = 1;
10502 }
10503
10504 /* Grab the argument pointer. */
10505 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10506 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10507 RTX_FRAME_RELATED_P (insn) = 1;
10508 m->fs.cfa_reg = crtl->drap_reg;
10509 m->fs.cfa_offset = 0;
10510
10511 /* Align the stack. */
10512 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10513 stack_pointer_rtx,
10514 GEN_INT (-align_bytes)));
10515 RTX_FRAME_RELATED_P (insn) = 1;
10516
10517 /* Replicate the return address on the stack so that return
10518 address can be reached via (argp - 1) slot. This is needed
10519 to implement macro RETURN_ADDR_RTX and intrinsic function
10520 expand_builtin_return_addr etc. */
10521 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10522 t = gen_frame_mem (word_mode, t);
10523 insn = emit_insn (gen_push (t));
10524 RTX_FRAME_RELATED_P (insn) = 1;
10525
10526 /* For the purposes of frame and register save area addressing,
10527 we've started over with a new frame. */
10528 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10529 m->fs.realigned = true;
10530 }
10531
10532 int_registers_saved = (frame.nregs == 0);
10533 sse_registers_saved = (frame.nsseregs == 0);
10534
10535 if (frame_pointer_needed && !m->fs.fp_valid)
10536 {
10537 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10538 slower on all targets. Also sdb doesn't like it. */
10539 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10540 RTX_FRAME_RELATED_P (insn) = 1;
10541
10542 /* Push registers now, before setting the frame pointer
10543 on SEH target. */
10544 if (!int_registers_saved
10545 && TARGET_SEH
10546 && !frame.save_regs_using_mov)
10547 {
10548 ix86_emit_save_regs ();
10549 int_registers_saved = true;
10550 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10551 }
10552
10553 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10554 {
10555 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557
10558 if (m->fs.cfa_reg == stack_pointer_rtx)
10559 m->fs.cfa_reg = hard_frame_pointer_rtx;
10560 m->fs.fp_offset = m->fs.sp_offset;
10561 m->fs.fp_valid = true;
10562 }
10563 }
10564
10565 if (!int_registers_saved)
10566 {
10567 /* If saving registers via PUSH, do so now. */
10568 if (!frame.save_regs_using_mov)
10569 {
10570 ix86_emit_save_regs ();
10571 int_registers_saved = true;
10572 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10573 }
10574
10575 /* When using the red zone we may start register saving before allocating
10576 the stack frame, saving one cycle of the prologue. However, avoid
10577 doing this if we have to probe the stack; at least on x86_64 the
10578 stack probe can turn into a call that clobbers a red zone location. */
10579 else if (ix86_using_red_zone ()
10580 && (! TARGET_STACK_PROBE
10581 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10582 {
10583 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10584 int_registers_saved = true;
10585 }
10586 }
10587
10588 if (stack_realign_fp)
10589 {
10590 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10591 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10592
10593 /* The computation of the size of the re-aligned stack frame means
10594 that we must allocate the size of the register save area before
10595 performing the actual alignment. Otherwise we cannot guarantee
10596 that there's enough storage above the realignment point. */
10597 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10598 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10599 GEN_INT (m->fs.sp_offset
10600 - frame.sse_reg_save_offset),
10601 -1, false);
10602
10603 /* Align the stack. */
10604 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10605 stack_pointer_rtx,
10606 GEN_INT (-align_bytes)));
10607
10608 /* For the purposes of register save area addressing, the stack
10609 pointer is no longer valid. As for the value of sp_offset,
10610 see ix86_compute_frame_layout, which we need to match in order
10611 to pass verification of stack_pointer_offset at the end. */
10612 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
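      /* For example, sp_offset == 40 with align_bytes == 32 yields
	 (40 + 32) & -32 == 64, i.e. the offset is advanced to a 32-byte
	 boundary.  */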
10613 m->fs.sp_valid = false;
10614 }
10615
10616 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10617
10618 if (flag_stack_usage_info)
10619 {
10620 /* We start to count from ARG_POINTER. */
10621 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10622
10623 /* If it was realigned, take into account the fake frame. */
10624 if (stack_realign_drap)
10625 {
10626 if (ix86_static_chain_on_stack)
10627 stack_size += UNITS_PER_WORD;
10628
10629 if (!call_used_regs[REGNO (crtl->drap_reg)])
10630 stack_size += UNITS_PER_WORD;
10631
10632 /* This over-estimates by 1 minimal-stack-alignment-unit but
10633 mitigates that by counting in the new return address slot. */
10634 current_function_dynamic_stack_size
10635 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10636 }
10637
10638 current_function_static_stack_size = stack_size;
10639 }
10640
10641 /* On SEH target with very large frame size, allocate an area to save
10642 SSE registers (as the very large allocation won't be described). */
10643 if (TARGET_SEH
10644 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10645 && !sse_registers_saved)
10646 {
10647 HOST_WIDE_INT sse_size =
10648 frame.sse_reg_save_offset - frame.reg_save_offset;
10649
10650 gcc_assert (int_registers_saved);
10651
10652 /* No need to do stack checking as the area will be immediately
10653 written. */
10654 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10655 GEN_INT (-sse_size), -1,
10656 m->fs.cfa_reg == stack_pointer_rtx);
10657 allocate -= sse_size;
10658 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10659 sse_registers_saved = true;
10660 }
10661
10662 /* The stack has already been decremented by the instruction calling us
10663 so probe if the size is non-negative to preserve the protection area. */
10664 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10665 {
10666 /* We expect the registers to be saved when probes are used. */
10667 gcc_assert (int_registers_saved);
10668
10669 if (STACK_CHECK_MOVING_SP)
10670 {
10671 ix86_adjust_stack_and_probe (allocate);
10672 allocate = 0;
10673 }
10674 else
10675 {
10676 HOST_WIDE_INT size = allocate;
10677
10678 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10679 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10680
10681 if (TARGET_STACK_PROBE)
10682 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10683 else
10684 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10685 }
10686 }
10687
10688 if (allocate == 0)
10689 ;
10690 else if (!ix86_target_stack_probe ()
10691 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10692 {
10693 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10694 GEN_INT (-allocate), -1,
10695 m->fs.cfa_reg == stack_pointer_rtx);
10696 }
10697 else
10698 {
10699 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10700 rtx r10 = NULL;
10701 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10702
10703 bool eax_live = false;
10704 bool r10_live = false;
10705
10706 if (TARGET_64BIT)
10707 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10708 if (!TARGET_64BIT_MS_ABI)
10709 eax_live = ix86_eax_live_at_start_p ();
10710
10711 if (eax_live)
10712 {
10713 emit_insn (gen_push (eax));
10714 allocate -= UNITS_PER_WORD;
10715 }
10716 if (r10_live)
10717 {
10718 r10 = gen_rtx_REG (Pmode, R10_REG);
10719 emit_insn (gen_push (r10));
10720 allocate -= UNITS_PER_WORD;
10721 }
10722
10723 emit_move_insn (eax, GEN_INT (allocate));
10724 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10725
10726 /* Use the fact that AX still contains ALLOCATE. */
10727 adjust_stack_insn = (Pmode == DImode
10728 ? gen_pro_epilogue_adjust_stack_di_sub
10729 : gen_pro_epilogue_adjust_stack_si_sub);
10730
10731 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10732 stack_pointer_rtx, eax));
10733
10734 /* Note that SEH directives need to continue tracking the stack
10735 pointer even after the frame pointer has been set up. */
10736 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10737 {
10738 if (m->fs.cfa_reg == stack_pointer_rtx)
10739 m->fs.cfa_offset += allocate;
10740
10741 RTX_FRAME_RELATED_P (insn) = 1;
10742 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10743 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10744 plus_constant (Pmode, stack_pointer_rtx,
10745 -allocate)));
10746 }
10747 m->fs.sp_offset += allocate;
10748
10749 if (r10_live && eax_live)
10750 {
10751 t = choose_baseaddr (m->fs.sp_offset - allocate);
10752 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10753 gen_frame_mem (word_mode, t));
10754 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10755 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10756 gen_frame_mem (word_mode, t));
10757 }
10758 else if (eax_live || r10_live)
10759 {
10760 t = choose_baseaddr (m->fs.sp_offset - allocate);
10761 emit_move_insn (gen_rtx_REG (word_mode,
10762 (eax_live ? AX_REG : R10_REG)),
10763 gen_frame_mem (word_mode, t));
10764 }
10765 }
10766 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10767
10768 /* If we haven't already set up the frame pointer, do so now. */
10769 if (frame_pointer_needed && !m->fs.fp_valid)
10770 {
10771 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10772 GEN_INT (frame.stack_pointer_offset
10773 - frame.hard_frame_pointer_offset));
10774 insn = emit_insn (insn);
10775 RTX_FRAME_RELATED_P (insn) = 1;
10776 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10777
10778 if (m->fs.cfa_reg == stack_pointer_rtx)
10779 m->fs.cfa_reg = hard_frame_pointer_rtx;
10780 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10781 m->fs.fp_valid = true;
10782 }
10783
10784 if (!int_registers_saved)
10785 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10786 if (!sse_registers_saved)
10787 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10788
10789 pic_reg_used = false;
10790 if (pic_offset_table_rtx
10791 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10792 || crtl->profile))
10793 {
10794 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10795
10796 if (alt_pic_reg_used != INVALID_REGNUM)
10797 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10798
10799 pic_reg_used = true;
10800 }
10801
10802 if (pic_reg_used)
10803 {
10804 if (TARGET_64BIT)
10805 {
10806 if (ix86_cmodel == CM_LARGE_PIC)
10807 {
10808 rtx label, tmp_reg;
10809
10810 gcc_assert (Pmode == DImode);
10811 label = gen_label_rtx ();
10812 emit_label (label);
10813 LABEL_PRESERVE_P (label) = 1;
10814 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10815 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10816 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10817 label));
10818 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10819 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10820 pic_offset_table_rtx, tmp_reg));
10821 }
10822 else
10823 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10824 }
10825 else
10826 {
10827 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10828 RTX_FRAME_RELATED_P (insn) = 1;
10829 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10830 }
10831 }
10832
10833 /* In the pic_reg_used case, make sure that the got load isn't deleted
10834 when mcount needs it. Blockage to avoid call movement across mcount
10835 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10836 note. */
10837 if (crtl->profile && !flag_fentry && pic_reg_used)
10838 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10839
10840 if (crtl->drap_reg && !crtl->stack_realign_needed)
10841 {
10842 /* vDRAP is set up, but after reload it turns out stack realignment
10843 isn't necessary; here we emit the prologue to set up DRAP
10844 without the stack realignment adjustment. */
10845 t = choose_baseaddr (0);
10846 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10847 }
10848
10849 /* Prevent instructions from being scheduled into register save push
10850 sequence when access to the redzone area is done through frame pointer.
10851 The offset between the frame pointer and the stack pointer is calculated
10852 relative to the value of the stack pointer at the end of the function
10853 prologue, and moving instructions that access redzone area via frame
10854 pointer inside push sequence violates this assumption. */
10855 if (frame_pointer_needed && frame.red_zone_size)
10856 emit_insn (gen_memory_blockage ());
10857
10858 /* Emit cld instruction if stringops are used in the function. */
10859 if (TARGET_CLD && ix86_current_function_needs_cld)
10860 emit_insn (gen_cld ());
10861
10862 /* SEH requires that the prologue end within 256 bytes of the start of
10863 the function. Prevent instruction schedules that would extend that.
10864 Further, prevent alloca modifications to the stack pointer from being
10865 combined with prologue modifications. */
10866 if (TARGET_SEH)
10867 emit_insn (gen_prologue_use (stack_pointer_rtx));
10868 }
10869
10870 /* Emit code to restore REG using a POP insn. */
10871
10872 static void
10873 ix86_emit_restore_reg_using_pop (rtx reg)
10874 {
10875 struct machine_function *m = cfun->machine;
10876 rtx insn = emit_insn (gen_pop (reg));
10877
10878 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10879 m->fs.sp_offset -= UNITS_PER_WORD;
10880
10881 if (m->fs.cfa_reg == crtl->drap_reg
10882 && REGNO (reg) == REGNO (crtl->drap_reg))
10883 {
10884 /* Previously we'd represented the CFA as an expression
10885 like *(%ebp - 8). We've just popped that value from
10886 the stack, which means we need to reset the CFA to
10887 the drap register. This will remain until we restore
10888 the stack pointer. */
10889 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10890 RTX_FRAME_RELATED_P (insn) = 1;
10891
10892 /* This means that the DRAP register is valid for addressing too. */
10893 m->fs.drap_valid = true;
10894 return;
10895 }
10896
10897 if (m->fs.cfa_reg == stack_pointer_rtx)
10898 {
10899 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10900 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10901 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10902 RTX_FRAME_RELATED_P (insn) = 1;
10903
10904 m->fs.cfa_offset -= UNITS_PER_WORD;
10905 }
10906
10907 /* When the frame pointer is the CFA, and we pop it, we are
10908 swapping back to the stack pointer as the CFA. This happens
10909 for stack frames that don't allocate other data, so we assume
10910 the stack pointer is now pointing at the return address, i.e.
10911 the function entry state, which makes the offset be 1 word. */
10912 if (reg == hard_frame_pointer_rtx)
10913 {
10914 m->fs.fp_valid = false;
10915 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10916 {
10917 m->fs.cfa_reg = stack_pointer_rtx;
10918 m->fs.cfa_offset -= UNITS_PER_WORD;
10919
10920 add_reg_note (insn, REG_CFA_DEF_CFA,
10921 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10922 GEN_INT (m->fs.cfa_offset)));
10923 RTX_FRAME_RELATED_P (insn) = 1;
10924 }
10925 }
10926 }
10927
10928 /* Emit code to restore saved registers using POP insns. */
10929
10930 static void
10931 ix86_emit_restore_regs_using_pop (void)
10932 {
10933 unsigned int regno;
10934
10935 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10936 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10937 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10938 }
10939
10940 /* Emit code and notes for the LEAVE instruction. */
10941
10942 static void
10943 ix86_emit_leave (void)
10944 {
10945 struct machine_function *m = cfun->machine;
10946 rtx insn = emit_insn (ix86_gen_leave ());
10947
10948 ix86_add_queued_cfa_restore_notes (insn);
10949
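  /* leave behaves like "mov %ebp, %esp; pop %ebp" (or the 64-bit
     equivalent), so afterwards the stack pointer sits one word above the
     slot that held the saved frame pointer; the frame-state updates below
     reflect that.  */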
10950 gcc_assert (m->fs.fp_valid);
10951 m->fs.sp_valid = true;
10952 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10953 m->fs.fp_valid = false;
10954
10955 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10956 {
10957 m->fs.cfa_reg = stack_pointer_rtx;
10958 m->fs.cfa_offset = m->fs.sp_offset;
10959
10960 add_reg_note (insn, REG_CFA_DEF_CFA,
10961 plus_constant (Pmode, stack_pointer_rtx,
10962 m->fs.sp_offset));
10963 RTX_FRAME_RELATED_P (insn) = 1;
10964 }
10965 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10966 m->fs.fp_offset);
10967 }
10968
10969 /* Emit code to restore saved registers using MOV insns.
10970 First register is restored from CFA - CFA_OFFSET. */
10971 static void
10972 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10973 bool maybe_eh_return)
10974 {
10975 struct machine_function *m = cfun->machine;
10976 unsigned int regno;
10977
10978 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10979 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10980 {
10981 rtx reg = gen_rtx_REG (word_mode, regno);
10982 rtx insn, mem;
10983
10984 mem = choose_baseaddr (cfa_offset);
10985 mem = gen_frame_mem (word_mode, mem);
10986 insn = emit_move_insn (reg, mem);
10987
10988 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10989 {
10990 /* Previously we'd represented the CFA as an expression
10991 like *(%ebp - 8). We've just popped that value from
10992 the stack, which means we need to reset the CFA to
10993 the drap register. This will remain until we restore
10994 the stack pointer. */
10995 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10996 RTX_FRAME_RELATED_P (insn) = 1;
10997
10998 /* This means that the DRAP register is valid for addressing. */
10999 m->fs.drap_valid = true;
11000 }
11001 else
11002 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11003
11004 cfa_offset -= UNITS_PER_WORD;
11005 }
11006 }
11007
11008 /* Emit code to restore saved SSE registers using MOV insns.
11009 First register is restored from CFA - CFA_OFFSET. */
11010 static void
11011 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11012 bool maybe_eh_return)
11013 {
11014 unsigned int regno;
11015
11016 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11017 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11018 {
11019 rtx reg = gen_rtx_REG (V4SFmode, regno);
11020 rtx mem;
11021
11022 mem = choose_baseaddr (cfa_offset);
11023 mem = gen_rtx_MEM (V4SFmode, mem);
11024 set_mem_align (mem, 128);
11025 emit_move_insn (reg, mem);
11026
11027 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11028
11029 cfa_offset -= 16;
11030 }
11031 }
11032
11033 /* Emit vzeroupper if needed. */
11034
11035 void
11036 ix86_maybe_emit_epilogue_vzeroupper (void)
11037 {
11038 if (TARGET_VZEROUPPER
11039 && !TREE_THIS_VOLATILE (cfun->decl)
11040 && !cfun->machine->caller_return_avx256_p)
11041 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
11042 }
11043
11044 /* Restore function stack, frame, and registers. */
11045
11046 void
11047 ix86_expand_epilogue (int style)
11048 {
11049 struct machine_function *m = cfun->machine;
11050 struct machine_frame_state frame_state_save = m->fs;
11051 struct ix86_frame frame;
11052 bool restore_regs_via_mov;
11053 bool using_drap;
11054
11055 ix86_finalize_stack_realign_flags ();
11056 ix86_compute_frame_layout (&frame);
11057
11058 m->fs.sp_valid = (!frame_pointer_needed
11059 || (crtl->sp_is_unchanging
11060 && !stack_realign_fp));
11061 gcc_assert (!m->fs.sp_valid
11062 || m->fs.sp_offset == frame.stack_pointer_offset);
11063
11064 /* The FP must be valid if the frame pointer is present. */
11065 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11066 gcc_assert (!m->fs.fp_valid
11067 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11068
11069 /* We must have *some* valid pointer to the stack frame. */
11070 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11071
11072 /* The DRAP is never valid at this point. */
11073 gcc_assert (!m->fs.drap_valid);
11074
11075 /* See the comment about red zone and frame
11076 pointer usage in ix86_expand_prologue. */
11077 if (frame_pointer_needed && frame.red_zone_size)
11078 emit_insn (gen_memory_blockage ());
11079
11080 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11081 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11082
11083 /* Determine the CFA offset of the end of the red-zone. */
11084 m->fs.red_zone_offset = 0;
11085 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11086 {
11087 /* The red-zone begins below the return address. */
11088 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
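      /* (For the 64-bit SysV ABI, RED_ZONE_SIZE is 128 bytes, so this is
	 typically an offset of 136.)  */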
11089
11090 /* When the register save area is in the aligned portion of
11091 the stack, determine the maximum runtime displacement that
11092 matches up with the aligned frame. */
11093 if (stack_realign_drap)
11094 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11095 + UNITS_PER_WORD);
11096 }
11097
11098 /* Special care must be taken for the normal return case of a function
11099 using eh_return: the eax and edx registers are marked as saved, but
11100 not restored along this path. Adjust the save location to match. */
11101 if (crtl->calls_eh_return && style != 2)
11102 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11103
11104 /* EH_RETURN requires the use of moves to function properly. */
11105 if (crtl->calls_eh_return)
11106 restore_regs_via_mov = true;
11107 /* SEH requires the use of pops to identify the epilogue. */
11108 else if (TARGET_SEH)
11109 restore_regs_via_mov = false;
11110 /* If we're only restoring one register and sp is not valid, then
11111 use a move instruction to restore the register, since it's
11112 less work than reloading sp and popping the register. */
11113 else if (!m->fs.sp_valid && frame.nregs <= 1)
11114 restore_regs_via_mov = true;
11115 else if (TARGET_EPILOGUE_USING_MOVE
11116 && cfun->machine->use_fast_prologue_epilogue
11117 && (frame.nregs > 1
11118 || m->fs.sp_offset != frame.reg_save_offset))
11119 restore_regs_via_mov = true;
11120 else if (frame_pointer_needed
11121 && !frame.nregs
11122 && m->fs.sp_offset != frame.reg_save_offset)
11123 restore_regs_via_mov = true;
11124 else if (frame_pointer_needed
11125 && TARGET_USE_LEAVE
11126 && cfun->machine->use_fast_prologue_epilogue
11127 && frame.nregs == 1)
11128 restore_regs_via_mov = true;
11129 else
11130 restore_regs_via_mov = false;
11131
11132 if (restore_regs_via_mov || frame.nsseregs)
11133 {
11134 /* Ensure that the entire register save area is addressable via
11135 the stack pointer, if we will restore via sp. */
11136 if (TARGET_64BIT
11137 && m->fs.sp_offset > 0x7fffffff
11138 && !(m->fs.fp_valid || m->fs.drap_valid)
11139 && (frame.nsseregs + frame.nregs) != 0)
11140 {
11141 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11142 GEN_INT (m->fs.sp_offset
11143 - frame.sse_reg_save_offset),
11144 style,
11145 m->fs.cfa_reg == stack_pointer_rtx);
11146 }
11147 }
11148
11149 /* If there are any SSE registers to restore, then we have to do it
11150 via moves, since there's obviously no pop for SSE regs. */
11151 if (frame.nsseregs)
11152 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11153 style == 2);
11154
11155 if (restore_regs_via_mov)
11156 {
11157 rtx t;
11158
11159 if (frame.nregs)
11160 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11161
11162 /* eh_return epilogues need %ecx added to the stack pointer. */
11163 if (style == 2)
11164 {
11165 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11166
11167 /* Stack align doesn't work with eh_return. */
11168 gcc_assert (!stack_realign_drap);
11169 /* Neither do regparm nested functions. */
11170 gcc_assert (!ix86_static_chain_on_stack);
11171
11172 if (frame_pointer_needed)
11173 {
11174 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11175 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11176 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11177
11178 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11179 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11180
11181 /* Note that we use SA as a temporary CFA, as the return
11182 address is at the proper place relative to it. We
11183 pretend this happens at the FP restore insn because
11184 prior to this insn the FP would be stored at the wrong
11185 offset relative to SA, and after this insn we have no
11186 other reasonable register to use for the CFA. We don't
11187 bother resetting the CFA to the SP for the duration of
11188 the return insn. */
11189 add_reg_note (insn, REG_CFA_DEF_CFA,
11190 plus_constant (Pmode, sa, UNITS_PER_WORD));
11191 ix86_add_queued_cfa_restore_notes (insn);
11192 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11193 RTX_FRAME_RELATED_P (insn) = 1;
11194
11195 m->fs.cfa_reg = sa;
11196 m->fs.cfa_offset = UNITS_PER_WORD;
11197 m->fs.fp_valid = false;
11198
11199 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11200 const0_rtx, style, false);
11201 }
11202 else
11203 {
11204 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11205 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11206 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11207 ix86_add_queued_cfa_restore_notes (insn);
11208
11209 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11210 if (m->fs.cfa_offset != UNITS_PER_WORD)
11211 {
11212 m->fs.cfa_offset = UNITS_PER_WORD;
11213 add_reg_note (insn, REG_CFA_DEF_CFA,
11214 plus_constant (Pmode, stack_pointer_rtx,
11215 UNITS_PER_WORD));
11216 RTX_FRAME_RELATED_P (insn) = 1;
11217 }
11218 }
11219 m->fs.sp_offset = UNITS_PER_WORD;
11220 m->fs.sp_valid = true;
11221 }
11222 }
11223 else
11224 {
11225 /* SEH requires that the function end with (1) a stack adjustment
11226 if necessary, (2) a sequence of pops, and (3) a return or
11227 jump instruction. Prevent insns from the function body from
11228 being scheduled into this sequence. */
11229 if (TARGET_SEH)
11230 {
11231 /* Prevent a catch region from being adjacent to the standard
11232 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11233 several other flags that would be interesting to test are
11234 set up yet. */
11235 if (flag_non_call_exceptions)
11236 emit_insn (gen_nops (const1_rtx));
11237 else
11238 emit_insn (gen_blockage ());
11239 }
11240
11241 /* First step is to deallocate the stack frame so that we can
11242 pop the registers. Also do it on SEH target for very large
11243 frame as the emitted instructions aren't allowed by the ABI in
11244 epilogues. */
11245 if (!m->fs.sp_valid
11246 || (TARGET_SEH
11247 && (m->fs.sp_offset - frame.reg_save_offset
11248 >= SEH_MAX_FRAME_SIZE)))
11249 {
11250 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11251 GEN_INT (m->fs.fp_offset
11252 - frame.reg_save_offset),
11253 style, false);
11254 }
11255 else if (m->fs.sp_offset != frame.reg_save_offset)
11256 {
11257 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11258 GEN_INT (m->fs.sp_offset
11259 - frame.reg_save_offset),
11260 style,
11261 m->fs.cfa_reg == stack_pointer_rtx);
11262 }
11263
11264 ix86_emit_restore_regs_using_pop ();
11265 }
11266
11267 /* If we used a frame pointer and haven't already got rid of it,
11268 then do so now. */
11269 if (m->fs.fp_valid)
11270 {
11271 /* If the stack pointer is valid and pointing at the frame
11272 pointer store address, then we only need a pop. */
11273 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11274 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11275 /* Leave results in shorter dependency chains on CPUs that are
11276 able to grok it fast. */
11277 else if (TARGET_USE_LEAVE
11278 || optimize_function_for_size_p (cfun)
11279 || !cfun->machine->use_fast_prologue_epilogue)
11280 ix86_emit_leave ();
11281 else
11282 {
11283 pro_epilogue_adjust_stack (stack_pointer_rtx,
11284 hard_frame_pointer_rtx,
11285 const0_rtx, style, !using_drap);
11286 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11287 }
11288 }
11289
11290 if (using_drap)
11291 {
11292 int param_ptr_offset = UNITS_PER_WORD;
11293 rtx insn;
11294
11295 gcc_assert (stack_realign_drap);
11296
11297 if (ix86_static_chain_on_stack)
11298 param_ptr_offset += UNITS_PER_WORD;
11299 if (!call_used_regs[REGNO (crtl->drap_reg)])
11300 param_ptr_offset += UNITS_PER_WORD;
11301
11302 insn = emit_insn (gen_rtx_SET
11303 (VOIDmode, stack_pointer_rtx,
11304 gen_rtx_PLUS (Pmode,
11305 crtl->drap_reg,
11306 GEN_INT (-param_ptr_offset))));
11307 m->fs.cfa_reg = stack_pointer_rtx;
11308 m->fs.cfa_offset = param_ptr_offset;
11309 m->fs.sp_offset = param_ptr_offset;
11310 m->fs.realigned = false;
11311
11312 add_reg_note (insn, REG_CFA_DEF_CFA,
11313 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11314 GEN_INT (param_ptr_offset)));
11315 RTX_FRAME_RELATED_P (insn) = 1;
11316
11317 if (!call_used_regs[REGNO (crtl->drap_reg)])
11318 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11319 }
11320
11321 /* At this point the stack pointer must be valid, and we must have
11322 restored all of the registers. We may not have deallocated the
11323 entire stack frame. We've delayed this until now because it may
11324 be possible to merge the local stack deallocation with the
11325 deallocation forced by ix86_static_chain_on_stack. */
11326 gcc_assert (m->fs.sp_valid);
11327 gcc_assert (!m->fs.fp_valid);
11328 gcc_assert (!m->fs.realigned);
11329 if (m->fs.sp_offset != UNITS_PER_WORD)
11330 {
11331 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11332 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11333 style, true);
11334 }
11335 else
11336 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11337
11338 /* Sibcall epilogues don't want a return instruction. */
11339 if (style == 0)
11340 {
11341 m->fs = frame_state_save;
11342 return;
11343 }
11344
11345 /* Emit vzeroupper if needed. */
11346 ix86_maybe_emit_epilogue_vzeroupper ();
11347
11348 if (crtl->args.pops_args && crtl->args.size)
11349 {
11350 rtx popc = GEN_INT (crtl->args.pops_args);
11351
11352 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11353 address, do explicit add, and jump indirectly to the caller. */
11354
11355 if (crtl->args.pops_args >= 65536)
11356 {
11357 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11358 rtx insn;
11359
11360 /* There is no "pascal" calling convention in any 64bit ABI. */
11361 gcc_assert (!TARGET_64BIT);
11362
11363 insn = emit_insn (gen_pop (ecx));
11364 m->fs.cfa_offset -= UNITS_PER_WORD;
11365 m->fs.sp_offset -= UNITS_PER_WORD;
11366
11367 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11368 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11369 add_reg_note (insn, REG_CFA_REGISTER,
11370 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11371 RTX_FRAME_RELATED_P (insn) = 1;
11372
11373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11374 popc, -1, true);
11375 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11376 }
11377 else
11378 emit_jump_insn (gen_simple_return_pop_internal (popc));
11379 }
11380 else
11381 emit_jump_insn (gen_simple_return_internal ());
11382
11383 /* Restore the state back to the state from the prologue,
11384 so that it's correct for the next epilogue. */
11385 m->fs = frame_state_save;
11386 }
11387
11388 /* Reset from the function's potential modifications. */
11389
11390 static void
11391 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11392 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11393 {
11394 if (pic_offset_table_rtx)
11395 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11396 #if TARGET_MACHO
11397 /* Mach-O doesn't support labels at the end of objects, so if
11398 it looks like we might want one, insert a NOP. */
11399 {
11400 rtx insn = get_last_insn ();
11401 rtx deleted_debug_label = NULL_RTX;
11402 while (insn
11403 && NOTE_P (insn)
11404 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11405 {
11406 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11407 notes only, instead set their CODE_LABEL_NUMBER to -1,
11408 otherwise there would be code generation differences
11409 in between -g and -g0. */
11410 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11411 deleted_debug_label = insn;
11412 insn = PREV_INSN (insn);
11413 }
11414 if (insn
11415 && (LABEL_P (insn)
11416 || (NOTE_P (insn)
11417 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11418 fputs ("\tnop\n", file);
11419 else if (deleted_debug_label)
11420 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11421 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11422 CODE_LABEL_NUMBER (insn) = -1;
11423 }
11424 #endif
11425
11426 }
11427
11428 /* Return a scratch register to use in the split stack prologue. The
11429 split stack prologue is used for -fsplit-stack. It is the first
11430 instructions in the function, even before the regular prologue.
11431 The scratch register can be any caller-saved register which is not
11432 used for parameters or for the static chain. */
11433
11434 static unsigned int
11435 split_stack_prologue_scratch_regno (void)
11436 {
11437 if (TARGET_64BIT)
11438 return R11_REG;
11439 else
11440 {
11441 bool is_fastcall;
11442 int regparm;
11443
11444 is_fastcall = (lookup_attribute ("fastcall",
11445 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11446 != NULL);
11447 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11448
11449 if (is_fastcall)
11450 {
11451 if (DECL_STATIC_CHAIN (cfun->decl))
11452 {
11453 sorry ("-fsplit-stack does not support fastcall with "
11454 "nested function");
11455 return INVALID_REGNUM;
11456 }
11457 return AX_REG;
11458 }
11459 else if (regparm < 3)
11460 {
11461 if (!DECL_STATIC_CHAIN (cfun->decl))
11462 return CX_REG;
11463 else
11464 {
11465 if (regparm >= 2)
11466 {
11467 sorry ("-fsplit-stack does not support 2 register "
11468 " parameters for a nested function");
11469 return INVALID_REGNUM;
11470 }
11471 return DX_REG;
11472 }
11473 }
11474 else
11475 {
11476 /* FIXME: We could make this work by pushing a register
11477 around the addition and comparison. */
11478 sorry ("-fsplit-stack does not support 3 register parameters");
11479 return INVALID_REGNUM;
11480 }
11481 }
11482 }
11483
11484 /* A SYMBOL_REF for the function which allocates new stackspace for
11485 -fsplit-stack. */
11486
11487 static GTY(()) rtx split_stack_fn;
11488
11489 /* A SYMBOL_REF for the more stack function when using the large
11490 model. */
11491
11492 static GTY(()) rtx split_stack_fn_large;
11493
11494 /* Handle -fsplit-stack. These are the first instructions in the
11495 function, even before the regular prologue. */
11496
11497 void
11498 ix86_expand_split_stack_prologue (void)
11499 {
11500 struct ix86_frame frame;
11501 HOST_WIDE_INT allocate;
11502 unsigned HOST_WIDE_INT args_size;
11503 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11504 rtx scratch_reg = NULL_RTX;
11505 rtx varargs_label = NULL_RTX;
11506 rtx fn;
11507
11508 gcc_assert (flag_split_stack && reload_completed);
11509
11510 ix86_finalize_stack_realign_flags ();
11511 ix86_compute_frame_layout (&frame);
11512 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11513
11514 /* This is the label we will branch to if we have enough stack
11515 space. We expect the basic block reordering pass to reverse this
11516 branch if optimizing, so that we branch in the unlikely case. */
11517 label = gen_label_rtx ();
11518
11519 /* We need to compare the stack pointer minus the frame size with
11520 the stack boundary in the TCB. The stack boundary always gives
11521 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11522 can compare directly. Otherwise we need to do an addition. */
11523
11524 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11525 UNSPEC_STACK_CHECK);
11526 limit = gen_rtx_CONST (Pmode, limit);
11527 limit = gen_rtx_MEM (Pmode, limit);
11528 if (allocate < SPLIT_STACK_AVAILABLE)
11529 current = stack_pointer_rtx;
11530 else
11531 {
11532 unsigned int scratch_regno;
11533 rtx offset;
11534
11535 /* We need a scratch register to hold the stack pointer minus
11536 the required frame size. Since this is the very start of the
11537 function, the scratch register can be any caller-saved
11538 register which is not used for parameters. */
11539 offset = GEN_INT (- allocate);
11540 scratch_regno = split_stack_prologue_scratch_regno ();
11541 if (scratch_regno == INVALID_REGNUM)
11542 return;
11543 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11544 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11545 {
11546 /* We don't use ix86_gen_add3 in this case because it will
11547 want to split to lea, but when not optimizing the insn
11548 will not be split after this point. */
11549 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11550 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11551 offset)));
11552 }
11553 else
11554 {
11555 emit_move_insn (scratch_reg, offset);
11556 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11557 stack_pointer_rtx));
11558 }
11559 current = scratch_reg;
11560 }
11561
11562 ix86_expand_branch (GEU, current, limit, label);
11563 jump_insn = get_last_insn ();
11564 JUMP_LABEL (jump_insn) = label;
11565
11566 /* Mark the jump as very likely to be taken. */
11567 add_reg_note (jump_insn, REG_BR_PROB,
11568 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
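  /* (With REG_BR_PROB_BASE == 10000 this probability is 9900, i.e. the
     branch is predicted taken 99% of the time.)  */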
11569
11570 if (split_stack_fn == NULL_RTX)
11571 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11572 fn = split_stack_fn;
11573
11574 /* Get more stack space. We pass in the desired stack space and the
11575 size of the arguments to copy to the new stack. In 32-bit mode
11576 we push the parameters; __morestack will return on a new stack
11577 anyhow. In 64-bit mode we pass the parameters in r10 and
11578 r11. */
11579 allocate_rtx = GEN_INT (allocate);
11580 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11581 call_fusage = NULL_RTX;
11582 if (TARGET_64BIT)
11583 {
11584 rtx reg10, reg11;
11585
11586 reg10 = gen_rtx_REG (Pmode, R10_REG);
11587 reg11 = gen_rtx_REG (Pmode, R11_REG);
11588
11589 /* If this function uses a static chain, it will be in %r10.
11590 Preserve it across the call to __morestack. */
11591 if (DECL_STATIC_CHAIN (cfun->decl))
11592 {
11593 rtx rax;
11594
11595 rax = gen_rtx_REG (word_mode, AX_REG);
11596 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11597 use_reg (&call_fusage, rax);
11598 }
11599
11600 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11601 {
11602 HOST_WIDE_INT argval;
11603
11604 gcc_assert (Pmode == DImode);
11605 /* When using the large model we need to load the address
11606 into a register, and we've run out of registers. So we
11607 switch to a different calling convention, and we call a
11608 different function: __morestack_large. We pass the
11609 argument size in the upper 32 bits of r10 and pass the
11610 frame size in the lower 32 bits. */
11611 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11612 gcc_assert ((args_size & 0xffffffff) == args_size);
11613
11614 if (split_stack_fn_large == NULL_RTX)
11615 split_stack_fn_large =
11616 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11617
11618 if (ix86_cmodel == CM_LARGE_PIC)
11619 {
11620 rtx label, x;
11621
11622 label = gen_label_rtx ();
11623 emit_label (label);
11624 LABEL_PRESERVE_P (label) = 1;
11625 emit_insn (gen_set_rip_rex64 (reg10, label));
11626 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11627 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11628 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11629 UNSPEC_GOT);
11630 x = gen_rtx_CONST (Pmode, x);
11631 emit_move_insn (reg11, x);
11632 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11633 x = gen_const_mem (Pmode, x);
11634 emit_move_insn (reg11, x);
11635 }
11636 else
11637 emit_move_insn (reg11, split_stack_fn_large);
11638
11639 fn = reg11;
11640
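	  /* Pack the argument size into the upper 32 bits and the frame size
	     into the lower 32 bits; e.g. args_size == 0x18 and
	     allocate == 0x200 give argval == 0x0000001800000200.  The two
	     16-bit shifts are presumably just a portable way of writing a
	     shift by 32.  */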
11641 argval = ((args_size << 16) << 16) + allocate;
11642 emit_move_insn (reg10, GEN_INT (argval));
11643 }
11644 else
11645 {
11646 emit_move_insn (reg10, allocate_rtx);
11647 emit_move_insn (reg11, GEN_INT (args_size));
11648 use_reg (&call_fusage, reg11);
11649 }
11650
11651 use_reg (&call_fusage, reg10);
11652 }
11653 else
11654 {
11655 emit_insn (gen_push (GEN_INT (args_size)));
11656 emit_insn (gen_push (allocate_rtx));
11657 }
11658 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11659 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11660 NULL_RTX, false);
11661 add_function_usage_to (call_insn, call_fusage);
11662
11663 /* In order to make call/return prediction work right, we now need
11664 to execute a return instruction. See
11665 libgcc/config/i386/morestack.S for the details on how this works.
11666
11667 For flow purposes gcc must not see this as a return
11668 instruction--we need control flow to continue at the subsequent
11669 label. Therefore, we use an unspec. */
11670 gcc_assert (crtl->args.pops_args < 65536);
11671 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11672
11673 /* If we are in 64-bit mode and this function uses a static chain,
11674 we saved %r10 in %rax before calling _morestack. */
11675 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11676 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11677 gen_rtx_REG (word_mode, AX_REG));
11678
11679 /* If this function calls va_start, we need to store a pointer to
11680 the arguments on the old stack, because they may not have been
11681 all copied to the new stack. At this point the old stack can be
11682 found at the frame pointer value used by __morestack, because
11683 __morestack has set that up before calling back to us. Here we
11684 store that pointer in a scratch register, and in
11685 ix86_expand_prologue we store the scratch register in a stack
11686 slot. */
11687 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11688 {
11689 unsigned int scratch_regno;
11690 rtx frame_reg;
11691 int words;
11692
11693 scratch_regno = split_stack_prologue_scratch_regno ();
11694 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11695 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11696
11697 /* 64-bit:
11698 fp -> old fp value
11699 return address within this function
11700 return address of caller of this function
11701 stack arguments
11702 So we add three words to get to the stack arguments.
11703
11704 32-bit:
11705 fp -> old fp value
11706 return address within this function
11707 first argument to __morestack
11708 second argument to __morestack
11709 return address of caller of this function
11710 stack arguments
11711 So we add five words to get to the stack arguments.
11712 */
11713 words = TARGET_64BIT ? 3 : 5;
11714 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11715 gen_rtx_PLUS (Pmode, frame_reg,
11716 GEN_INT (words * UNITS_PER_WORD))));
11717
11718 varargs_label = gen_label_rtx ();
11719 emit_jump_insn (gen_jump (varargs_label));
11720 JUMP_LABEL (get_last_insn ()) = varargs_label;
11721
11722 emit_barrier ();
11723 }
11724
11725 emit_label (label);
11726 LABEL_NUSES (label) = 1;
11727
11728 /* If this function calls va_start, we now have to set the scratch
11729 register for the case where we do not call __morestack. In this
11730 case we need to set it based on the stack pointer. */
11731 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11732 {
11733 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11734 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11735 GEN_INT (UNITS_PER_WORD))));
11736
11737 emit_label (varargs_label);
11738 LABEL_NUSES (varargs_label) = 1;
11739 }
11740 }
11741
11742 /* We may have to tell the dataflow pass that the split stack prologue
11743 is initializing a scratch register. */
11744
11745 static void
11746 ix86_live_on_entry (bitmap regs)
11747 {
11748 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11749 {
11750 gcc_assert (flag_split_stack);
11751 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11752 }
11753 }
11754 \f
11755 /* Determine if op is a suitable SUBREG RTX for an address. */
11756
11757 static bool
11758 ix86_address_subreg_operand (rtx op)
11759 {
11760 enum machine_mode mode;
11761
11762 if (!REG_P (op))
11763 return false;
11764
11765 mode = GET_MODE (op);
11766
11767 if (GET_MODE_CLASS (mode) != MODE_INT)
11768 return false;
11769
11770 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11771 failures when the register is one word out of a two word structure. */
11772 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11773 return false;
11774
11775 /* simplify_subreg does not handle stack pointer. */
11776 if (REGNO (op) == STACK_POINTER_REGNUM)
11777 return false;
11778
11779 /* Allow only SUBREGs of non-eliminable hard registers. */
11780 return register_no_elim_operand (op, mode);
11781 }
11782
11783 /* Extract the parts of an RTL expression that is a valid memory address
11784 for an instruction. Return 0 if the structure of the address is
11785 grossly off. Return -1 if the address contains ASHIFT, so it is not
11786 strictly valid, but still used for computing the length of the lea instruction. */
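/* As an illustration, the SImode address 12(%ebx,%eax,4), i.e.
   (plus (plus (mult (reg ax) (const_int 4)) (reg bx)) (const_int 12)),
   decomposes into base = %ebx, index = %eax, scale = 4, disp = 12.  */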
11787
11788 int
11789 ix86_decompose_address (rtx addr, struct ix86_address *out)
11790 {
11791 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11792 rtx base_reg, index_reg;
11793 HOST_WIDE_INT scale = 1;
11794 rtx scale_rtx = NULL_RTX;
11795 rtx tmp;
11796 int retval = 1;
11797 enum ix86_address_seg seg = SEG_DEFAULT;
11798
11799 /* Allow zero-extended SImode addresses;
11800 they will be emitted with addr32 prefix. */
11801 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11802 {
11803 if (GET_CODE (addr) == ZERO_EXTEND
11804 && GET_MODE (XEXP (addr, 0)) == SImode)
11805 {
11806 addr = XEXP (addr, 0);
11807 if (CONST_INT_P (addr))
11808 return 0;
11809 }
11810 else if (GET_CODE (addr) == AND
11811 && const_32bit_mask (XEXP (addr, 1), DImode))
11812 {
11813 addr = XEXP (addr, 0);
11814
11815 /* Adjust SUBREGs. */
11816 if (GET_CODE (addr) == SUBREG
11817 && GET_MODE (SUBREG_REG (addr)) == SImode)
11818 {
11819 addr = SUBREG_REG (addr);
11820 if (CONST_INT_P (addr))
11821 return 0;
11822 }
11823 else if (GET_MODE (addr) == DImode)
11824 {
11825 addr = simplify_gen_subreg (SImode, addr, DImode, 0);
11826 if (addr == NULL_RTX)
11827 return 0;
11828 }
11829 else if (GET_MODE (addr) != VOIDmode)
11830 return 0;
11831 }
11832 }
11833
11834 /* Allow SImode subregs of DImode addresses;
11835 they will be emitted with addr32 prefix. */
11836 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11837 {
11838 if (GET_CODE (addr) == SUBREG
11839 && GET_MODE (SUBREG_REG (addr)) == DImode)
11840 {
11841 addr = SUBREG_REG (addr);
11842 if (CONST_INT_P (addr))
11843 return 0;
11844 }
11845 }
11846
11847 if (REG_P (addr))
11848 base = addr;
11849 else if (GET_CODE (addr) == SUBREG)
11850 {
11851 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11852 base = addr;
11853 else
11854 return 0;
11855 }
11856 else if (GET_CODE (addr) == PLUS)
11857 {
11858 rtx addends[4], op;
11859 int n = 0, i;
11860
11861 op = addr;
11862 do
11863 {
11864 if (n >= 4)
11865 return 0;
11866 addends[n++] = XEXP (op, 1);
11867 op = XEXP (op, 0);
11868 }
11869 while (GET_CODE (op) == PLUS);
11870 if (n >= 4)
11871 return 0;
11872 addends[n] = op;
11873
11874 for (i = n; i >= 0; --i)
11875 {
11876 op = addends[i];
11877 switch (GET_CODE (op))
11878 {
11879 case MULT:
11880 if (index)
11881 return 0;
11882 index = XEXP (op, 0);
11883 scale_rtx = XEXP (op, 1);
11884 break;
11885
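	    /* An ASHIFT count of 0 through 3 corresponds to the scale
	       factors 1, 2, 4 and 8 that the SIB byte can encode; anything
	       larger is rejected below.  */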
11886 case ASHIFT:
11887 if (index)
11888 return 0;
11889 index = XEXP (op, 0);
11890 tmp = XEXP (op, 1);
11891 if (!CONST_INT_P (tmp))
11892 return 0;
11893 scale = INTVAL (tmp);
11894 if ((unsigned HOST_WIDE_INT) scale > 3)
11895 return 0;
11896 scale = 1 << scale;
11897 break;
11898
11899 case ZERO_EXTEND:
11900 op = XEXP (op, 0);
11901 if (GET_CODE (op) != UNSPEC)
11902 return 0;
11903 /* FALLTHRU */
11904
11905 case UNSPEC:
11906 if (XINT (op, 1) == UNSPEC_TP
11907 && TARGET_TLS_DIRECT_SEG_REFS
11908 && seg == SEG_DEFAULT)
11909 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11910 else
11911 return 0;
11912 break;
11913
11914 case SUBREG:
11915 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11916 return 0;
11917 /* FALLTHRU */
11918
11919 case REG:
11920 if (!base)
11921 base = op;
11922 else if (!index)
11923 index = op;
11924 else
11925 return 0;
11926 break;
11927
11928 case CONST:
11929 case CONST_INT:
11930 case SYMBOL_REF:
11931 case LABEL_REF:
11932 if (disp)
11933 return 0;
11934 disp = op;
11935 break;
11936
11937 default:
11938 return 0;
11939 }
11940 }
11941 }
11942 else if (GET_CODE (addr) == MULT)
11943 {
11944 index = XEXP (addr, 0); /* index*scale */
11945 scale_rtx = XEXP (addr, 1);
11946 }
11947 else if (GET_CODE (addr) == ASHIFT)
11948 {
11949 /* We're called for lea too, which implements ashift on occasion. */
11950 index = XEXP (addr, 0);
11951 tmp = XEXP (addr, 1);
11952 if (!CONST_INT_P (tmp))
11953 return 0;
11954 scale = INTVAL (tmp);
11955 if ((unsigned HOST_WIDE_INT) scale > 3)
11956 return 0;
11957 scale = 1 << scale;
11958 retval = -1;
11959 }
11960 else if (CONST_INT_P (addr))
11961 {
11962 if (!x86_64_immediate_operand (addr, VOIDmode))
11963 return 0;
11964
11965 /* Constant addresses are sign-extended to 64 bits; we have to
11966 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11967 if (TARGET_X32
11968 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11969 return 0;
11970
11971 disp = addr;
11972 }
11973 else
11974 disp = addr; /* displacement */
11975
11976 if (index)
11977 {
11978 if (REG_P (index))
11979 ;
11980 else if (GET_CODE (index) == SUBREG
11981 && ix86_address_subreg_operand (SUBREG_REG (index)))
11982 ;
11983 else
11984 return 0;
11985 }
11986
11987 /* Address override works only on the (%reg) part of %fs:(%reg). */
11988 if (seg != SEG_DEFAULT
11989 && ((base && GET_MODE (base) != word_mode)
11990 || (index && GET_MODE (index) != word_mode)))
11991 return 0;
11992
11993 /* Extract the integral value of scale. */
11994 if (scale_rtx)
11995 {
11996 if (!CONST_INT_P (scale_rtx))
11997 return 0;
11998 scale = INTVAL (scale_rtx);
11999 }
12000
12001 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12002 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12003
12004 /* Avoid useless 0 displacement. */
12005 if (disp == const0_rtx && (base || index))
12006 disp = NULL_RTX;
12007
12008 /* Allow arg pointer and stack pointer as index if there is no scaling. */
12009 if (base_reg && index_reg && scale == 1
12010 && (index_reg == arg_pointer_rtx
12011 || index_reg == frame_pointer_rtx
12012 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12013 {
12014 rtx tmp;
12015 tmp = base, base = index, index = tmp;
12016 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12017 }
12018
12019 /* Special case: %ebp cannot be encoded as a base without a displacement.
12020 Similarly %r13. */
12021 if (!disp
12022 && base_reg
12023 && (base_reg == hard_frame_pointer_rtx
12024 || base_reg == frame_pointer_rtx
12025 || base_reg == arg_pointer_rtx
12026 || (REG_P (base_reg)
12027 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12028 || REGNO (base_reg) == R13_REG))))
12029 disp = const0_rtx;
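  /* (In the ModRM/SIB encoding, a base of 101b -- %ebp or %r13 -- with
     mod == 00 instead means "disp32 with no base" or RIP-relative
     addressing, so these bases always need at least a zero displacement.)  */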
12030
12031 /* Special case: on K6, [%esi] makes the instruction vector decoded.
12032 Avoid this by transforming to [%esi+0].
12033 Reload calls address legitimization without cfun defined, so we need
12034 to test cfun for being non-NULL. */
12035 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12036 && base_reg && !index_reg && !disp
12037 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12038 disp = const0_rtx;
12039
12040 /* Special case: encode reg+reg instead of reg*2. */
12041 if (!base && index && scale == 2)
12042 base = index, base_reg = index_reg, scale = 1;
12043
12044 /* Special case: scaling cannot be encoded without base or displacement. */
12045 if (!base && !disp && index && scale != 1)
12046 disp = const0_rtx;
12047
12048 out->base = base;
12049 out->index = index;
12050 out->disp = disp;
12051 out->scale = scale;
12052 out->seg = seg;
12053
12054 return retval;
12055 }
12056 \f
12057 /* Return cost of the memory address x.
12058 For i386, it is better to use a complex address than let gcc copy
12059 the address into a reg and make a new pseudo. But not if the address
12060 requires two regs - that would mean more pseudos with longer
12061 lifetimes. */
12062 static int
12063 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12064 addr_space_t as ATTRIBUTE_UNUSED,
12065 bool speed ATTRIBUTE_UNUSED)
12066 {
12067 struct ix86_address parts;
12068 int cost = 1;
12069 int ok = ix86_decompose_address (x, &parts);
12070
12071 gcc_assert (ok);
12072
12073 if (parts.base && GET_CODE (parts.base) == SUBREG)
12074 parts.base = SUBREG_REG (parts.base);
12075 if (parts.index && GET_CODE (parts.index) == SUBREG)
12076 parts.index = SUBREG_REG (parts.index);
12077
12078 /* Attempt to minimize number of registers in the address. */
12079 if ((parts.base
12080 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12081 || (parts.index
12082 && (!REG_P (parts.index)
12083 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12084 cost++;
12085
12086 if (parts.base
12087 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12088 && parts.index
12089 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12090 && parts.base != parts.index)
12091 cost++;
12092
12093 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12094 since its predecode logic can't detect the length of such instructions
12095 and decoding degenerates to the vector decoder. Increase the cost of
12096 such addresses here. The penalty is at least 2 cycles. It may be
12097 worthwhile to split such addresses or even refuse them entirely.
12098 
12099 The following addressing modes are affected:
12100 [base+scale*index]
12101 [scale*index+disp]
12102 [base+index]
12103 
12104 The first and last case may be avoidable by explicitly coding a zero
12105 displacement in the memory address, but I don't have an AMD-K6 machine
12106 handy to check this theory. */
12107
12108 if (TARGET_K6
12109 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12110 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12111 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12112 cost += 10;
12113
12114 return cost;
12115 }
12116 \f
12117 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12118 this is used to form addresses to local data when -fPIC is in
12119 use. */
12120
12121 static bool
12122 darwin_local_data_pic (rtx disp)
12123 {
12124 return (GET_CODE (disp) == UNSPEC
12125 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12126 }
12127
12128 /* Determine if a given RTX is a valid constant. We already know this
12129 satisfies CONSTANT_P. */
12130
12131 static bool
12132 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12133 {
12134 switch (GET_CODE (x))
12135 {
12136 case CONST:
12137 x = XEXP (x, 0);
12138
12139 if (GET_CODE (x) == PLUS)
12140 {
12141 if (!CONST_INT_P (XEXP (x, 1)))
12142 return false;
12143 x = XEXP (x, 0);
12144 }
12145
12146 if (TARGET_MACHO && darwin_local_data_pic (x))
12147 return true;
12148
12149 /* Only some unspecs are valid as "constants". */
12150 if (GET_CODE (x) == UNSPEC)
12151 switch (XINT (x, 1))
12152 {
12153 case UNSPEC_GOT:
12154 case UNSPEC_GOTOFF:
12155 case UNSPEC_PLTOFF:
12156 return TARGET_64BIT;
12157 case UNSPEC_TPOFF:
12158 case UNSPEC_NTPOFF:
12159 x = XVECEXP (x, 0, 0);
12160 return (GET_CODE (x) == SYMBOL_REF
12161 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12162 case UNSPEC_DTPOFF:
12163 x = XVECEXP (x, 0, 0);
12164 return (GET_CODE (x) == SYMBOL_REF
12165 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12166 default:
12167 return false;
12168 }
12169
12170 /* We must have drilled down to a symbol. */
12171 if (GET_CODE (x) == LABEL_REF)
12172 return true;
12173 if (GET_CODE (x) != SYMBOL_REF)
12174 return false;
12175 /* FALLTHRU */
12176
12177 case SYMBOL_REF:
12178 /* TLS symbols are never valid. */
12179 if (SYMBOL_REF_TLS_MODEL (x))
12180 return false;
12181
12182 /* DLLIMPORT symbols are never valid. */
12183 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12184 && SYMBOL_REF_DLLIMPORT_P (x))
12185 return false;
12186
12187 #if TARGET_MACHO
12188 /* mdynamic-no-pic */
12189 if (MACHO_DYNAMIC_NO_PIC_P)
12190 return machopic_symbol_defined_p (x);
12191 #endif
12192 break;
12193
12194 case CONST_DOUBLE:
12195 if (GET_MODE (x) == TImode
12196 && x != CONST0_RTX (TImode)
12197 && !TARGET_64BIT)
12198 return false;
12199 break;
12200
12201 case CONST_VECTOR:
12202 if (!standard_sse_constant_p (x))
12203 return false;
12204
12205 default:
12206 break;
12207 }
12208
12209 /* Otherwise we handle everything else in the move patterns. */
12210 return true;
12211 }
12212
12213 /* Determine if it's legal to put X into the constant pool. This
12214 is not possible for the address of thread-local symbols, which
12215 is checked above. */
12216
12217 static bool
12218 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12219 {
12220 /* We can always put integral constants and vectors in memory. */
12221 switch (GET_CODE (x))
12222 {
12223 case CONST_INT:
12224 case CONST_DOUBLE:
12225 case CONST_VECTOR:
12226 return false;
12227
12228 default:
12229 break;
12230 }
12231 return !ix86_legitimate_constant_p (mode, x);
12232 }
12233
12234
12235 /* Nonzero if the constant value X is a legitimate general operand
12236 when generating PIC code. It is given that flag_pic is on and
12237 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12238
12239 bool
12240 legitimate_pic_operand_p (rtx x)
12241 {
12242 rtx inner;
12243
12244 switch (GET_CODE (x))
12245 {
12246 case CONST:
12247 inner = XEXP (x, 0);
12248 if (GET_CODE (inner) == PLUS
12249 && CONST_INT_P (XEXP (inner, 1)))
12250 inner = XEXP (inner, 0);
12251
12252 /* Only some unspecs are valid as "constants". */
12253 if (GET_CODE (inner) == UNSPEC)
12254 switch (XINT (inner, 1))
12255 {
12256 case UNSPEC_GOT:
12257 case UNSPEC_GOTOFF:
12258 case UNSPEC_PLTOFF:
12259 return TARGET_64BIT;
12260 case UNSPEC_TPOFF:
12261 x = XVECEXP (inner, 0, 0);
12262 return (GET_CODE (x) == SYMBOL_REF
12263 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12264 case UNSPEC_MACHOPIC_OFFSET:
12265 return legitimate_pic_address_disp_p (x);
12266 default:
12267 return false;
12268 }
12269 /* FALLTHRU */
12270
12271 case SYMBOL_REF:
12272 case LABEL_REF:
12273 return legitimate_pic_address_disp_p (x);
12274
12275 default:
12276 return true;
12277 }
12278 }
12279
12280 /* Determine if a given CONST RTX is a valid memory displacement
12281 in PIC mode. */
12282
12283 bool
12284 legitimate_pic_address_disp_p (rtx disp)
12285 {
12286 bool saw_plus;
12287
12288 /* In 64bit mode we can allow direct addresses of symbols and labels
12289 when they are not dynamic symbols. */
12290 if (TARGET_64BIT)
12291 {
12292 rtx op0 = disp, op1;
12293
12294 switch (GET_CODE (disp))
12295 {
12296 case LABEL_REF:
12297 return true;
12298
12299 case CONST:
12300 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12301 break;
12302 op0 = XEXP (XEXP (disp, 0), 0);
12303 op1 = XEXP (XEXP (disp, 0), 1);
12304 if (!CONST_INT_P (op1)
12305 || INTVAL (op1) >= 16*1024*1024
12306 || INTVAL (op1) < -16*1024*1024)
12307 break;
12308 if (GET_CODE (op0) == LABEL_REF)
12309 return true;
12310 if (GET_CODE (op0) == CONST
12311 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12312 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12313 return true;
12314 if (GET_CODE (op0) == UNSPEC
12315 && XINT (op0, 1) == UNSPEC_PCREL)
12316 return true;
12317 if (GET_CODE (op0) != SYMBOL_REF)
12318 break;
12319 /* FALLTHRU */
12320
12321 case SYMBOL_REF:
12322 /* TLS references should always be enclosed in UNSPEC. */
12323 if (SYMBOL_REF_TLS_MODEL (op0))
12324 return false;
12325 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12326 && ix86_cmodel != CM_LARGE_PIC)
12327 return true;
12328 break;
12329
12330 default:
12331 break;
12332 }
12333 }
12334 if (GET_CODE (disp) != CONST)
12335 return false;
12336 disp = XEXP (disp, 0);
12337
12338 if (TARGET_64BIT)
12339 {
12340 /* It is not safe to allow PLUS expressions here; this limits the
12341 allowed distance of GOT table references. We should not need these anyway. */
12342 if (GET_CODE (disp) != UNSPEC
12343 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12344 && XINT (disp, 1) != UNSPEC_GOTOFF
12345 && XINT (disp, 1) != UNSPEC_PCREL
12346 && XINT (disp, 1) != UNSPEC_PLTOFF))
12347 return false;
12348
12349 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12350 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12351 return false;
12352 return true;
12353 }
12354
12355 saw_plus = false;
12356 if (GET_CODE (disp) == PLUS)
12357 {
12358 if (!CONST_INT_P (XEXP (disp, 1)))
12359 return false;
12360 disp = XEXP (disp, 0);
12361 saw_plus = true;
12362 }
12363
12364 if (TARGET_MACHO && darwin_local_data_pic (disp))
12365 return true;
12366
12367 if (GET_CODE (disp) != UNSPEC)
12368 return false;
12369
12370 switch (XINT (disp, 1))
12371 {
12372 case UNSPEC_GOT:
12373 if (saw_plus)
12374 return false;
12375 /* We need to check for both symbols and labels because VxWorks loads
12376 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12377 details. */
12378 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12379 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12380 case UNSPEC_GOTOFF:
12381 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12382 While the ABI also specifies a 32bit relocation, we don't produce it
12383 in the small PIC model at all. */
12384 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12385 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12386 && !TARGET_64BIT)
12387 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12388 return false;
12389 case UNSPEC_GOTTPOFF:
12390 case UNSPEC_GOTNTPOFF:
12391 case UNSPEC_INDNTPOFF:
12392 if (saw_plus)
12393 return false;
12394 disp = XVECEXP (disp, 0, 0);
12395 return (GET_CODE (disp) == SYMBOL_REF
12396 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12397 case UNSPEC_NTPOFF:
12398 disp = XVECEXP (disp, 0, 0);
12399 return (GET_CODE (disp) == SYMBOL_REF
12400 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12401 case UNSPEC_DTPOFF:
12402 disp = XVECEXP (disp, 0, 0);
12403 return (GET_CODE (disp) == SYMBOL_REF
12404 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12405 }
12406
12407 return false;
12408 }
12409
12410 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12411 replace the input X, or the original X if no replacement is called for.
12412 The output parameter *WIN is 1 if the calling macro should goto WIN,
12413 0 if it should not. */
12414
12415 bool
12416 ix86_legitimize_reload_address (rtx x,
12417 enum machine_mode mode ATTRIBUTE_UNUSED,
12418 int opnum, int type,
12419 int ind_levels ATTRIBUTE_UNUSED)
12420 {
12421 /* Reload can generate:
12422
12423 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12424 (reg:DI 97))
12425 (reg:DI 2 cx))
12426
12427 This RTX is rejected by ix86_legitimate_address_p because
12428 base register 97 is not a strict (hard) register. Following this
12429 rejection, reload pushes all three components into separate
12430 registers, creating an invalid memory address RTX.
12431 
12432 The following code reloads only the invalid part of the
12433 memory address RTX. */
12434
12435 if (GET_CODE (x) == PLUS
12436 && REG_P (XEXP (x, 1))
12437 && GET_CODE (XEXP (x, 0)) == PLUS
12438 && REG_P (XEXP (XEXP (x, 0), 1)))
12439 {
12440 rtx base, index;
12441 bool something_reloaded = false;
12442
12443 base = XEXP (XEXP (x, 0), 1);
12444 if (!REG_OK_FOR_BASE_STRICT_P (base))
12445 {
12446 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12447 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12448 opnum, (enum reload_type) type);
12449 something_reloaded = true;
12450 }
12451
12452 index = XEXP (x, 1);
12453 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12454 {
12455 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12456 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12457 opnum, (enum reload_type) type);
12458 something_reloaded = true;
12459 }
12460
12461 gcc_assert (something_reloaded);
12462 return true;
12463 }
12464
12465 return false;
12466 }
12467
12468 /* Recognizes RTL expressions that are valid memory addresses for an
12469 instruction. The MODE argument is the machine mode for the MEM
12470 expression that wants to use this address.
12471
12472 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12473 convert common non-canonical forms to canonical form so that they will
12474 be recognized. */
12475
12476 static bool
12477 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12478 rtx addr, bool strict)
12479 {
12480 struct ix86_address parts;
12481 rtx base, index, disp;
12482 HOST_WIDE_INT scale;
12483
12484 if (ix86_decompose_address (addr, &parts) <= 0)
12485 /* Decomposition failed. */
12486 return false;
12487
12488 base = parts.base;
12489 index = parts.index;
12490 disp = parts.disp;
12491 scale = parts.scale;
12492
12493 /* Validate base register. */
12494 if (base)
12495 {
12496 rtx reg;
12497
12498 if (REG_P (base))
12499 reg = base;
12500 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12501 reg = SUBREG_REG (base);
12502 else
12503 /* Base is not a register. */
12504 return false;
12505
12506 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12507 return false;
12508
12509 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12510 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12511 /* Base is not valid. */
12512 return false;
12513 }
12514
12515 /* Validate index register. */
12516 if (index)
12517 {
12518 rtx reg;
12519
12520 if (REG_P (index))
12521 reg = index;
12522 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12523 reg = SUBREG_REG (index);
12524 else
12525 /* Index is not a register. */
12526 return false;
12527
12528 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12529 return false;
12530
12531 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12532 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12533 /* Index is not valid. */
12534 return false;
12535 }
12536
12537 /* Index and base should have the same mode. */
12538 if (base && index
12539 && GET_MODE (base) != GET_MODE (index))
12540 return false;
12541
12542 /* Validate scale factor. */
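/* (The SIB byte has a 2-bit scale field, so only factors of 1, 2, 4
   and 8 are encodable.)  */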
12543 if (scale != 1)
12544 {
12545 if (!index)
12546 /* Scale without index. */
12547 return false;
12548
12549 if (scale != 2 && scale != 4 && scale != 8)
12550 /* Scale is not a valid multiplier. */
12551 return false;
12552 }
12553
12554 /* Validate displacement. */
12555 if (disp)
12556 {
12557 if (GET_CODE (disp) == CONST
12558 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12559 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12560 switch (XINT (XEXP (disp, 0), 1))
12561 {
12562 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12563 used. While the ABI also specifies 32bit relocations, we don't
12564 produce them at all and use IP-relative addressing instead. */
12565 case UNSPEC_GOT:
12566 case UNSPEC_GOTOFF:
12567 gcc_assert (flag_pic);
12568 if (!TARGET_64BIT)
12569 goto is_legitimate_pic;
12570
12571 /* 64bit address unspec. */
12572 return false;
12573
12574 case UNSPEC_GOTPCREL:
12575 case UNSPEC_PCREL:
12576 gcc_assert (flag_pic);
12577 goto is_legitimate_pic;
12578
12579 case UNSPEC_GOTTPOFF:
12580 case UNSPEC_GOTNTPOFF:
12581 case UNSPEC_INDNTPOFF:
12582 case UNSPEC_NTPOFF:
12583 case UNSPEC_DTPOFF:
12584 break;
12585
12586 case UNSPEC_STACK_CHECK:
12587 gcc_assert (flag_split_stack);
12588 break;
12589
12590 default:
12591 /* Invalid address unspec. */
12592 return false;
12593 }
12594
12595 else if (SYMBOLIC_CONST (disp)
12596 && (flag_pic
12597 || (TARGET_MACHO
12598 #if TARGET_MACHO
12599 && MACHOPIC_INDIRECT
12600 && !machopic_operand_p (disp)
12601 #endif
12602 )))
12603 {
12604
12605 is_legitimate_pic:
12606 if (TARGET_64BIT && (index || base))
12607 {
12608 /* foo@dtpoff(%rX) is ok. */
12609 if (GET_CODE (disp) != CONST
12610 || GET_CODE (XEXP (disp, 0)) != PLUS
12611 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12612 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12613 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12614 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12615 /* Non-constant pic memory reference. */
12616 return false;
12617 }
12618 else if ((!TARGET_MACHO || flag_pic)
12619 && ! legitimate_pic_address_disp_p (disp))
12620 /* Displacement is an invalid pic construct. */
12621 return false;
12622 #if TARGET_MACHO
12623 else if (MACHO_DYNAMIC_NO_PIC_P
12624 && !ix86_legitimate_constant_p (Pmode, disp))
12625 /* Displacement must be referenced via a non_lazy_pointer. */
12626 return false;
12627 #endif
12628
12629 /* This code used to verify that a symbolic pic displacement
12630 includes the pic_offset_table_rtx register.
12631 
12632 While this is a good idea, unfortunately these constructs may
12633 be created by the "adds using lea" optimization for incorrect
12634 code like:
12635 
12636 int a;
12637 int foo(int i)
12638 {
12639 return *(&a+i);
12640 }
12641 
12642 This code is nonsensical, but results in addressing the
12643 GOT table with a pic_offset_table_rtx base. We can't
12644 easily refuse it, since it gets matched by the
12645 "addsi3" pattern, which later gets split to an lea when
12646 the output register differs from the input. While this
12647 could be handled by a separate addsi pattern for this case
12648 that never results in an lea, disabling this test seems to
12649 be the easier and correct fix for the crash. */
12650 }
12651 else if (GET_CODE (disp) != LABEL_REF
12652 && !CONST_INT_P (disp)
12653 && (GET_CODE (disp) != CONST
12654 || !ix86_legitimate_constant_p (Pmode, disp))
12655 && (GET_CODE (disp) != SYMBOL_REF
12656 || !ix86_legitimate_constant_p (Pmode, disp)))
12657 /* Displacement is not constant. */
12658 return false;
12659 else if (TARGET_64BIT
12660 && !x86_64_immediate_operand (disp, VOIDmode))
12661 /* Displacement is out of range. */
12662 return false;
12663 }
12664
12665 /* Everything looks valid. */
12666 return true;
12667 }
12668
12669 /* Determine if a given RTX is a valid constant address. */
12670
12671 bool
12672 constant_address_p (rtx x)
12673 {
12674 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12675 }
12676 \f
12677 /* Return a unique alias set for the GOT. */
12678
12679 static alias_set_type
12680 ix86_GOT_alias_set (void)
12681 {
12682 static alias_set_type set = -1;
12683 if (set == -1)
12684 set = new_alias_set ();
12685 return set;
12686 }
12687
12688 /* Return a legitimate reference for ORIG (an address) using the
12689 register REG. If REG is 0, a new pseudo is generated.
12690
12691 There are two types of references that must be handled:
12692
12693 1. Global data references must load the address from the GOT, via
12694 the PIC reg. An insn is emitted to do this load, and the reg is
12695 returned.
12696
12697 2. Static data references, constant pool addresses, and code labels
12698 compute the address as an offset from the GOT, whose base is in
12699 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12700 differentiate them from global data objects. The returned
12701 address is the PIC reg + an unspec constant.
12702
12703 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12704 reg also appears in the address. */
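/* Roughly, for the common ia32 case the result is:

   global `foo':  a load of (mem (plus pic_reg
                    (const (unspec [foo] UNSPEC_GOT))))
                  i.e. foo@GOT(%ebx), into REG, and
   local `bar':   (plus pic_reg (const (unspec [bar] UNSPEC_GOTOFF)))
                  i.e. the address bar@GOTOFF(%ebx).  */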
12705
12706 static rtx
12707 legitimize_pic_address (rtx orig, rtx reg)
12708 {
12709 rtx addr = orig;
12710 rtx new_rtx = orig;
12711 rtx base;
12712
12713 #if TARGET_MACHO
12714 if (TARGET_MACHO && !TARGET_64BIT)
12715 {
12716 if (reg == 0)
12717 reg = gen_reg_rtx (Pmode);
12718 /* Use the generic Mach-O PIC machinery. */
12719 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12720 }
12721 #endif
12722
12723 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12724 new_rtx = addr;
12725 else if (TARGET_64BIT
12726 && ix86_cmodel != CM_SMALL_PIC
12727 && gotoff_operand (addr, Pmode))
12728 {
12729 rtx tmpreg;
12730 /* This symbol may be referenced via a displacement from the PIC
12731 base address (@GOTOFF). */
12732
12733 if (reload_in_progress)
12734 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12735 if (GET_CODE (addr) == CONST)
12736 addr = XEXP (addr, 0);
12737 if (GET_CODE (addr) == PLUS)
12738 {
12739 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12740 UNSPEC_GOTOFF);
12741 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12742 }
12743 else
12744 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12745 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12746 if (!reg)
12747 tmpreg = gen_reg_rtx (Pmode);
12748 else
12749 tmpreg = reg;
12750 emit_move_insn (tmpreg, new_rtx);
12751
12752 if (reg != 0)
12753 {
12754 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12755 tmpreg, 1, OPTAB_DIRECT);
12756 new_rtx = reg;
12757 }
12758 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12759 }
12760 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12761 {
12762 /* This symbol may be referenced via a displacement from the PIC
12763 base address (@GOTOFF). */
12764
12765 if (reload_in_progress)
12766 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12767 if (GET_CODE (addr) == CONST)
12768 addr = XEXP (addr, 0);
12769 if (GET_CODE (addr) == PLUS)
12770 {
12771 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12772 UNSPEC_GOTOFF);
12773 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12774 }
12775 else
12776 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12777 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12778 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12779
12780 if (reg != 0)
12781 {
12782 emit_move_insn (reg, new_rtx);
12783 new_rtx = reg;
12784 }
12785 }
12786 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12787 /* We can't use @GOTOFF for text labels on VxWorks;
12788 see gotoff_operand. */
12789 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12790 {
12791 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12792 {
12793 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12794 return legitimize_dllimport_symbol (addr, true);
12795 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12796 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12797 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12798 {
12799 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12800 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12801 }
12802 }
12803
12804 /* For x64 PE-COFF there is no GOT table, so we use the address
12805 directly. */
12806 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12807 {
12808 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12809 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12810
12811 if (reg == 0)
12812 reg = gen_reg_rtx (Pmode);
12813 emit_move_insn (reg, new_rtx);
12814 new_rtx = reg;
12815 }
12816 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12817 {
12818 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12819 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12820 new_rtx = gen_const_mem (Pmode, new_rtx);
12821 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12822
12823 if (reg == 0)
12824 reg = gen_reg_rtx (Pmode);
12825 /* Use gen_movsi directly, otherwise the address is loaded
12826 into a register for CSE. We don't want to CSE these addresses;
12827 instead we CSE addresses from the GOT table, so skip this. */
12828 emit_insn (gen_movsi (reg, new_rtx));
12829 new_rtx = reg;
12830 }
12831 else
12832 {
12833 /* This symbol must be referenced via a load from the
12834 Global Offset Table (@GOT). */
12835
12836 if (reload_in_progress)
12837 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12838 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12839 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12840 if (TARGET_64BIT)
12841 new_rtx = force_reg (Pmode, new_rtx);
12842 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12843 new_rtx = gen_const_mem (Pmode, new_rtx);
12844 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12845
12846 if (reg == 0)
12847 reg = gen_reg_rtx (Pmode);
12848 emit_move_insn (reg, new_rtx);
12849 new_rtx = reg;
12850 }
12851 }
12852 else
12853 {
12854 if (CONST_INT_P (addr)
12855 && !x86_64_immediate_operand (addr, VOIDmode))
12856 {
12857 if (reg)
12858 {
12859 emit_move_insn (reg, addr);
12860 new_rtx = reg;
12861 }
12862 else
12863 new_rtx = force_reg (Pmode, addr);
12864 }
12865 else if (GET_CODE (addr) == CONST)
12866 {
12867 addr = XEXP (addr, 0);
12868
12869 /* We must match stuff we generated before. Assume the only
12870 unspecs that can get here are ours. Not that we could do
12871 anything with them anyway.... */
12872 if (GET_CODE (addr) == UNSPEC
12873 || (GET_CODE (addr) == PLUS
12874 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12875 return orig;
12876 gcc_assert (GET_CODE (addr) == PLUS);
12877 }
12878 if (GET_CODE (addr) == PLUS)
12879 {
12880 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12881
12882 /* Check first to see if this is a constant offset from a @GOTOFF
12883 symbol reference. */
12884 if (gotoff_operand (op0, Pmode)
12885 && CONST_INT_P (op1))
12886 {
12887 if (!TARGET_64BIT)
12888 {
12889 if (reload_in_progress)
12890 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12891 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12892 UNSPEC_GOTOFF);
12893 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12894 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12895 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12896
12897 if (reg != 0)
12898 {
12899 emit_move_insn (reg, new_rtx);
12900 new_rtx = reg;
12901 }
12902 }
12903 else
12904 {
12905 if (INTVAL (op1) < -16*1024*1024
12906 || INTVAL (op1) >= 16*1024*1024)
12907 {
12908 if (!x86_64_immediate_operand (op1, Pmode))
12909 op1 = force_reg (Pmode, op1);
12910 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12911 }
12912 }
12913 }
12914 else
12915 {
12916 base = legitimize_pic_address (XEXP (addr, 0), reg);
12917 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12918 base == reg ? NULL_RTX : reg);
12919
12920 if (CONST_INT_P (new_rtx))
12921 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12922 else
12923 {
12924 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12925 {
12926 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12927 new_rtx = XEXP (new_rtx, 1);
12928 }
12929 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12930 }
12931 }
12932 }
12933 }
12934 return new_rtx;
12935 }
12936 \f
12937 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12938
12939 static rtx
12940 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12941 {
12942 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12943
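/* The modes can differ only for x32, where ptr_mode (and hence the
   UNSPEC_TP above) is SImode while the caller may ask for a DImode
   reference; zero-extend in that case.  */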
12944 if (GET_MODE (tp) != tp_mode)
12945 {
12946 gcc_assert (GET_MODE (tp) == SImode);
12947 gcc_assert (tp_mode == DImode);
12948
12949 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12950 }
12951
12952 if (to_reg)
12953 tp = copy_to_mode_reg (tp_mode, tp);
12954
12955 return tp;
12956 }
12957
12958 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12959
12960 static GTY(()) rtx ix86_tls_symbol;
12961
12962 static rtx
12963 ix86_tls_get_addr (void)
12964 {
12965 if (!ix86_tls_symbol)
12966 {
12967 const char *sym
12968 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12969 ? "___tls_get_addr" : "__tls_get_addr");
12970
12971 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12972 }
12973
12974 return ix86_tls_symbol;
12975 }
12976
12977 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12978
12979 static GTY(()) rtx ix86_tls_module_base_symbol;
12980
12981 rtx
12982 ix86_tls_module_base (void)
12983 {
12984 if (!ix86_tls_module_base_symbol)
12985 {
12986 ix86_tls_module_base_symbol
12987 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12988
12989 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12990 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12991 }
12992
12993 return ix86_tls_module_base_symbol;
12994 }
12995
12996 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12997 false if we expect this to be used for a memory address and true if
12998 we expect to load the address into a register. */
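/* MODEL is one of the four ELF TLS access models handled below:
   global-dynamic and local-dynamic go through __tls_get_addr (or the
   GNU2/TLSDESC sequences), while initial-exec and local-exec compute
   the address directly from the thread pointer.  */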
12999
13000 static rtx
13001 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13002 {
13003 rtx dest, base, off;
13004 rtx pic = NULL_RTX, tp = NULL_RTX;
13005 enum machine_mode tp_mode = Pmode;
13006 int type;
13007
13008 switch (model)
13009 {
13010 case TLS_MODEL_GLOBAL_DYNAMIC:
13011 dest = gen_reg_rtx (Pmode);
13012
13013 if (!TARGET_64BIT)
13014 {
13015 if (flag_pic)
13016 pic = pic_offset_table_rtx;
13017 else
13018 {
13019 pic = gen_reg_rtx (Pmode);
13020 emit_insn (gen_set_got (pic));
13021 }
13022 }
13023
13024 if (TARGET_GNU2_TLS)
13025 {
13026 if (TARGET_64BIT)
13027 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13028 else
13029 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13030
13031 tp = get_thread_pointer (Pmode, true);
13032 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13033
13034 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13035 }
13036 else
13037 {
13038 rtx caddr = ix86_tls_get_addr ();
13039
13040 if (TARGET_64BIT)
13041 {
13042 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
13043
13044 start_sequence ();
13045 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
13046 caddr));
13047 insns = get_insns ();
13048 end_sequence ();
13049
13050 RTL_CONST_CALL_P (insns) = 1;
13051 emit_libcall_block (insns, dest, rax, x);
13052 }
13053 else
13054 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13055 }
13056 break;
13057
13058 case TLS_MODEL_LOCAL_DYNAMIC:
13059 base = gen_reg_rtx (Pmode);
13060
13061 if (!TARGET_64BIT)
13062 {
13063 if (flag_pic)
13064 pic = pic_offset_table_rtx;
13065 else
13066 {
13067 pic = gen_reg_rtx (Pmode);
13068 emit_insn (gen_set_got (pic));
13069 }
13070 }
13071
13072 if (TARGET_GNU2_TLS)
13073 {
13074 rtx tmp = ix86_tls_module_base ();
13075
13076 if (TARGET_64BIT)
13077 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13078 else
13079 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13080
13081 tp = get_thread_pointer (Pmode, true);
13082 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13083 gen_rtx_MINUS (Pmode, tmp, tp));
13084 }
13085 else
13086 {
13087 rtx caddr = ix86_tls_get_addr ();
13088
13089 if (TARGET_64BIT)
13090 {
13091 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
13092
13093 start_sequence ();
13094 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
13095 caddr));
13096 insns = get_insns ();
13097 end_sequence ();
13098
13099 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13100 share the LD_BASE result with other LD model accesses. */
13101 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13102 UNSPEC_TLS_LD_BASE);
13103
13104 RTL_CONST_CALL_P (insns) = 1;
13105 emit_libcall_block (insns, base, rax, eqv);
13106 }
13107 else
13108 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13109 }
13110
13111 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13112 off = gen_rtx_CONST (Pmode, off);
13113
13114 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13115
13116 if (TARGET_GNU2_TLS)
13117 {
13118 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13119
13120 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13121 }
13122 break;
13123
13124 case TLS_MODEL_INITIAL_EXEC:
13125 if (TARGET_64BIT)
13126 {
13127 if (TARGET_SUN_TLS && !TARGET_X32)
13128 {
13129 /* The Sun linker took the AMD64 TLS spec literally
13130 and can only handle %rax as destination of the
13131 initial executable code sequence. */
13132
13133 dest = gen_reg_rtx (DImode);
13134 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13135 return dest;
13136 }
13137
13138 /* Generate DImode references to avoid %fs:(%reg32)
13139 problems and the linker IE->LE relaxation bug. */
13140 tp_mode = DImode;
13141 pic = NULL;
13142 type = UNSPEC_GOTNTPOFF;
13143 }
13144 else if (flag_pic)
13145 {
13146 if (reload_in_progress)
13147 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13148 pic = pic_offset_table_rtx;
13149 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13150 }
13151 else if (!TARGET_ANY_GNU_TLS)
13152 {
13153 pic = gen_reg_rtx (Pmode);
13154 emit_insn (gen_set_got (pic));
13155 type = UNSPEC_GOTTPOFF;
13156 }
13157 else
13158 {
13159 pic = NULL;
13160 type = UNSPEC_INDNTPOFF;
13161 }
13162
13163 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13164 off = gen_rtx_CONST (tp_mode, off);
13165 if (pic)
13166 off = gen_rtx_PLUS (tp_mode, pic, off);
13167 off = gen_const_mem (tp_mode, off);
13168 set_mem_alias_set (off, ix86_GOT_alias_set ());
13169
13170 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13171 {
13172 base = get_thread_pointer (tp_mode,
13173 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13174 off = force_reg (tp_mode, off);
13175 return gen_rtx_PLUS (tp_mode, base, off);
13176 }
13177 else
13178 {
13179 base = get_thread_pointer (Pmode, true);
13180 dest = gen_reg_rtx (Pmode);
13181 emit_insn (ix86_gen_sub3 (dest, base, off));
13182 }
13183 break;
13184
13185 case TLS_MODEL_LOCAL_EXEC:
13186 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13187 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13188 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13189 off = gen_rtx_CONST (Pmode, off);
13190
13191 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13192 {
13193 base = get_thread_pointer (Pmode,
13194 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13195 return gen_rtx_PLUS (Pmode, base, off);
13196 }
13197 else
13198 {
13199 base = get_thread_pointer (Pmode, true);
13200 dest = gen_reg_rtx (Pmode);
13201 emit_insn (ix86_gen_sub3 (dest, base, off));
13202 }
13203 break;
13204
13205 default:
13206 gcc_unreachable ();
13207 }
13208
13209 return dest;
13210 }
13211
13212 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13213 to symbol DECL. */
13214
13215 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13216 htab_t dllimport_map;
13217
13218 static tree
13219 get_dllimport_decl (tree decl)
13220 {
13221 struct tree_map *h, in;
13222 void **loc;
13223 const char *name;
13224 const char *prefix;
13225 size_t namelen, prefixlen;
13226 char *imp_name;
13227 tree to;
13228 rtx rtl;
13229
13230 if (!dllimport_map)
13231 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13232
13233 in.hash = htab_hash_pointer (decl);
13234 in.base.from = decl;
13235 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13236 h = (struct tree_map *) *loc;
13237 if (h)
13238 return h->to;
13239
13240 *loc = h = ggc_alloc_tree_map ();
13241 h->hash = in.hash;
13242 h->base.from = decl;
13243 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13244 VAR_DECL, NULL, ptr_type_node);
13245 DECL_ARTIFICIAL (to) = 1;
13246 DECL_IGNORED_P (to) = 1;
13247 DECL_EXTERNAL (to) = 1;
13248 TREE_READONLY (to) = 1;
13249
13250 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13251 name = targetm.strip_name_encoding (name);
13252 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13253 ? "*__imp_" : "*__imp__";
13254 namelen = strlen (name);
13255 prefixlen = strlen (prefix);
13256 imp_name = (char *) alloca (namelen + prefixlen + 1);
13257 memcpy (imp_name, prefix, prefixlen);
13258 memcpy (imp_name + prefixlen, name, namelen + 1);
13259
13260 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13261 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13262 SET_SYMBOL_REF_DECL (rtl, to);
13263 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13264
13265 rtl = gen_const_mem (Pmode, rtl);
13266 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13267
13268 SET_DECL_RTL (to, rtl);
13269 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13270
13271 return to;
13272 }
13273
13274 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13275 true if we require the result be a register. */
13276
13277 static rtx
13278 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13279 {
13280 tree imp_decl;
13281 rtx x;
13282
13283 gcc_assert (SYMBOL_REF_DECL (symbol));
13284 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13285
13286 x = DECL_RTL (imp_decl);
13287 if (want_reg)
13288 x = force_reg (Pmode, x);
13289 return x;
13290 }
13291
13292 /* Try machine-dependent ways of modifying an illegitimate address
13293 to be legitimate. If we find one, return the new, valid address.
13294 This macro is used in only one place: `memory_address' in explow.c.
13295
13296 OLDX is the address as it was before break_out_memory_refs was called.
13297 In some cases it is useful to look at this to decide what needs to be done.
13298
13299 It is always safe for this macro to do nothing. It exists to recognize
13300 opportunities to optimize the output.
13301
13302 For the 80386, we handle X+REG by loading X into a register R and
13303 using R+REG. R will go in a general reg and indexing will be used.
13304 However, if REG is a broken-out memory address or multiplication,
13305 nothing needs to be done because REG can certainly go in a general reg.
13306
13307 When -fpic is used, special handling is needed for symbolic references.
13308 See comments by legitimize_pic_address in i386.c for details. */
13309
13310 static rtx
13311 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13312 enum machine_mode mode)
13313 {
13314 int changed = 0;
13315 unsigned log;
13316
13317 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13318 if (log)
13319 return legitimize_tls_address (x, (enum tls_model) log, false);
13320 if (GET_CODE (x) == CONST
13321 && GET_CODE (XEXP (x, 0)) == PLUS
13322 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13323 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13324 {
13325 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13326 (enum tls_model) log, false);
13327 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13328 }
13329
13330 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13331 {
13332 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13333 return legitimize_dllimport_symbol (x, true);
13334 if (GET_CODE (x) == CONST
13335 && GET_CODE (XEXP (x, 0)) == PLUS
13336 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13337 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13338 {
13339 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13340 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13341 }
13342 }
13343
13344 if (flag_pic && SYMBOLIC_CONST (x))
13345 return legitimize_pic_address (x, 0);
13346
13347 #if TARGET_MACHO
13348 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13349 return machopic_indirect_data_reference (x, 0);
13350 #endif
13351
13352 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
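/* (E.g. (ashift REG (const_int 2)) becomes (mult REG (const_int 4)),
   which ix86_decompose_address recognizes as a scaled index.)  */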
13353 if (GET_CODE (x) == ASHIFT
13354 && CONST_INT_P (XEXP (x, 1))
13355 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13356 {
13357 changed = 1;
13358 log = INTVAL (XEXP (x, 1));
13359 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13360 GEN_INT (1 << log));
13361 }
13362
13363 if (GET_CODE (x) == PLUS)
13364 {
13365 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13366
13367 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13368 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13369 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13370 {
13371 changed = 1;
13372 log = INTVAL (XEXP (XEXP (x, 0), 1));
13373 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13374 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13375 GEN_INT (1 << log));
13376 }
13377
13378 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13379 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13380 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13381 {
13382 changed = 1;
13383 log = INTVAL (XEXP (XEXP (x, 1), 1));
13384 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13385 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13386 GEN_INT (1 << log));
13387 }
13388
13389 /* Put multiply first if it isn't already. */
13390 if (GET_CODE (XEXP (x, 1)) == MULT)
13391 {
13392 rtx tmp = XEXP (x, 0);
13393 XEXP (x, 0) = XEXP (x, 1);
13394 XEXP (x, 1) = tmp;
13395 changed = 1;
13396 }
13397
13398 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13399 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13400 created by virtual register instantiation, register elimination, and
13401 similar optimizations. */
13402 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13403 {
13404 changed = 1;
13405 x = gen_rtx_PLUS (Pmode,
13406 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13407 XEXP (XEXP (x, 1), 0)),
13408 XEXP (XEXP (x, 1), 1));
13409 }
13410
13411 /* Canonicalize
13412 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13413 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13414 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13415 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13416 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13417 && CONSTANT_P (XEXP (x, 1)))
13418 {
13419 rtx constant;
13420 rtx other = NULL_RTX;
13421
13422 if (CONST_INT_P (XEXP (x, 1)))
13423 {
13424 constant = XEXP (x, 1);
13425 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13426 }
13427 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13428 {
13429 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13430 other = XEXP (x, 1);
13431 }
13432 else
13433 constant = 0;
13434
13435 if (constant)
13436 {
13437 changed = 1;
13438 x = gen_rtx_PLUS (Pmode,
13439 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13440 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13441 plus_constant (Pmode, other,
13442 INTVAL (constant)));
13443 }
13444 }
13445
13446 if (changed && ix86_legitimate_address_p (mode, x, false))
13447 return x;
13448
13449 if (GET_CODE (XEXP (x, 0)) == MULT)
13450 {
13451 changed = 1;
13452 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13453 }
13454
13455 if (GET_CODE (XEXP (x, 1)) == MULT)
13456 {
13457 changed = 1;
13458 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13459 }
13460
13461 if (changed
13462 && REG_P (XEXP (x, 1))
13463 && REG_P (XEXP (x, 0)))
13464 return x;
13465
13466 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13467 {
13468 changed = 1;
13469 x = legitimize_pic_address (x, 0);
13470 }
13471
13472 if (changed && ix86_legitimate_address_p (mode, x, false))
13473 return x;
13474
13475 if (REG_P (XEXP (x, 0)))
13476 {
13477 rtx temp = gen_reg_rtx (Pmode);
13478 rtx val = force_operand (XEXP (x, 1), temp);
13479 if (val != temp)
13480 {
13481 if (GET_MODE (val) != Pmode)
13482 val = convert_to_mode (Pmode, val, 1);
13483 emit_move_insn (temp, val);
13484 }
13485
13486 XEXP (x, 1) = temp;
13487 return x;
13488 }
13489
13490 else if (REG_P (XEXP (x, 1)))
13491 {
13492 rtx temp = gen_reg_rtx (Pmode);
13493 rtx val = force_operand (XEXP (x, 0), temp);
13494 if (val != temp)
13495 {
13496 if (GET_MODE (val) != Pmode)
13497 val = convert_to_mode (Pmode, val, 1);
13498 emit_move_insn (temp, val);
13499 }
13500
13501 XEXP (x, 0) = temp;
13502 return x;
13503 }
13504 }
13505
13506 return x;
13507 }
13508 \f
13509 /* Print an integer constant expression in assembler syntax. Addition
13510 and subtraction are the only arithmetic that may appear in these
13511 expressions. FILE is the stdio stream to write to, X is the rtx, and
13512 CODE is the operand print code from the output string. */
13513
13514 static void
13515 output_pic_addr_const (FILE *file, rtx x, int code)
13516 {
13517 char buf[256];
13518
13519 switch (GET_CODE (x))
13520 {
13521 case PC:
13522 gcc_assert (flag_pic);
13523 putc ('.', file);
13524 break;
13525
13526 case SYMBOL_REF:
13527 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13528 output_addr_const (file, x);
13529 else
13530 {
13531 const char *name = XSTR (x, 0);
13532
13533 /* Mark the decl as referenced so that cgraph will
13534 output the function. */
13535 if (SYMBOL_REF_DECL (x))
13536 mark_decl_referenced (SYMBOL_REF_DECL (x));
13537
13538 #if TARGET_MACHO
13539 if (MACHOPIC_INDIRECT
13540 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13541 name = machopic_indirection_name (x, /*stub_p=*/true);
13542 #endif
13543 assemble_name (file, name);
13544 }
13545 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13546 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13547 fputs ("@PLT", file);
13548 break;
13549
13550 case LABEL_REF:
13551 x = XEXP (x, 0);
13552 /* FALLTHRU */
13553 case CODE_LABEL:
13554 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13555 assemble_name (asm_out_file, buf);
13556 break;
13557
13558 case CONST_INT:
13559 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13560 break;
13561
13562 case CONST:
13563 /* This used to output parentheses around the expression,
13564 but that does not work on the 386 (either ATT or BSD assembler). */
13565 output_pic_addr_const (file, XEXP (x, 0), code);
13566 break;
13567
13568 case CONST_DOUBLE:
13569 if (GET_MODE (x) == VOIDmode)
13570 {
13571 /* We can use %d if the number is <32 bits and positive. */
13572 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13573 fprintf (file, "0x%lx%08lx",
13574 (unsigned long) CONST_DOUBLE_HIGH (x),
13575 (unsigned long) CONST_DOUBLE_LOW (x));
13576 else
13577 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13578 }
13579 else
13580 /* We can't handle floating point constants;
13581 TARGET_PRINT_OPERAND must handle them. */
13582 output_operand_lossage ("floating constant misused");
13583 break;
13584
13585 case PLUS:
13586 /* Some assemblers need integer constants to appear first. */
13587 if (CONST_INT_P (XEXP (x, 0)))
13588 {
13589 output_pic_addr_const (file, XEXP (x, 0), code);
13590 putc ('+', file);
13591 output_pic_addr_const (file, XEXP (x, 1), code);
13592 }
13593 else
13594 {
13595 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13596 output_pic_addr_const (file, XEXP (x, 1), code);
13597 putc ('+', file);
13598 output_pic_addr_const (file, XEXP (x, 0), code);
13599 }
13600 break;
13601
13602 case MINUS:
13603 if (!TARGET_MACHO)
13604 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13605 output_pic_addr_const (file, XEXP (x, 0), code);
13606 putc ('-', file);
13607 output_pic_addr_const (file, XEXP (x, 1), code);
13608 if (!TARGET_MACHO)
13609 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13610 break;
13611
13612 case UNSPEC:
13613 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13614 {
13615 bool f = i386_asm_output_addr_const_extra (file, x);
13616 gcc_assert (f);
13617 break;
13618 }
13619
13620 gcc_assert (XVECLEN (x, 0) == 1);
13621 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13622 switch (XINT (x, 1))
13623 {
13624 case UNSPEC_GOT:
13625 fputs ("@GOT", file);
13626 break;
13627 case UNSPEC_GOTOFF:
13628 fputs ("@GOTOFF", file);
13629 break;
13630 case UNSPEC_PLTOFF:
13631 fputs ("@PLTOFF", file);
13632 break;
13633 case UNSPEC_PCREL:
13634 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13635 "(%rip)" : "[rip]", file);
13636 break;
13637 case UNSPEC_GOTPCREL:
13638 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13639 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13640 break;
13641 case UNSPEC_GOTTPOFF:
13642 /* FIXME: This might be @TPOFF in Sun ld too. */
13643 fputs ("@gottpoff", file);
13644 break;
13645 case UNSPEC_TPOFF:
13646 fputs ("@tpoff", file);
13647 break;
13648 case UNSPEC_NTPOFF:
13649 if (TARGET_64BIT)
13650 fputs ("@tpoff", file);
13651 else
13652 fputs ("@ntpoff", file);
13653 break;
13654 case UNSPEC_DTPOFF:
13655 fputs ("@dtpoff", file);
13656 break;
13657 case UNSPEC_GOTNTPOFF:
13658 if (TARGET_64BIT)
13659 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13660 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13661 else
13662 fputs ("@gotntpoff", file);
13663 break;
13664 case UNSPEC_INDNTPOFF:
13665 fputs ("@indntpoff", file);
13666 break;
13667 #if TARGET_MACHO
13668 case UNSPEC_MACHOPIC_OFFSET:
13669 putc ('-', file);
13670 machopic_output_function_base_name (file);
13671 break;
13672 #endif
13673 default:
13674 output_operand_lossage ("invalid UNSPEC as operand");
13675 break;
13676 }
13677 break;
13678
13679 default:
13680 output_operand_lossage ("invalid expression as operand");
13681 }
13682 }
13683
13684 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13685 We need to emit DTP-relative relocations. */
13686
13687 static void ATTRIBUTE_UNUSED
13688 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13689 {
13690 fputs (ASM_LONG, file);
13691 output_addr_const (file, x);
13692 fputs ("@dtpoff", file);
13693 switch (size)
13694 {
13695 case 4:
13696 break;
13697 case 8:
13698 fputs (", 0", file);
13699 break;
13700 default:
13701 gcc_unreachable ();
13702 }
13703 }
13704
13705 /* Return true if X is a representation of the PIC register. This copes
13706 with calls from ix86_find_base_term, where the register might have
13707 been replaced by a cselib value. */
13708
13709 static bool
13710 ix86_pic_register_p (rtx x)
13711 {
13712 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13713 return (pic_offset_table_rtx
13714 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13715 else
13716 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13717 }
13718
13719 /* Helper function for ix86_delegitimize_address.
13720 Attempt to delegitimize TLS local-exec accesses. */
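/* E.g. a local-exec access such as %fs:foo@tpoff (%gs:foo@ntpoff on
   ia32) decomposes with addr.seg set and addr.disp equal to
   (const (unspec [foo] UNSPEC_NTPOFF)); we rebuild plain `foo' (plus
   any base/index) so the debug info refers to the symbol itself.  */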
13721
13722 static rtx
13723 ix86_delegitimize_tls_address (rtx orig_x)
13724 {
13725 rtx x = orig_x, unspec;
13726 struct ix86_address addr;
13727
13728 if (!TARGET_TLS_DIRECT_SEG_REFS)
13729 return orig_x;
13730 if (MEM_P (x))
13731 x = XEXP (x, 0);
13732 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13733 return orig_x;
13734 if (ix86_decompose_address (x, &addr) == 0
13735 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13736 || addr.disp == NULL_RTX
13737 || GET_CODE (addr.disp) != CONST)
13738 return orig_x;
13739 unspec = XEXP (addr.disp, 0);
13740 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13741 unspec = XEXP (unspec, 0);
13742 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13743 return orig_x;
13744 x = XVECEXP (unspec, 0, 0);
13745 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13746 if (unspec != XEXP (addr.disp, 0))
13747 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13748 if (addr.index)
13749 {
13750 rtx idx = addr.index;
13751 if (addr.scale != 1)
13752 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13753 x = gen_rtx_PLUS (Pmode, idx, x);
13754 }
13755 if (addr.base)
13756 x = gen_rtx_PLUS (Pmode, addr.base, x);
13757 if (MEM_P (orig_x))
13758 x = replace_equiv_address_nv (orig_x, x);
13759 return x;
13760 }
13761
13762 /* In the name of slightly smaller debug output, and to cater to
13763 general assembler lossage, recognize PIC+GOTOFF and turn it back
13764 into a direct symbol reference.
13765
13766 On Darwin, this is necessary to avoid a crash, because Darwin
13767 has a different PIC label for each routine but the DWARF debugging
13768 information is not associated with any particular routine, so it's
13769 necessary to remove references to the PIC label from RTL stored by
13770 the DWARF output code. */
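/* E.g. (plus pic_reg (const (unspec [foo] UNSPEC_GOTOFF))) is turned
   back into plain `foo'.  */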
13771
13772 static rtx
13773 ix86_delegitimize_address (rtx x)
13774 {
13775 rtx orig_x = delegitimize_mem_from_attrs (x);
13776 /* addend is NULL or some rtx if x is something+GOTOFF where
13777 something doesn't include the PIC register. */
13778 rtx addend = NULL_RTX;
13779 /* reg_addend is NULL or a multiple of some register. */
13780 rtx reg_addend = NULL_RTX;
13781 /* const_addend is NULL or a const_int. */
13782 rtx const_addend = NULL_RTX;
13783 /* This is the result, or NULL. */
13784 rtx result = NULL_RTX;
13785
13786 x = orig_x;
13787
13788 if (MEM_P (x))
13789 x = XEXP (x, 0);
13790
13791 if (TARGET_64BIT)
13792 {
13793 if (GET_CODE (x) == CONST
13794 && GET_CODE (XEXP (x, 0)) == PLUS
13795 && GET_MODE (XEXP (x, 0)) == Pmode
13796 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13797 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13798 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13799 {
13800 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13801 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13802 if (MEM_P (orig_x))
13803 x = replace_equiv_address_nv (orig_x, x);
13804 return x;
13805 }
13806 if (GET_CODE (x) != CONST
13807 || GET_CODE (XEXP (x, 0)) != UNSPEC
13808 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13809 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13810 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13811 return ix86_delegitimize_tls_address (orig_x);
13812 x = XVECEXP (XEXP (x, 0), 0, 0);
13813 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13814 {
13815 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13816 GET_MODE (x), 0);
13817 if (x == NULL_RTX)
13818 return orig_x;
13819 }
13820 return x;
13821 }
13822
13823 if (GET_CODE (x) != PLUS
13824 || GET_CODE (XEXP (x, 1)) != CONST)
13825 return ix86_delegitimize_tls_address (orig_x);
13826
13827 if (ix86_pic_register_p (XEXP (x, 0)))
13828 /* %ebx + GOT/GOTOFF */
13829 ;
13830 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13831 {
13832 /* %ebx + %reg * scale + GOT/GOTOFF */
13833 reg_addend = XEXP (x, 0);
13834 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13835 reg_addend = XEXP (reg_addend, 1);
13836 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13837 reg_addend = XEXP (reg_addend, 0);
13838 else
13839 {
13840 reg_addend = NULL_RTX;
13841 addend = XEXP (x, 0);
13842 }
13843 }
13844 else
13845 addend = XEXP (x, 0);
13846
13847 x = XEXP (XEXP (x, 1), 0);
13848 if (GET_CODE (x) == PLUS
13849 && CONST_INT_P (XEXP (x, 1)))
13850 {
13851 const_addend = XEXP (x, 1);
13852 x = XEXP (x, 0);
13853 }
13854
13855 if (GET_CODE (x) == UNSPEC
13856 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13857 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13858 result = XVECEXP (x, 0, 0);
13859
13860 if (TARGET_MACHO && darwin_local_data_pic (x)
13861 && !MEM_P (orig_x))
13862 result = XVECEXP (x, 0, 0);
13863
13864 if (! result)
13865 return ix86_delegitimize_tls_address (orig_x);
13866
13867 if (const_addend)
13868 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13869 if (reg_addend)
13870 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13871 if (addend)
13872 {
13873 /* If the rest of original X doesn't involve the PIC register, add
13874 addend and subtract pic_offset_table_rtx. This can happen e.g.
13875 for code like:
13876 leal (%ebx, %ecx, 4), %ecx
13877 ...
13878 movl foo@GOTOFF(%ecx), %edx
13879 in which case we return (%ecx - %ebx) + foo. */
13880 if (pic_offset_table_rtx)
13881 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13882 pic_offset_table_rtx),
13883 result);
13884 else
13885 return orig_x;
13886 }
13887 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13888 {
13889 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13890 if (result == NULL_RTX)
13891 return orig_x;
13892 }
13893 return result;
13894 }
13895
13896 /* If X is a machine specific address (i.e. a symbol or label being
13897 referenced as a displacement from the GOT implemented using an
13898 UNSPEC), then return the base term. Otherwise return X. */
13899
13900 rtx
13901 ix86_find_base_term (rtx x)
13902 {
13903 rtx term;
13904
13905 if (TARGET_64BIT)
13906 {
13907 if (GET_CODE (x) != CONST)
13908 return x;
13909 term = XEXP (x, 0);
13910 if (GET_CODE (term) == PLUS
13911 && (CONST_INT_P (XEXP (term, 1))
13912 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13913 term = XEXP (term, 0);
13914 if (GET_CODE (term) != UNSPEC
13915 || (XINT (term, 1) != UNSPEC_GOTPCREL
13916 && XINT (term, 1) != UNSPEC_PCREL))
13917 return x;
13918
13919 return XVECEXP (term, 0, 0);
13920 }
13921
13922 return ix86_delegitimize_address (x);
13923 }
13924 \f
13925 static void
13926 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13927 bool fp, FILE *file)
13928 {
13929 const char *suffix;
13930
13931 if (mode == CCFPmode || mode == CCFPUmode)
13932 {
13933 code = ix86_fp_compare_code_to_integer (code);
13934 mode = CCmode;
13935 }
13936 if (reverse)
13937 code = reverse_condition (code);
13938
13939 switch (code)
13940 {
13941 case EQ:
13942 switch (mode)
13943 {
13944 case CCAmode:
13945 suffix = "a";
13946 break;
13947
13948 case CCCmode:
13949 suffix = "c";
13950 break;
13951
13952 case CCOmode:
13953 suffix = "o";
13954 break;
13955
13956 case CCSmode:
13957 suffix = "s";
13958 break;
13959
13960 default:
13961 suffix = "e";
13962 }
13963 break;
13964 case NE:
13965 switch (mode)
13966 {
13967 case CCAmode:
13968 suffix = "na";
13969 break;
13970
13971 case CCCmode:
13972 suffix = "nc";
13973 break;
13974
13975 case CCOmode:
13976 suffix = "no";
13977 break;
13978
13979 case CCSmode:
13980 suffix = "ns";
13981 break;
13982
13983 default:
13984 suffix = "ne";
13985 }
13986 break;
13987 case GT:
13988 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13989 suffix = "g";
13990 break;
13991 case GTU:
13992 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13993 Those same assemblers have the same but opposite lossage on cmov. */
13994 if (mode == CCmode)
13995 suffix = fp ? "nbe" : "a";
13996 else if (mode == CCCmode)
13997 suffix = "b";
13998 else
13999 gcc_unreachable ();
14000 break;
14001 case LT:
14002 switch (mode)
14003 {
14004 case CCNOmode:
14005 case CCGOCmode:
14006 suffix = "s";
14007 break;
14008
14009 case CCmode:
14010 case CCGCmode:
14011 suffix = "l";
14012 break;
14013
14014 default:
14015 gcc_unreachable ();
14016 }
14017 break;
14018 case LTU:
14019 gcc_assert (mode == CCmode || mode == CCCmode);
14020 suffix = "b";
14021 break;
14022 case GE:
14023 switch (mode)
14024 {
14025 case CCNOmode:
14026 case CCGOCmode:
14027 suffix = "ns";
14028 break;
14029
14030 case CCmode:
14031 case CCGCmode:
14032 suffix = "ge";
14033 break;
14034
14035 default:
14036 gcc_unreachable ();
14037 }
14038 break;
14039 case GEU:
14040 /* ??? As above. */
14041 gcc_assert (mode == CCmode || mode == CCCmode);
14042 suffix = fp ? "nb" : "ae";
14043 break;
14044 case LE:
14045 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14046 suffix = "le";
14047 break;
14048 case LEU:
14049 /* ??? As above. */
14050 if (mode == CCmode)
14051 suffix = "be";
14052 else if (mode == CCCmode)
14053 suffix = fp ? "nb" : "ae";
14054 else
14055 gcc_unreachable ();
14056 break;
14057 case UNORDERED:
14058 suffix = fp ? "u" : "p";
14059 break;
14060 case ORDERED:
14061 suffix = fp ? "nu" : "np";
14062 break;
14063 default:
14064 gcc_unreachable ();
14065 }
14066 fputs (suffix, file);
14067 }
14068
14069 /* Print the name of register X to FILE based on its machine mode and number.
14070 If CODE is 'w', pretend the mode is HImode.
14071 If CODE is 'b', pretend the mode is QImode.
14072 If CODE is 'k', pretend the mode is SImode.
14073 If CODE is 'q', pretend the mode is DImode.
14074 If CODE is 'x', pretend the mode is V4SFmode.
14075 If CODE is 't', pretend the mode is V8SFmode.
14076 If CODE is 'h', pretend the reg is the 'high' byte register.
14077 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
14078 If CODE is 'd', duplicate the operand for AVX instruction.
14079 */
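/* For example (illustrative, ignoring the leading '%' added for AT&T
   syntax): for the ax register, code 'b' prints "al", 'w' prints "ax",
   'k' prints "eax" and, on a 64-bit target, 'q' prints "rax"; for an SSE
   register, code 't' prints the corresponding "ymmN" name.  */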
14080
14081 void
14082 print_reg (rtx x, int code, FILE *file)
14083 {
14084 const char *reg;
14085 bool duplicated = code == 'd' && TARGET_AVX;
14086
14087 gcc_assert (x == pc_rtx
14088 || (REGNO (x) != ARG_POINTER_REGNUM
14089 && REGNO (x) != FRAME_POINTER_REGNUM
14090 && REGNO (x) != FLAGS_REG
14091 && REGNO (x) != FPSR_REG
14092 && REGNO (x) != FPCR_REG));
14093
14094 if (ASSEMBLER_DIALECT == ASM_ATT)
14095 putc ('%', file);
14096
14097 if (x == pc_rtx)
14098 {
14099 gcc_assert (TARGET_64BIT);
14100 fputs ("rip", file);
14101 return;
14102 }
14103
14104 if (code == 'w' || MMX_REG_P (x))
14105 code = 2;
14106 else if (code == 'b')
14107 code = 1;
14108 else if (code == 'k')
14109 code = 4;
14110 else if (code == 'q')
14111 code = 8;
14112 else if (code == 'y')
14113 code = 3;
14114 else if (code == 'h')
14115 code = 0;
14116 else if (code == 'x')
14117 code = 16;
14118 else if (code == 't')
14119 code = 32;
14120 else
14121 code = GET_MODE_SIZE (GET_MODE (x));
14122
14123 /* Irritatingly, AMD extended registers use a different naming convention
14124 from the normal registers: "r%d[bwd]".  */
14125 if (REX_INT_REG_P (x))
14126 {
14127 gcc_assert (TARGET_64BIT);
14128 putc ('r', file);
14129 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
14130 switch (code)
14131 {
14132 case 0:
14133 error ("extended registers have no high halves");
14134 break;
14135 case 1:
14136 putc ('b', file);
14137 break;
14138 case 2:
14139 putc ('w', file);
14140 break;
14141 case 4:
14142 putc ('d', file);
14143 break;
14144 case 8:
14145 /* no suffix */
14146 break;
14147 default:
14148 error ("unsupported operand size for extended register");
14149 break;
14150 }
14151 return;
14152 }
14153
14154 reg = NULL;
14155 switch (code)
14156 {
14157 case 3:
14158 if (STACK_TOP_P (x))
14159 {
14160 reg = "st(0)";
14161 break;
14162 }
14163 /* FALLTHRU */
14164 case 8:
14165 case 4:
14166 case 12:
14167 if (! ANY_FP_REG_P (x))
14168 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14169 /* FALLTHRU */
14170 case 16:
14171 case 2:
14172 normal:
14173 reg = hi_reg_name[REGNO (x)];
14174 break;
14175 case 1:
14176 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
14177 goto normal;
14178 reg = qi_reg_name[REGNO (x)];
14179 break;
14180 case 0:
14181 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
14182 goto normal;
14183 reg = qi_high_reg_name[REGNO (x)];
14184 break;
14185 case 32:
14186 if (SSE_REG_P (x))
14187 {
14188 gcc_assert (!duplicated);
14189 putc ('y', file);
14190 fputs (hi_reg_name[REGNO (x)] + 1, file);
14191 return;
14192 }
14193 break;
14194 default:
14195 gcc_unreachable ();
14196 }
14197
14198 fputs (reg, file);
14199 if (duplicated)
14200 {
14201 if (ASSEMBLER_DIALECT == ASM_ATT)
14202 fprintf (file, ", %%%s", reg);
14203 else
14204 fprintf (file, ", %s", reg);
14205 }
14206 }
14207
14208 /* Locate some local-dynamic symbol still in use by this function
14209 so that we can print its name in some tls_local_dynamic_base
14210 pattern. */
14211
14212 static int
14213 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14214 {
14215 rtx x = *px;
14216
14217 if (GET_CODE (x) == SYMBOL_REF
14218 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14219 {
14220 cfun->machine->some_ld_name = XSTR (x, 0);
14221 return 1;
14222 }
14223
14224 return 0;
14225 }
14226
14227 static const char *
14228 get_some_local_dynamic_name (void)
14229 {
14230 rtx insn;
14231
14232 if (cfun->machine->some_ld_name)
14233 return cfun->machine->some_ld_name;
14234
14235 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14236 if (NONDEBUG_INSN_P (insn)
14237 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14238 return cfun->machine->some_ld_name;
14239
14240 return NULL;
14241 }
14242
14243 /* Meaning of CODE:
14244 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14245 C -- print opcode suffix for set/cmov insn.
14246 c -- like C, but print reversed condition
14247 F,f -- likewise, but for floating-point.
14248 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14249 otherwise nothing
14250 R -- print the prefix for register names.
14251 z -- print the opcode suffix for the size of the current operand.
14252 Z -- likewise, with special suffixes for x87 instructions.
14253 * -- print a star (in certain assembler syntax)
14254 A -- print an absolute memory reference.
14255 E -- print address with DImode register names if TARGET_64BIT.
14256 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14257 s -- print a shift double count, followed by the assembler's argument
14258 delimiter.
14259 b -- print the QImode name of the register for the indicated operand.
14260 %b0 would print %al if operands[0] is reg 0.
14261 w -- likewise, print the HImode name of the register.
14262 k -- likewise, print the SImode name of the register.
14263 q -- likewise, print the DImode name of the register.
14264 x -- likewise, print the V4SFmode name of the register.
14265 t -- likewise, print the V8SFmode name of the register.
14266 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14267 y -- print "st(0)" instead of "st" as a register.
14268 d -- print duplicated register operand for AVX instruction.
14269 D -- print condition for SSE cmp instruction.
14270 P -- if PIC, print an @PLT suffix.
14271 p -- print raw symbol name.
14272 X -- don't print any sort of PIC '@' suffix for a symbol.
14273 & -- print some in-use local-dynamic symbol name.
14274 H -- print a memory address offset by 8; used for sse high-parts
14275 Y -- print condition for XOP pcom* instruction.
14276 + -- print a branch hint as 'cs' or 'ds' prefix
14277 ; -- print a semicolon (after prefixes due to bug in older gas).
14278 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14279 @ -- print a segment register of thread base pointer load
14280 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14281 */
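/* Illustrative template fragments (not taken from any particular pattern):
   "mov%z0\t{%1, %0|%0, %1}" picks the b/w/l/q suffix from the size of
   operand 0, "%k1" prints the SImode name of the register in operand 1,
   "%+" emits a ds/cs branch hint prefix, and "%^" emits an addr32 prefix
   when Pmode is narrower than word_mode (e.g. x32).  */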
14282
14283 void
14284 ix86_print_operand (FILE *file, rtx x, int code)
14285 {
14286 if (code)
14287 {
14288 switch (code)
14289 {
14290 case 'A':
14291 switch (ASSEMBLER_DIALECT)
14292 {
14293 case ASM_ATT:
14294 putc ('*', file);
14295 break;
14296
14297 case ASM_INTEL:
14298 /* Intel syntax. For absolute addresses, registers should not
14299 be surrounded by brackets. */
14300 if (!REG_P (x))
14301 {
14302 putc ('[', file);
14303 ix86_print_operand (file, x, 0);
14304 putc (']', file);
14305 return;
14306 }
14307 break;
14308
14309 default:
14310 gcc_unreachable ();
14311 }
14312
14313 ix86_print_operand (file, x, 0);
14314 return;
14315
14316 case 'E':
14317 /* Wrap address in an UNSPEC to declare special handling. */
14318 if (TARGET_64BIT)
14319 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14320
14321 output_address (x);
14322 return;
14323
14324 case 'L':
14325 if (ASSEMBLER_DIALECT == ASM_ATT)
14326 putc ('l', file);
14327 return;
14328
14329 case 'W':
14330 if (ASSEMBLER_DIALECT == ASM_ATT)
14331 putc ('w', file);
14332 return;
14333
14334 case 'B':
14335 if (ASSEMBLER_DIALECT == ASM_ATT)
14336 putc ('b', file);
14337 return;
14338
14339 case 'Q':
14340 if (ASSEMBLER_DIALECT == ASM_ATT)
14341 putc ('l', file);
14342 return;
14343
14344 case 'S':
14345 if (ASSEMBLER_DIALECT == ASM_ATT)
14346 putc ('s', file);
14347 return;
14348
14349 case 'T':
14350 if (ASSEMBLER_DIALECT == ASM_ATT)
14351 putc ('t', file);
14352 return;
14353
14354 case 'O':
14355 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14356 if (ASSEMBLER_DIALECT != ASM_ATT)
14357 return;
14358
14359 switch (GET_MODE_SIZE (GET_MODE (x)))
14360 {
14361 case 2:
14362 putc ('w', file);
14363 break;
14364
14365 case 4:
14366 putc ('l', file);
14367 break;
14368
14369 case 8:
14370 putc ('q', file);
14371 break;
14372
14373 default:
14374 output_operand_lossage
14375 ("invalid operand size for operand code 'O'");
14376 return;
14377 }
14378
14379 putc ('.', file);
14380 #endif
14381 return;
14382
14383 case 'z':
14384 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14385 {
14386 /* Opcodes don't get size suffixes if using Intel syntax. */
14387 if (ASSEMBLER_DIALECT == ASM_INTEL)
14388 return;
14389
14390 switch (GET_MODE_SIZE (GET_MODE (x)))
14391 {
14392 case 1:
14393 putc ('b', file);
14394 return;
14395
14396 case 2:
14397 putc ('w', file);
14398 return;
14399
14400 case 4:
14401 putc ('l', file);
14402 return;
14403
14404 case 8:
14405 putc ('q', file);
14406 return;
14407
14408 default:
14409 output_operand_lossage
14410 ("invalid operand size for operand code 'z'");
14411 return;
14412 }
14413 }
14414
14415 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14416 warning
14417 (0, "non-integer operand used with operand code 'z'");
14418 /* FALLTHRU */
14419
14420 case 'Z':
14421 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14422 if (ASSEMBLER_DIALECT == ASM_INTEL)
14423 return;
14424
14425 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14426 {
14427 switch (GET_MODE_SIZE (GET_MODE (x)))
14428 {
14429 case 2:
14430 #ifdef HAVE_AS_IX86_FILDS
14431 putc ('s', file);
14432 #endif
14433 return;
14434
14435 case 4:
14436 putc ('l', file);
14437 return;
14438
14439 case 8:
14440 #ifdef HAVE_AS_IX86_FILDQ
14441 putc ('q', file);
14442 #else
14443 fputs ("ll", file);
14444 #endif
14445 return;
14446
14447 default:
14448 break;
14449 }
14450 }
14451 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14452 {
14453 /* 387 opcodes don't get size suffixes
14454 if the operands are registers. */
14455 if (STACK_REG_P (x))
14456 return;
14457
14458 switch (GET_MODE_SIZE (GET_MODE (x)))
14459 {
14460 case 4:
14461 putc ('s', file);
14462 return;
14463
14464 case 8:
14465 putc ('l', file);
14466 return;
14467
14468 case 12:
14469 case 16:
14470 putc ('t', file);
14471 return;
14472
14473 default:
14474 break;
14475 }
14476 }
14477 else
14478 {
14479 output_operand_lossage
14480 ("invalid operand type used with operand code 'Z'");
14481 return;
14482 }
14483
14484 output_operand_lossage
14485 ("invalid operand size for operand code 'Z'");
14486 return;
14487
14488 case 'd':
14489 case 'b':
14490 case 'w':
14491 case 'k':
14492 case 'q':
14493 case 'h':
14494 case 't':
14495 case 'y':
14496 case 'x':
14497 case 'X':
14498 case 'P':
14499 case 'p':
14500 break;
14501
14502 case 's':
14503 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14504 {
14505 ix86_print_operand (file, x, 0);
14506 fputs (", ", file);
14507 }
14508 return;
14509
14510 case 'Y':
14511 switch (GET_CODE (x))
14512 {
14513 case NE:
14514 fputs ("neq", file);
14515 break;
14516 case EQ:
14517 fputs ("eq", file);
14518 break;
14519 case GE:
14520 case GEU:
14521 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14522 break;
14523 case GT:
14524 case GTU:
14525 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14526 break;
14527 case LE:
14528 case LEU:
14529 fputs ("le", file);
14530 break;
14531 case LT:
14532 case LTU:
14533 fputs ("lt", file);
14534 break;
14535 case UNORDERED:
14536 fputs ("unord", file);
14537 break;
14538 case ORDERED:
14539 fputs ("ord", file);
14540 break;
14541 case UNEQ:
14542 fputs ("ueq", file);
14543 break;
14544 case UNGE:
14545 fputs ("nlt", file);
14546 break;
14547 case UNGT:
14548 fputs ("nle", file);
14549 break;
14550 case UNLE:
14551 fputs ("ule", file);
14552 break;
14553 case UNLT:
14554 fputs ("ult", file);
14555 break;
14556 case LTGT:
14557 fputs ("une", file);
14558 break;
14559 default:
14560 output_operand_lossage ("operand is not a condition code, "
14561 "invalid operand code 'Y'");
14562 return;
14563 }
14564 return;
14565
14566 case 'D':
14567 /* Little bit of braindamage here. The SSE compare instructions
14568 use completely different names for the comparisons than the
14569 fp conditional moves do. */
14570 switch (GET_CODE (x))
14571 {
14572 case UNEQ:
14573 if (TARGET_AVX)
14574 {
14575 fputs ("eq_us", file);
14576 break;
14577 }
14578 case EQ:
14579 fputs ("eq", file);
14580 break;
14581 case UNLT:
14582 if (TARGET_AVX)
14583 {
14584 fputs ("nge", file);
14585 break;
14586 }
14587 case LT:
14588 fputs ("lt", file);
14589 break;
14590 case UNLE:
14591 if (TARGET_AVX)
14592 {
14593 fputs ("ngt", file);
14594 break;
14595 }
14596 case LE:
14597 fputs ("le", file);
14598 break;
14599 case UNORDERED:
14600 fputs ("unord", file);
14601 break;
14602 case LTGT:
14603 if (TARGET_AVX)
14604 {
14605 fputs ("neq_oq", file);
14606 break;
14607 }
14608 case NE:
14609 fputs ("neq", file);
14610 break;
14611 case GE:
14612 if (TARGET_AVX)
14613 {
14614 fputs ("ge", file);
14615 break;
14616 }
14617 case UNGE:
14618 fputs ("nlt", file);
14619 break;
14620 case GT:
14621 if (TARGET_AVX)
14622 {
14623 fputs ("gt", file);
14624 break;
14625 }
14626 case UNGT:
14627 fputs ("nle", file);
14628 break;
14629 case ORDERED:
14630 fputs ("ord", file);
14631 break;
14632 default:
14633 output_operand_lossage ("operand is not a condition code, "
14634 "invalid operand code 'D'");
14635 return;
14636 }
14637 return;
14638
14639 case 'F':
14640 case 'f':
14641 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14642 if (ASSEMBLER_DIALECT == ASM_ATT)
14643 putc ('.', file);
14644 #endif
14645
14646 case 'C':
14647 case 'c':
14648 if (!COMPARISON_P (x))
14649 {
14650 output_operand_lossage ("operand is not a condition code, "
14651 "invalid operand code '%c'", code);
14652 return;
14653 }
14654 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14655 code == 'c' || code == 'f',
14656 code == 'F' || code == 'f',
14657 file);
14658 return;
14659
14660 case 'H':
14661 if (!offsettable_memref_p (x))
14662 {
14663 output_operand_lossage ("operand is not an offsettable memory "
14664 "reference, invalid operand code 'H'");
14665 return;
14666 }
14667 /* It doesn't actually matter what mode we use here, as we're
14668 only going to use this for printing. */
14669 x = adjust_address_nv (x, DImode, 8);
14670 break;
14671
14672 case 'K':
14673 gcc_assert (CONST_INT_P (x));
14674
14675 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14676 #ifdef HAVE_AS_IX86_HLE
14677 fputs ("xacquire ", file);
14678 #else
14679 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14680 #endif
14681 else if (INTVAL (x) & IX86_HLE_RELEASE)
14682 #ifdef HAVE_AS_IX86_HLE
14683 fputs ("xrelease ", file);
14684 #else
14685 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14686 #endif
14687 /* We do not want to print value of the operand. */
14688 return;
14689
14690 case '*':
14691 if (ASSEMBLER_DIALECT == ASM_ATT)
14692 putc ('*', file);
14693 return;
14694
14695 case '&':
14696 {
14697 const char *name = get_some_local_dynamic_name ();
14698 if (name == NULL)
14699 output_operand_lossage ("'%%&' used without any "
14700 "local dynamic TLS references");
14701 else
14702 assemble_name (file, name);
14703 return;
14704 }
14705
14706 case '+':
14707 {
14708 rtx x;
14709
14710 if (!optimize
14711 || optimize_function_for_size_p (cfun)
14712 || !TARGET_BRANCH_PREDICTION_HINTS)
14713 return;
14714
14715 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14716 if (x)
14717 {
14718 int pred_val = INTVAL (XEXP (x, 0));
14719
14720 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14721 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14722 {
14723 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14724 bool cputaken
14725 = final_forward_branch_p (current_output_insn) == 0;
14726
14727 /* Emit hints only in the case where the default branch prediction
14728 heuristics would fail. */
14729 if (taken != cputaken)
14730 {
14731 /* We use 3e (DS) prefix for taken branches and
14732 2e (CS) prefix for not taken branches. */
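/* Illustratively, the final assembly might then read "ds ; jne .L3"
   for a branch hinted taken, or "cs ; jne .L3" for one hinted not
   taken (".L3" is a made-up label).  */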
14733 if (taken)
14734 fputs ("ds ; ", file);
14735 else
14736 fputs ("cs ; ", file);
14737 }
14738 }
14739 }
14740 return;
14741 }
14742
14743 case ';':
14744 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14745 putc (';', file);
14746 #endif
14747 return;
14748
14749 case '@':
14750 if (ASSEMBLER_DIALECT == ASM_ATT)
14751 putc ('%', file);
14752
14753 /* The kernel uses a different segment register for performance
14754 reasons; this way a system call does not have to trash the
14755 userspace segment register, which would be expensive. */
14756 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14757 fputs ("fs", file);
14758 else
14759 fputs ("gs", file);
14760 return;
14761
14762 case '~':
14763 putc (TARGET_AVX2 ? 'i' : 'f', file);
14764 return;
14765
14766 case '^':
14767 if (TARGET_64BIT && Pmode != word_mode)
14768 fputs ("addr32 ", file);
14769 return;
14770
14771 default:
14772 output_operand_lossage ("invalid operand code '%c'", code);
14773 }
14774 }
14775
14776 if (REG_P (x))
14777 print_reg (x, code, file);
14778
14779 else if (MEM_P (x))
14780 {
14781 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14782 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14783 && GET_MODE (x) != BLKmode)
14784 {
14785 const char * size;
14786 switch (GET_MODE_SIZE (GET_MODE (x)))
14787 {
14788 case 1: size = "BYTE"; break;
14789 case 2: size = "WORD"; break;
14790 case 4: size = "DWORD"; break;
14791 case 8: size = "QWORD"; break;
14792 case 12: size = "TBYTE"; break;
14793 case 16:
14794 if (GET_MODE (x) == XFmode)
14795 size = "TBYTE";
14796 else
14797 size = "XMMWORD";
14798 break;
14799 case 32: size = "YMMWORD"; break;
14800 default:
14801 gcc_unreachable ();
14802 }
14803
14804 /* Check for explicit size override (codes 'b', 'w', 'k',
14805 'q' and 'x') */
14806 if (code == 'b')
14807 size = "BYTE";
14808 else if (code == 'w')
14809 size = "WORD";
14810 else if (code == 'k')
14811 size = "DWORD";
14812 else if (code == 'q')
14813 size = "QWORD";
14814 else if (code == 'x')
14815 size = "XMMWORD";
14816
14817 fputs (size, file);
14818 fputs (" PTR ", file);
14819 }
14820
14821 x = XEXP (x, 0);
14822 /* Avoid (%rip) for call operands. */
14823 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14824 && !CONST_INT_P (x))
14825 output_addr_const (file, x);
14826 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14827 output_operand_lossage ("invalid constraints for operand");
14828 else
14829 output_address (x);
14830 }
14831
14832 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14833 {
14834 REAL_VALUE_TYPE r;
14835 long l;
14836
14837 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14838 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14839
14840 if (ASSEMBLER_DIALECT == ASM_ATT)
14841 putc ('$', file);
14842 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14843 if (code == 'q')
14844 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14845 else
14846 fprintf (file, "0x%08x", (unsigned int) l);
14847 }
14848
14849 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14850 {
14851 REAL_VALUE_TYPE r;
14852 long l[2];
14853
14854 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14855 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14856
14857 if (ASSEMBLER_DIALECT == ASM_ATT)
14858 putc ('$', file);
14859 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14860 }
14861
14862 /* These float cases don't actually occur as immediate operands. */
14863 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14864 {
14865 char dstr[30];
14866
14867 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14868 fputs (dstr, file);
14869 }
14870
14871 else
14872 {
14873 /* We have patterns that allow zero sets of memory, for instance.
14874 In 64-bit mode, we should probably support all 8-byte vectors,
14875 since we can in fact encode that into an immediate. */
14876 if (GET_CODE (x) == CONST_VECTOR)
14877 {
14878 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14879 x = const0_rtx;
14880 }
14881
14882 if (code != 'P' && code != 'p')
14883 {
14884 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14885 {
14886 if (ASSEMBLER_DIALECT == ASM_ATT)
14887 putc ('$', file);
14888 }
14889 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14890 || GET_CODE (x) == LABEL_REF)
14891 {
14892 if (ASSEMBLER_DIALECT == ASM_ATT)
14893 putc ('$', file);
14894 else
14895 fputs ("OFFSET FLAT:", file);
14896 }
14897 }
14898 if (CONST_INT_P (x))
14899 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14900 else if (flag_pic || MACHOPIC_INDIRECT)
14901 output_pic_addr_const (file, x, code);
14902 else
14903 output_addr_const (file, x);
14904 }
14905 }
14906
14907 static bool
14908 ix86_print_operand_punct_valid_p (unsigned char code)
14909 {
14910 return (code == '@' || code == '*' || code == '+' || code == '&'
14911 || code == ';' || code == '~' || code == '^');
14912 }
14913 \f
14914 /* Print a memory operand whose address is ADDR. */
14915
14916 static void
14917 ix86_print_operand_address (FILE *file, rtx addr)
14918 {
14919 struct ix86_address parts;
14920 rtx base, index, disp;
14921 int scale;
14922 int ok;
14923 bool vsib = false;
14924 int code = 0;
14925
14926 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14927 {
14928 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14929 gcc_assert (parts.index == NULL_RTX);
14930 parts.index = XVECEXP (addr, 0, 1);
14931 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14932 addr = XVECEXP (addr, 0, 0);
14933 vsib = true;
14934 }
14935 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14936 {
14937 gcc_assert (TARGET_64BIT);
14938 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14939 code = 'q';
14940 }
14941 else
14942 ok = ix86_decompose_address (addr, &parts);
14943
14944 gcc_assert (ok);
14945
14946 if (parts.base && GET_CODE (parts.base) == SUBREG)
14947 {
14948 rtx tmp = SUBREG_REG (parts.base);
14949 parts.base = simplify_subreg (GET_MODE (parts.base),
14950 tmp, GET_MODE (tmp), 0);
14951 gcc_assert (parts.base != NULL_RTX);
14952 }
14953
14954 if (parts.index && GET_CODE (parts.index) == SUBREG)
14955 {
14956 rtx tmp = SUBREG_REG (parts.index);
14957 parts.index = simplify_subreg (GET_MODE (parts.index),
14958 tmp, GET_MODE (tmp), 0);
14959 gcc_assert (parts.index != NULL_RTX);
14960 }
14961
14962 base = parts.base;
14963 index = parts.index;
14964 disp = parts.disp;
14965 scale = parts.scale;
14966
14967 switch (parts.seg)
14968 {
14969 case SEG_DEFAULT:
14970 break;
14971 case SEG_FS:
14972 case SEG_GS:
14973 if (ASSEMBLER_DIALECT == ASM_ATT)
14974 putc ('%', file);
14975 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14976 break;
14977 default:
14978 gcc_unreachable ();
14979 }
14980
14981 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14982 if (TARGET_64BIT && !base && !index)
14983 {
14984 rtx symbol = disp;
14985
14986 if (GET_CODE (disp) == CONST
14987 && GET_CODE (XEXP (disp, 0)) == PLUS
14988 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14989 symbol = XEXP (XEXP (disp, 0), 0);
14990
14991 if (GET_CODE (symbol) == LABEL_REF
14992 || (GET_CODE (symbol) == SYMBOL_REF
14993 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14994 base = pc_rtx;
14995 }
14996 if (!base && !index)
14997 {
14998 /* A displacement-only address requires special attention. */
14999
15000 if (CONST_INT_P (disp))
15001 {
15002 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15003 fputs ("ds:", file);
15004 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15005 }
15006 else if (flag_pic)
15007 output_pic_addr_const (file, disp, 0);
15008 else
15009 output_addr_const (file, disp);
15010 }
15011 else
15012 {
15013 /* Print SImode register names to force addr32 prefix. */
15014 if (SImode_address_operand (addr, VOIDmode))
15015 {
15016 #ifdef ENABLE_CHECKING
15017 gcc_assert (TARGET_64BIT);
15018 switch (GET_CODE (addr))
15019 {
15020 case SUBREG:
15021 gcc_assert (GET_MODE (addr) == SImode);
15022 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15023 break;
15024 case ZERO_EXTEND:
15025 case AND:
15026 gcc_assert (GET_MODE (addr) == DImode);
15027 break;
15028 default:
15029 gcc_unreachable ();
15030 }
15031 #endif
15032 gcc_assert (!code);
15033 code = 'l';
15034 }
15035
15036 if (ASSEMBLER_DIALECT == ASM_ATT)
15037 {
15038 if (disp)
15039 {
15040 if (flag_pic)
15041 output_pic_addr_const (file, disp, 0);
15042 else if (GET_CODE (disp) == LABEL_REF)
15043 output_asm_label (disp);
15044 else
15045 output_addr_const (file, disp);
15046 }
15047
15048 putc ('(', file);
15049 if (base)
15050 print_reg (base, code, file);
15051 if (index)
15052 {
15053 putc (',', file);
15054 print_reg (index, vsib ? 0 : code, file);
15055 if (scale != 1 || vsib)
15056 fprintf (file, ",%d", scale);
15057 }
15058 putc (')', file);
15059 }
15060 else
15061 {
15062 rtx offset = NULL_RTX;
15063
15064 if (disp)
15065 {
15066 /* Pull out the offset of a symbol; print any symbol itself. */
15067 if (GET_CODE (disp) == CONST
15068 && GET_CODE (XEXP (disp, 0)) == PLUS
15069 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15070 {
15071 offset = XEXP (XEXP (disp, 0), 1);
15072 disp = gen_rtx_CONST (VOIDmode,
15073 XEXP (XEXP (disp, 0), 0));
15074 }
15075
15076 if (flag_pic)
15077 output_pic_addr_const (file, disp, 0);
15078 else if (GET_CODE (disp) == LABEL_REF)
15079 output_asm_label (disp);
15080 else if (CONST_INT_P (disp))
15081 offset = disp;
15082 else
15083 output_addr_const (file, disp);
15084 }
15085
15086 putc ('[', file);
15087 if (base)
15088 {
15089 print_reg (base, code, file);
15090 if (offset)
15091 {
15092 if (INTVAL (offset) >= 0)
15093 putc ('+', file);
15094 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15095 }
15096 }
15097 else if (offset)
15098 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15099 else
15100 putc ('0', file);
15101
15102 if (index)
15103 {
15104 putc ('+', file);
15105 print_reg (index, vsib ? 0 : code, file);
15106 if (scale != 1 || vsib)
15107 fprintf (file, "*%d", scale);
15108 }
15109 putc (']', file);
15110 }
15111 }
15112 }
15113
15114 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15115
15116 static bool
15117 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15118 {
15119 rtx op;
15120
15121 if (GET_CODE (x) != UNSPEC)
15122 return false;
15123
15124 op = XVECEXP (x, 0, 0);
15125 switch (XINT (x, 1))
15126 {
15127 case UNSPEC_GOTTPOFF:
15128 output_addr_const (file, op);
15129 /* FIXME: This might be @TPOFF in Sun ld. */
15130 fputs ("@gottpoff", file);
15131 break;
15132 case UNSPEC_TPOFF:
15133 output_addr_const (file, op);
15134 fputs ("@tpoff", file);
15135 break;
15136 case UNSPEC_NTPOFF:
15137 output_addr_const (file, op);
15138 if (TARGET_64BIT)
15139 fputs ("@tpoff", file);
15140 else
15141 fputs ("@ntpoff", file);
15142 break;
15143 case UNSPEC_DTPOFF:
15144 output_addr_const (file, op);
15145 fputs ("@dtpoff", file);
15146 break;
15147 case UNSPEC_GOTNTPOFF:
15148 output_addr_const (file, op);
15149 if (TARGET_64BIT)
15150 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15151 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15152 else
15153 fputs ("@gotntpoff", file);
15154 break;
15155 case UNSPEC_INDNTPOFF:
15156 output_addr_const (file, op);
15157 fputs ("@indntpoff", file);
15158 break;
15159 #if TARGET_MACHO
15160 case UNSPEC_MACHOPIC_OFFSET:
15161 output_addr_const (file, op);
15162 putc ('-', file);
15163 machopic_output_function_base_name (file);
15164 break;
15165 #endif
15166
15167 case UNSPEC_STACK_CHECK:
15168 {
15169 int offset;
15170
15171 gcc_assert (flag_split_stack);
15172
15173 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15174 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15175 #else
15176 gcc_unreachable ();
15177 #endif
15178
15179 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15180 }
15181 break;
15182
15183 default:
15184 return false;
15185 }
15186
15187 return true;
15188 }
15189 \f
15190 /* Split one or more double-mode RTL references into pairs of half-mode
15191 references. The RTL can be REG, offsettable MEM, integer constant, or
15192 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15193 split and "num" is its length. lo_half and hi_half are output arrays
15194 that parallel "operands". */
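/* For example (illustrative): a DImode MEM operand is split with
   adjust_address into two SImode MEMs at offsets 0 and 4, while a DImode
   constant is split with simplify_gen_subreg into its low and high
   32-bit words.  */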
15195
15196 void
15197 split_double_mode (enum machine_mode mode, rtx operands[],
15198 int num, rtx lo_half[], rtx hi_half[])
15199 {
15200 enum machine_mode half_mode;
15201 unsigned int byte;
15202
15203 switch (mode)
15204 {
15205 case TImode:
15206 half_mode = DImode;
15207 break;
15208 case DImode:
15209 half_mode = SImode;
15210 break;
15211 default:
15212 gcc_unreachable ();
15213 }
15214
15215 byte = GET_MODE_SIZE (half_mode);
15216
15217 while (num--)
15218 {
15219 rtx op = operands[num];
15220
15221 /* simplify_subreg refuses to split volatile memory references,
15222 but we still have to handle them. */
15223 if (MEM_P (op))
15224 {
15225 lo_half[num] = adjust_address (op, half_mode, 0);
15226 hi_half[num] = adjust_address (op, half_mode, byte);
15227 }
15228 else
15229 {
15230 lo_half[num] = simplify_gen_subreg (half_mode, op,
15231 GET_MODE (op) == VOIDmode
15232 ? mode : GET_MODE (op), 0);
15233 hi_half[num] = simplify_gen_subreg (half_mode, op,
15234 GET_MODE (op) == VOIDmode
15235 ? mode : GET_MODE (op), byte);
15236 }
15237 }
15238 }
15239 \f
15240 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15241 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15242 is the expression of the binary operation. The output may either be
15243 emitted here, or returned to the caller, like all output_* functions.
15244
15245 There is no guarantee that the operands are the same mode, as they
15246 might be within FLOAT or FLOAT_EXTEND expressions. */
15247
15248 #ifndef SYSV386_COMPAT
15249 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15250 wants to fix the assemblers because that causes incompatibility
15251 with gcc. No-one wants to fix gcc because that causes
15252 incompatibility with assemblers... You can use the option of
15253 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15254 #define SYSV386_COMPAT 1
15255 #endif
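/* For instance, in the MINUS/DIV handling below the SYSV386_COMPAT
   templates pair "p\t%0, %2" (AT&T) with "rp\t%2, %0" (Intel), so the
   reversed fsub/fdiv direction of AT&T-derived assemblers is compensated
   for in the AT&T alternative only.  */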
15256
15257 const char *
15258 output_387_binary_op (rtx insn, rtx *operands)
15259 {
15260 static char buf[40];
15261 const char *p;
15262 const char *ssep;
15263 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15264
15265 #ifdef ENABLE_CHECKING
15266 /* Even if we do not want to check the inputs, this documents input
15267 constraints. Which helps in understanding the following code. */
15268 if (STACK_REG_P (operands[0])
15269 && ((REG_P (operands[1])
15270 && REGNO (operands[0]) == REGNO (operands[1])
15271 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15272 || (REG_P (operands[2])
15273 && REGNO (operands[0]) == REGNO (operands[2])
15274 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15275 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15276 ; /* ok */
15277 else
15278 gcc_assert (is_sse);
15279 #endif
15280
15281 switch (GET_CODE (operands[3]))
15282 {
15283 case PLUS:
15284 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15285 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15286 p = "fiadd";
15287 else
15288 p = "fadd";
15289 ssep = "vadd";
15290 break;
15291
15292 case MINUS:
15293 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15294 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15295 p = "fisub";
15296 else
15297 p = "fsub";
15298 ssep = "vsub";
15299 break;
15300
15301 case MULT:
15302 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15303 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15304 p = "fimul";
15305 else
15306 p = "fmul";
15307 ssep = "vmul";
15308 break;
15309
15310 case DIV:
15311 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15312 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15313 p = "fidiv";
15314 else
15315 p = "fdiv";
15316 ssep = "vdiv";
15317 break;
15318
15319 default:
15320 gcc_unreachable ();
15321 }
15322
15323 if (is_sse)
15324 {
15325 if (TARGET_AVX)
15326 {
15327 strcpy (buf, ssep);
15328 if (GET_MODE (operands[0]) == SFmode)
15329 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15330 else
15331 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15332 }
15333 else
15334 {
15335 strcpy (buf, ssep + 1);
15336 if (GET_MODE (operands[0]) == SFmode)
15337 strcat (buf, "ss\t{%2, %0|%0, %2}");
15338 else
15339 strcat (buf, "sd\t{%2, %0|%0, %2}");
15340 }
15341 return buf;
15342 }
15343 strcpy (buf, p);
15344
15345 switch (GET_CODE (operands[3]))
15346 {
15347 case MULT:
15348 case PLUS:
15349 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15350 {
15351 rtx temp = operands[2];
15352 operands[2] = operands[1];
15353 operands[1] = temp;
15354 }
15355
15356 /* We know operands[0] == operands[1]. */
15357
15358 if (MEM_P (operands[2]))
15359 {
15360 p = "%Z2\t%2";
15361 break;
15362 }
15363
15364 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15365 {
15366 if (STACK_TOP_P (operands[0]))
15367 /* How is it that we are storing to a dead operand[2]?
15368 Well, presumably operands[1] is dead too. We can't
15369 store the result to st(0) as st(0) gets popped on this
15370 instruction. Instead store to operands[2] (which I
15371 think has to be st(1)). st(1) will be popped later.
15372 gcc <= 2.8.1 didn't have this check and generated
15373 assembly code that the Unixware assembler rejected. */
15374 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15375 else
15376 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15377 break;
15378 }
15379
15380 if (STACK_TOP_P (operands[0]))
15381 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15382 else
15383 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15384 break;
15385
15386 case MINUS:
15387 case DIV:
15388 if (MEM_P (operands[1]))
15389 {
15390 p = "r%Z1\t%1";
15391 break;
15392 }
15393
15394 if (MEM_P (operands[2]))
15395 {
15396 p = "%Z2\t%2";
15397 break;
15398 }
15399
15400 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15401 {
15402 #if SYSV386_COMPAT
15403 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15404 derived assemblers, confusingly reverse the direction of
15405 the operation for fsub{r} and fdiv{r} when the
15406 destination register is not st(0). The Intel assembler
15407 doesn't have this brain damage. Read !SYSV386_COMPAT to
15408 figure out what the hardware really does. */
15409 if (STACK_TOP_P (operands[0]))
15410 p = "{p\t%0, %2|rp\t%2, %0}";
15411 else
15412 p = "{rp\t%2, %0|p\t%0, %2}";
15413 #else
15414 if (STACK_TOP_P (operands[0]))
15415 /* As above for fmul/fadd, we can't store to st(0). */
15416 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15417 else
15418 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15419 #endif
15420 break;
15421 }
15422
15423 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15424 {
15425 #if SYSV386_COMPAT
15426 if (STACK_TOP_P (operands[0]))
15427 p = "{rp\t%0, %1|p\t%1, %0}";
15428 else
15429 p = "{p\t%1, %0|rp\t%0, %1}";
15430 #else
15431 if (STACK_TOP_P (operands[0]))
15432 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15433 else
15434 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15435 #endif
15436 break;
15437 }
15438
15439 if (STACK_TOP_P (operands[0]))
15440 {
15441 if (STACK_TOP_P (operands[1]))
15442 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15443 else
15444 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15445 break;
15446 }
15447 else if (STACK_TOP_P (operands[1]))
15448 {
15449 #if SYSV386_COMPAT
15450 p = "{\t%1, %0|r\t%0, %1}";
15451 #else
15452 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15453 #endif
15454 }
15455 else
15456 {
15457 #if SYSV386_COMPAT
15458 p = "{r\t%2, %0|\t%0, %2}";
15459 #else
15460 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15461 #endif
15462 }
15463 break;
15464
15465 default:
15466 gcc_unreachable ();
15467 }
15468
15469 strcat (buf, p);
15470 return buf;
15471 }
15472
15473 /* Return the mode needed for ENTITY in the optimize_mode_switching pass. */
15474
15475 int
15476 ix86_mode_needed (int entity, rtx insn)
15477 {
15478 enum attr_i387_cw mode;
15479
15480 /* The mode UNINITIALIZED is used to store the control word after a
15481 function call or ASM pattern. The mode ANY specifies that the
15482 function has no requirements on the control word and makes no
15483 changes in the bits we are interested in. */
15484
15485 if (CALL_P (insn)
15486 || (NONJUMP_INSN_P (insn)
15487 && (asm_noperands (PATTERN (insn)) >= 0
15488 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15489 return I387_CW_UNINITIALIZED;
15490
15491 if (recog_memoized (insn) < 0)
15492 return I387_CW_ANY;
15493
15494 mode = get_attr_i387_cw (insn);
15495
15496 switch (entity)
15497 {
15498 case I387_TRUNC:
15499 if (mode == I387_CW_TRUNC)
15500 return mode;
15501 break;
15502
15503 case I387_FLOOR:
15504 if (mode == I387_CW_FLOOR)
15505 return mode;
15506 break;
15507
15508 case I387_CEIL:
15509 if (mode == I387_CW_CEIL)
15510 return mode;
15511 break;
15512
15513 case I387_MASK_PM:
15514 if (mode == I387_CW_MASK_PM)
15515 return mode;
15516 break;
15517
15518 default:
15519 gcc_unreachable ();
15520 }
15521
15522 return I387_CW_ANY;
15523 }
15524
15525 /* Output code to initialize the control word copies used by the trunc?f?i
15526 and rounding patterns. MODE selects which modified control word
15527 (truncation, floor, ceiling or precision-mask) to store. */
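/* In the x87 control word, bits 10-11 form the rounding-control field
   (00 = to nearest, 01 = down, 10 = up, 11 = toward zero) and bit 5 masks
   the precision exception, hence the 0x0400, 0x0800, 0x0c00 and 0x0020
   constants used below.  */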
15528
15529 void
15530 emit_i387_cw_initialization (int mode)
15531 {
15532 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15533 rtx new_mode;
15534
15535 enum ix86_stack_slot slot;
15536
15537 rtx reg = gen_reg_rtx (HImode);
15538
15539 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15540 emit_move_insn (reg, copy_rtx (stored_mode));
15541
15542 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15543 || optimize_function_for_size_p (cfun))
15544 {
15545 switch (mode)
15546 {
15547 case I387_CW_TRUNC:
15548 /* round toward zero (truncate) */
15549 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15550 slot = SLOT_CW_TRUNC;
15551 break;
15552
15553 case I387_CW_FLOOR:
15554 /* round down toward -oo */
15555 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15556 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15557 slot = SLOT_CW_FLOOR;
15558 break;
15559
15560 case I387_CW_CEIL:
15561 /* round up toward +oo */
15562 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15563 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15564 slot = SLOT_CW_CEIL;
15565 break;
15566
15567 case I387_CW_MASK_PM:
15568 /* mask precision exception for nearbyint() */
15569 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15570 slot = SLOT_CW_MASK_PM;
15571 break;
15572
15573 default:
15574 gcc_unreachable ();
15575 }
15576 }
15577 else
15578 {
15579 switch (mode)
15580 {
15581 case I387_CW_TRUNC:
15582 /* round toward zero (truncate) */
15583 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15584 slot = SLOT_CW_TRUNC;
15585 break;
15586
15587 case I387_CW_FLOOR:
15588 /* round down toward -oo */
15589 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15590 slot = SLOT_CW_FLOOR;
15591 break;
15592
15593 case I387_CW_CEIL:
15594 /* round up toward +oo */
15595 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15596 slot = SLOT_CW_CEIL;
15597 break;
15598
15599 case I387_CW_MASK_PM:
15600 /* mask precision exception for nearbyint() */
15601 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15602 slot = SLOT_CW_MASK_PM;
15603 break;
15604
15605 default:
15606 gcc_unreachable ();
15607 }
15608 }
15609
15610 gcc_assert (slot < MAX_386_STACK_LOCALS);
15611
15612 new_mode = assign_386_stack_local (HImode, slot);
15613 emit_move_insn (new_mode, reg);
15614 }
15615
15616 /* Output code for INSN to convert a float to a signed int. OPERANDS
15617 are the insn operands. The output may be [HSD]Imode and the input
15618 operand may be [SDX]Fmode. */
15619
15620 const char *
15621 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15622 {
15623 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15624 int dimode_p = GET_MODE (operands[0]) == DImode;
15625 int round_mode = get_attr_i387_cw (insn);
15626
15627 /* Jump through a hoop or two for DImode, since the hardware has no
15628 non-popping instruction. We used to do this a different way, but
15629 that was somewhat fragile and broke with post-reload splitters. */
15630 if ((dimode_p || fisttp) && !stack_top_dies)
15631 output_asm_insn ("fld\t%y1", operands);
15632
15633 gcc_assert (STACK_TOP_P (operands[1]));
15634 gcc_assert (MEM_P (operands[0]));
15635 gcc_assert (GET_MODE (operands[1]) != TFmode);
15636
15637 if (fisttp)
15638 output_asm_insn ("fisttp%Z0\t%0", operands);
15639 else
15640 {
15641 if (round_mode != I387_CW_ANY)
15642 output_asm_insn ("fldcw\t%3", operands);
15643 if (stack_top_dies || dimode_p)
15644 output_asm_insn ("fistp%Z0\t%0", operands);
15645 else
15646 output_asm_insn ("fist%Z0\t%0", operands);
15647 if (round_mode != I387_CW_ANY)
15648 output_asm_insn ("fldcw\t%2", operands);
15649 }
15650
15651 return "";
15652 }
15653
15654 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15655 have the values zero or one, indicates the ffreep insn's operand
15656 from the OPERANDS array. */
15657
15658 static const char *
15659 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15660 {
15661 if (TARGET_USE_FFREEP)
15662 #ifdef HAVE_AS_IX86_FFREEP
15663 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15664 #else
15665 {
15666 static char retval[32];
15667 int regno = REGNO (operands[opno]);
15668
15669 gcc_assert (STACK_REGNO_P (regno));
15670
15671 regno -= FIRST_STACK_REG;
15672
15673 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15674 return retval;
15675 }
15676 #endif
15677
15678 return opno ? "fstp\t%y1" : "fstp\t%y0";
15679 }
15680
15681
15682 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15683 should be used. UNORDERED_P is true when fucom should be used. */
15684
15685 const char *
15686 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15687 {
15688 int stack_top_dies;
15689 rtx cmp_op0, cmp_op1;
15690 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15691
15692 if (eflags_p)
15693 {
15694 cmp_op0 = operands[0];
15695 cmp_op1 = operands[1];
15696 }
15697 else
15698 {
15699 cmp_op0 = operands[1];
15700 cmp_op1 = operands[2];
15701 }
15702
15703 if (is_sse)
15704 {
15705 if (GET_MODE (operands[0]) == SFmode)
15706 if (unordered_p)
15707 return "%vucomiss\t{%1, %0|%0, %1}";
15708 else
15709 return "%vcomiss\t{%1, %0|%0, %1}";
15710 else
15711 if (unordered_p)
15712 return "%vucomisd\t{%1, %0|%0, %1}";
15713 else
15714 return "%vcomisd\t{%1, %0|%0, %1}";
15715 }
15716
15717 gcc_assert (STACK_TOP_P (cmp_op0));
15718
15719 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15720
15721 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15722 {
15723 if (stack_top_dies)
15724 {
15725 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15726 return output_387_ffreep (operands, 1);
15727 }
15728 else
15729 return "ftst\n\tfnstsw\t%0";
15730 }
15731
15732 if (STACK_REG_P (cmp_op1)
15733 && stack_top_dies
15734 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15735 && REGNO (cmp_op1) != FIRST_STACK_REG)
15736 {
15737 /* If the top of the 387 stack dies, and the other operand is also
15738 a stack register that dies, then this must be an `fcompp' float
15739 compare. */
15740
15741 if (eflags_p)
15742 {
15743 /* There is no double popping fcomi variant. Fortunately,
15744 eflags is immune from the fstp's cc clobbering. */
15745 if (unordered_p)
15746 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15747 else
15748 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15749 return output_387_ffreep (operands, 0);
15750 }
15751 else
15752 {
15753 if (unordered_p)
15754 return "fucompp\n\tfnstsw\t%0";
15755 else
15756 return "fcompp\n\tfnstsw\t%0";
15757 }
15758 }
15759 else
15760 {
15761 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15762
15763 static const char * const alt[16] =
15764 {
15765 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15766 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15767 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15768 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15769
15770 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15771 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15772 NULL,
15773 NULL,
15774
15775 "fcomi\t{%y1, %0|%0, %y1}",
15776 "fcomip\t{%y1, %0|%0, %y1}",
15777 "fucomi\t{%y1, %0|%0, %y1}",
15778 "fucomip\t{%y1, %0|%0, %y1}",
15779
15780 NULL,
15781 NULL,
15782 NULL,
15783 NULL
15784 };
15785
15786 int mask;
15787 const char *ret;
15788
15789 mask = eflags_p << 3;
15790 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15791 mask |= unordered_p << 1;
15792 mask |= stack_top_dies;
15793
15794 gcc_assert (mask < 16);
15795 ret = alt[mask];
15796 gcc_assert (ret);
15797
15798 return ret;
15799 }
15800 }
15801
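/* Output an element of a dispatch table: an absolute reference to the
   internal label numbered VALUE, emitted as ASM_LONG, or as ASM_QUAD on
   LP64 targets.  */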
15802 void
15803 ix86_output_addr_vec_elt (FILE *file, int value)
15804 {
15805 const char *directive = ASM_LONG;
15806
15807 #ifdef ASM_QUAD
15808 if (TARGET_LP64)
15809 directive = ASM_QUAD;
15810 #else
15811 gcc_assert (!TARGET_64BIT);
15812 #endif
15813
15814 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15815 }
15816
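/* Output an element of a PIC-friendly dispatch table: a difference between
   the labels numbered VALUE and REL where that is directly representable
   (64-bit and VxWorks RTP), otherwise an @GOTOFF or GOT-relative reference
   to the label numbered VALUE.  */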
15817 void
15818 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15819 {
15820 const char *directive = ASM_LONG;
15821
15822 #ifdef ASM_QUAD
15823 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15824 directive = ASM_QUAD;
15825 #else
15826 gcc_assert (!TARGET_64BIT);
15827 #endif
15828 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15829 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15830 fprintf (file, "%s%s%d-%s%d\n",
15831 directive, LPREFIX, value, LPREFIX, rel);
15832 else if (HAVE_AS_GOTOFF_IN_DATA)
15833 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15834 #if TARGET_MACHO
15835 else if (TARGET_MACHO)
15836 {
15837 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15838 machopic_output_function_base_name (file);
15839 putc ('\n', file);
15840 }
15841 #endif
15842 else
15843 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15844 GOT_SYMBOL_NAME, LPREFIX, value);
15845 }
15846 \f
15847 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15848 for the target. */
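/* The xor form is shorter but clobbers the flags, hence the FLAGS_REG
   clobber attached below; the plain "mov $0" form is used only when
   TARGET_USE_MOV0 is set and the insn is not being optimized for speed.  */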
15849
15850 void
15851 ix86_expand_clear (rtx dest)
15852 {
15853 rtx tmp;
15854
15855 /* We play register width games, which are only valid after reload. */
15856 gcc_assert (reload_completed);
15857
15858 /* Avoid HImode and its attendant prefix byte. */
15859 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15860 dest = gen_rtx_REG (SImode, REGNO (dest));
15861 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15862
15863 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15864 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15865 {
15866 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15867 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15868 }
15869
15870 emit_insn (tmp);
15871 }
15872
15873 /* X is an unchanging MEM. If it is a constant pool reference, return
15874 the constant pool rtx, else NULL. */
15875
15876 rtx
15877 maybe_get_pool_constant (rtx x)
15878 {
15879 x = ix86_delegitimize_address (XEXP (x, 0));
15880
15881 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15882 return get_pool_constant (x);
15883
15884 return NULL_RTX;
15885 }
15886
15887 void
15888 ix86_expand_move (enum machine_mode mode, rtx operands[])
15889 {
15890 rtx op0, op1;
15891 enum tls_model model;
15892
15893 op0 = operands[0];
15894 op1 = operands[1];
15895
15896 if (GET_CODE (op1) == SYMBOL_REF)
15897 {
15898 model = SYMBOL_REF_TLS_MODEL (op1);
15899 if (model)
15900 {
15901 op1 = legitimize_tls_address (op1, model, true);
15902 op1 = force_operand (op1, op0);
15903 if (op1 == op0)
15904 return;
15905 if (GET_MODE (op1) != mode)
15906 op1 = convert_to_mode (mode, op1, 1);
15907 }
15908 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15909 && SYMBOL_REF_DLLIMPORT_P (op1))
15910 op1 = legitimize_dllimport_symbol (op1, false);
15911 }
15912 else if (GET_CODE (op1) == CONST
15913 && GET_CODE (XEXP (op1, 0)) == PLUS
15914 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15915 {
15916 rtx addend = XEXP (XEXP (op1, 0), 1);
15917 rtx symbol = XEXP (XEXP (op1, 0), 0);
15918 rtx tmp = NULL;
15919
15920 model = SYMBOL_REF_TLS_MODEL (symbol);
15921 if (model)
15922 tmp = legitimize_tls_address (symbol, model, true);
15923 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15924 && SYMBOL_REF_DLLIMPORT_P (symbol))
15925 tmp = legitimize_dllimport_symbol (symbol, true);
15926
15927 if (tmp)
15928 {
15929 tmp = force_operand (tmp, NULL);
15930 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15931 op0, 1, OPTAB_DIRECT);
15932 if (tmp == op0)
15933 return;
15934 if (GET_MODE (tmp) != mode)
15935 op1 = convert_to_mode (mode, tmp, 1);
15936 }
15937 }
15938
15939 if ((flag_pic || MACHOPIC_INDIRECT)
15940 && symbolic_operand (op1, mode))
15941 {
15942 if (TARGET_MACHO && !TARGET_64BIT)
15943 {
15944 #if TARGET_MACHO
15945 /* dynamic-no-pic */
15946 if (MACHOPIC_INDIRECT)
15947 {
15948 rtx temp = ((reload_in_progress
15949 || ((op0 && REG_P (op0))
15950 && mode == Pmode))
15951 ? op0 : gen_reg_rtx (Pmode));
15952 op1 = machopic_indirect_data_reference (op1, temp);
15953 if (MACHOPIC_PURE)
15954 op1 = machopic_legitimize_pic_address (op1, mode,
15955 temp == op1 ? 0 : temp);
15956 }
15957 if (op0 != op1 && GET_CODE (op0) != MEM)
15958 {
15959 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15960 emit_insn (insn);
15961 return;
15962 }
15963 if (GET_CODE (op0) == MEM)
15964 op1 = force_reg (Pmode, op1);
15965 else
15966 {
15967 rtx temp = op0;
15968 if (GET_CODE (temp) != REG)
15969 temp = gen_reg_rtx (Pmode);
15970 temp = legitimize_pic_address (op1, temp);
15971 if (temp == op0)
15972 return;
15973 op1 = temp;
15974 }
15975 /* dynamic-no-pic */
15976 #endif
15977 }
15978 else
15979 {
15980 if (MEM_P (op0))
15981 op1 = force_reg (mode, op1);
15982 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15983 {
15984 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15985 op1 = legitimize_pic_address (op1, reg);
15986 if (op0 == op1)
15987 return;
15988 if (GET_MODE (op1) != mode)
15989 op1 = convert_to_mode (mode, op1, 1);
15990 }
15991 }
15992 }
15993 else
15994 {
15995 if (MEM_P (op0)
15996 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15997 || !push_operand (op0, mode))
15998 && MEM_P (op1))
15999 op1 = force_reg (mode, op1);
16000
16001 if (push_operand (op0, mode)
16002 && ! general_no_elim_operand (op1, mode))
16003 op1 = copy_to_mode_reg (mode, op1);
16004
16005 /* Force large constants in 64bit compilation into registers
16006 to get them CSEed. */
16007 if (can_create_pseudo_p ()
16008 && (mode == DImode) && TARGET_64BIT
16009 && immediate_operand (op1, mode)
16010 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16011 && !register_operand (op0, mode)
16012 && optimize)
16013 op1 = copy_to_mode_reg (mode, op1);
16014
16015 if (can_create_pseudo_p ()
16016 && FLOAT_MODE_P (mode)
16017 && GET_CODE (op1) == CONST_DOUBLE)
16018 {
16019 /* If we are loading a floating point constant to a register,
16020 force the value to memory now, since we'll get better code
16021 out the back end. */
16022
16023 op1 = validize_mem (force_const_mem (mode, op1));
16024 if (!register_operand (op0, mode))
16025 {
16026 rtx temp = gen_reg_rtx (mode);
16027 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16028 emit_move_insn (op0, temp);
16029 return;
16030 }
16031 }
16032 }
16033
16034 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16035 }
16036
16037 void
16038 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16039 {
16040 rtx op0 = operands[0], op1 = operands[1];
16041 unsigned int align = GET_MODE_ALIGNMENT (mode);
16042
16043 /* Force constants other than zero into memory. We do not know how
16044 the instructions used to build constants modify the upper 64 bits
16045 of the register; once we have that information, we may be able
16046 to handle some of them more efficiently. */
16047 if (can_create_pseudo_p ()
16048 && register_operand (op0, mode)
16049 && (CONSTANT_P (op1)
16050 || (GET_CODE (op1) == SUBREG
16051 && CONSTANT_P (SUBREG_REG (op1))))
16052 && !standard_sse_constant_p (op1))
16053 op1 = validize_mem (force_const_mem (mode, op1));
16054
16055 /* We need to check memory alignment for SSE mode since attributes
16056 can make operands unaligned. */
16057 if (can_create_pseudo_p ()
16058 && SSE_REG_MODE_P (mode)
16059 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16060 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16061 {
16062 rtx tmp[2];
16063
16064 /* ix86_expand_vector_move_misalign() does not like constants ... */
16065 if (CONSTANT_P (op1)
16066 || (GET_CODE (op1) == SUBREG
16067 && CONSTANT_P (SUBREG_REG (op1))))
16068 op1 = validize_mem (force_const_mem (mode, op1));
16069
16070 /* ... nor both arguments in memory. */
16071 if (!register_operand (op0, mode)
16072 && !register_operand (op1, mode))
16073 op1 = force_reg (mode, op1);
16074
16075 tmp[0] = op0; tmp[1] = op1;
16076 ix86_expand_vector_move_misalign (mode, tmp);
16077 return;
16078 }
16079
16080 /* Make operand1 a register if it isn't already. */
16081 if (can_create_pseudo_p ()
16082 && !register_operand (op0, mode)
16083 && !register_operand (op1, mode))
16084 {
16085 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16086 return;
16087 }
16088
16089 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16090 }
16091
16092 /* Split 32-byte AVX unaligned load and store if needed. */
16093
16094 static void
16095 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16096 {
16097 rtx m;
16098 rtx (*extract) (rtx, rtx, rtx);
16099 rtx (*load_unaligned) (rtx, rtx);
16100 rtx (*store_unaligned) (rtx, rtx);
16101 enum machine_mode mode;
16102
16103 switch (GET_MODE (op0))
16104 {
16105 default:
16106 gcc_unreachable ();
16107 case V32QImode:
16108 extract = gen_avx_vextractf128v32qi;
16109 load_unaligned = gen_avx_loaddqu256;
16110 store_unaligned = gen_avx_storedqu256;
16111 mode = V16QImode;
16112 break;
16113 case V8SFmode:
16114 extract = gen_avx_vextractf128v8sf;
16115 load_unaligned = gen_avx_loadups256;
16116 store_unaligned = gen_avx_storeups256;
16117 mode = V4SFmode;
16118 break;
16119 case V4DFmode:
16120 extract = gen_avx_vextractf128v4df;
16121 load_unaligned = gen_avx_loadupd256;
16122 store_unaligned = gen_avx_storeupd256;
16123 mode = V2DFmode;
16124 break;
16125 }
16126
16127 if (MEM_P (op1))
16128 {
16129 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16130 {
16131 rtx r = gen_reg_rtx (mode);
16132 m = adjust_address (op1, mode, 0);
16133 emit_move_insn (r, m);
16134 m = adjust_address (op1, mode, 16);
16135 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16136 emit_move_insn (op0, r);
16137 }
16138 else
16139 emit_insn (load_unaligned (op0, op1));
16140 }
16141 else if (MEM_P (op0))
16142 {
16143 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16144 {
16145 m = adjust_address (op0, mode, 0);
16146 emit_insn (extract (m, op1, const0_rtx));
16147 m = adjust_address (op0, mode, 16);
16148 emit_insn (extract (m, op1, const1_rtx));
16149 }
16150 else
16151 emit_insn (store_unaligned (op0, op1));
16152 }
16153 else
16154 gcc_unreachable ();
16155 }
16156
16157 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16158 straight to ix86_expand_vector_move. */
16159 /* Code generation for scalar reg-reg moves of single and double precision data:
16160 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
16161 movaps reg, reg
16162 else
16163 movss reg, reg
16164 if (x86_sse_partial_reg_dependency == true)
16165 movapd reg, reg
16166 else
16167 movsd reg, reg
16168
16169 Code generation for scalar loads of double precision data:
16170 if (x86_sse_split_regs == true)
16171 movlpd mem, reg (gas syntax)
16172 else
16173 movsd mem, reg
16174
16175 Code generation for unaligned packed loads of single precision data
16176 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16177 if (x86_sse_unaligned_move_optimal)
16178 movups mem, reg
16179
16180 if (x86_sse_partial_reg_dependency == true)
16181 {
16182 xorps reg, reg
16183 movlps mem, reg
16184 movhps mem+8, reg
16185 }
16186 else
16187 {
16188 movlps mem, reg
16189 movhps mem+8, reg
16190 }
16191
16192 Code generation for unaligned packed loads of double precision data
16193 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16194 if (x86_sse_unaligned_move_optimal)
16195 movupd mem, reg
16196
16197 if (x86_sse_split_regs == true)
16198 {
16199 movlpd mem, reg
16200 movhpd mem+8, reg
16201 }
16202 else
16203 {
16204 movsd mem, reg
16205 movhpd mem+8, reg
16206 }
16207 */
16208
16209 void
16210 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16211 {
16212 rtx op0, op1, m;
16213
16214 op0 = operands[0];
16215 op1 = operands[1];
16216
16217 if (TARGET_AVX
16218 && GET_MODE_SIZE (mode) == 32)
16219 {
16220 switch (GET_MODE_CLASS (mode))
16221 {
16222 case MODE_VECTOR_INT:
16223 case MODE_INT:
16224 op0 = gen_lowpart (V32QImode, op0);
16225 op1 = gen_lowpart (V32QImode, op1);
16226 /* FALLTHRU */
16227
16228 case MODE_VECTOR_FLOAT:
16229 ix86_avx256_split_vector_move_misalign (op0, op1);
16230 break;
16231
16232 default:
16233 gcc_unreachable ();
16234 }
16235
16236 return;
16237 }
16238
16239 if (MEM_P (op1))
16240 {
16241 /* ??? If we have typed data, then it would appear that using
16242 movdqu is the only way to get unaligned data loaded with
16243 integer type. */
16244 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16245 {
16246 op0 = gen_lowpart (V16QImode, op0);
16247 op1 = gen_lowpart (V16QImode, op1);
16248 /* We will eventually emit movups based on insn attributes. */
16249 emit_insn (gen_sse2_loaddqu (op0, op1));
16250 }
16251 else if (TARGET_SSE2 && mode == V2DFmode)
16252 {
16253 rtx zero;
16254
16255 if (TARGET_AVX
16256 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16257 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16258 || optimize_function_for_size_p (cfun))
16259 {
16260 /* We will eventually emit movups based on insn attributes. */
16261 emit_insn (gen_sse2_loadupd (op0, op1));
16262 return;
16263 }
16264
16265 /* When SSE registers are split into halves, we can avoid
16266 writing to the top half twice. */
16267 if (TARGET_SSE_SPLIT_REGS)
16268 {
16269 emit_clobber (op0);
16270 zero = op0;
16271 }
16272 else
16273 {
16274 /* ??? Not sure about the best option for the Intel chips.
16275 The following would seem to satisfy; the register is
16276 entirely cleared, breaking the dependency chain. We
16277 then store to the upper half, with a dependency depth
16278 of one. A rumor has it that Intel recommends two movsd
16279 followed by an unpacklpd, but this is unconfirmed. And
16280 given that the dependency depth of the unpacklpd would
16281 still be one, I'm not sure why this would be better. */
16282 zero = CONST0_RTX (V2DFmode);
16283 }
16284
16285 m = adjust_address (op1, DFmode, 0);
16286 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16287 m = adjust_address (op1, DFmode, 8);
16288 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16289 }
16290 else
16291 {
16292 if (TARGET_AVX
16293 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16294 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16295 || optimize_function_for_size_p (cfun))
16296 {
16297 op0 = gen_lowpart (V4SFmode, op0);
16298 op1 = gen_lowpart (V4SFmode, op1);
16299 emit_insn (gen_sse_loadups (op0, op1));
16300 return;
16301 }
16302
16303 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16304 emit_move_insn (op0, CONST0_RTX (mode));
16305 else
16306 emit_clobber (op0);
16307
16308 if (mode != V4SFmode)
16309 op0 = gen_lowpart (V4SFmode, op0);
16310
16311 m = adjust_address (op1, V2SFmode, 0);
16312 emit_insn (gen_sse_loadlps (op0, op0, m));
16313 m = adjust_address (op1, V2SFmode, 8);
16314 emit_insn (gen_sse_loadhps (op0, op0, m));
16315 }
16316 }
16317 else if (MEM_P (op0))
16318 {
16319 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16320 {
16321 op0 = gen_lowpart (V16QImode, op0);
16322 op1 = gen_lowpart (V16QImode, op1);
16323 /* We will eventually emit movups based on insn attributes. */
16324 emit_insn (gen_sse2_storedqu (op0, op1));
16325 }
16326 else if (TARGET_SSE2 && mode == V2DFmode)
16327 {
16328 if (TARGET_AVX
16329 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16330 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16331 || optimize_function_for_size_p (cfun))
16332 /* We will eventually emit movups based on insn attributes. */
16333 emit_insn (gen_sse2_storeupd (op0, op1));
16334 else
16335 {
16336 m = adjust_address (op0, DFmode, 0);
16337 emit_insn (gen_sse2_storelpd (m, op1));
16338 m = adjust_address (op0, DFmode, 8);
16339 emit_insn (gen_sse2_storehpd (m, op1));
16340 }
16341 }
16342 else
16343 {
16344 if (mode != V4SFmode)
16345 op1 = gen_lowpart (V4SFmode, op1);
16346
16347 if (TARGET_AVX
16348 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16349 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16350 || optimize_function_for_size_p (cfun))
16351 {
16352 op0 = gen_lowpart (V4SFmode, op0);
16353 emit_insn (gen_sse_storeups (op0, op1));
16354 }
16355 else
16356 {
16357 m = adjust_address (op0, V2SFmode, 0);
16358 emit_insn (gen_sse_storelps (m, op1));
16359 m = adjust_address (op0, V2SFmode, 8);
16360 emit_insn (gen_sse_storehps (m, op1));
16361 }
16362 }
16363 }
16364 else
16365 gcc_unreachable ();
16366 }
16367
16368 /* Expand a push in MODE. This is some mode for which we do not support
16369 proper push instructions, at least from the registers that we expect
16370 the value to live in. */
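/* An illustrative sketch (not emitted literally): the expansion below just
   decrements the stack pointer by the mode size and stores X at the new
   stack top, e.g. for a 16-byte mode roughly
       sub $16, %rsp
       <store X to (%rsp)>
   using whatever move sequence MODE requires. */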
16371
16372 void
16373 ix86_expand_push (enum machine_mode mode, rtx x)
16374 {
16375 rtx tmp;
16376
16377 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16378 GEN_INT (-GET_MODE_SIZE (mode)),
16379 stack_pointer_rtx, 1, OPTAB_DIRECT);
16380 if (tmp != stack_pointer_rtx)
16381 emit_move_insn (stack_pointer_rtx, tmp);
16382
16383 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16384
16385 /* When we push an operand onto the stack, it has to be aligned at least
16386 at the function argument boundary. However, since we don't have
16387 the argument type, we can't determine the actual argument
16388 boundary. */
16389 emit_move_insn (tmp, x);
16390 }
16391
16392 /* Helper function of ix86_fixup_binary_operands to canonicalize
16393 operand order. Returns true if the operands should be swapped. */
16394
16395 static bool
16396 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16397 rtx operands[])
16398 {
16399 rtx dst = operands[0];
16400 rtx src1 = operands[1];
16401 rtx src2 = operands[2];
16402
16403 /* If the operation is not commutative, we can't do anything. */
16404 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16405 return false;
16406
16407 /* Highest priority is that src1 should match dst. */
16408 if (rtx_equal_p (dst, src1))
16409 return false;
16410 if (rtx_equal_p (dst, src2))
16411 return true;
16412
16413 /* Next highest priority is that immediate constants come second. */
16414 if (immediate_operand (src2, mode))
16415 return false;
16416 if (immediate_operand (src1, mode))
16417 return true;
16418
16419 /* Lowest priority is that memory references should come second. */
16420 if (MEM_P (src2))
16421 return false;
16422 if (MEM_P (src1))
16423 return true;
16424
16425 return false;
16426 }
16427
16428
16429 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16430 destination to use for the operation. If different from the true
16431 destination in operands[0], a copy operation will be required. */
16432
16433 rtx
16434 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16435 rtx operands[])
16436 {
16437 rtx dst = operands[0];
16438 rtx src1 = operands[1];
16439 rtx src2 = operands[2];
16440
16441 /* Canonicalize operand order. */
16442 if (ix86_swap_binary_operands_p (code, mode, operands))
16443 {
16444 rtx temp;
16445
16446 /* It is invalid to swap operands of different modes. */
16447 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16448
16449 temp = src1;
16450 src1 = src2;
16451 src2 = temp;
16452 }
16453
16454 /* Both source operands cannot be in memory. */
16455 if (MEM_P (src1) && MEM_P (src2))
16456 {
16457 /* Optimization: Only read from memory once. */
16458 if (rtx_equal_p (src1, src2))
16459 {
16460 src2 = force_reg (mode, src2);
16461 src1 = src2;
16462 }
16463 else
16464 src2 = force_reg (mode, src2);
16465 }
16466
16467 /* If the destination is memory, and we do not have matching source
16468 operands, do things in registers. */
16469 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16470 dst = gen_reg_rtx (mode);
16471
16472 /* Source 1 cannot be a constant. */
16473 if (CONSTANT_P (src1))
16474 src1 = force_reg (mode, src1);
16475
16476 /* Source 1 cannot be a non-matching memory. */
16477 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16478 src1 = force_reg (mode, src1);
16479
16480 /* Improve address combine. */
16481 if (code == PLUS
16482 && GET_MODE_CLASS (mode) == MODE_INT
16483 && MEM_P (src2))
16484 src2 = force_reg (mode, src2);
16485
16486 operands[1] = src1;
16487 operands[2] = src2;
16488 return dst;
16489 }
16490
16491 /* Similarly, but assume that the destination has already been
16492 set up properly. */
16493
16494 void
16495 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16496 enum machine_mode mode, rtx operands[])
16497 {
16498 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16499 gcc_assert (dst == operands[0]);
16500 }
16501
16502 /* Attempt to expand a binary operator. Make the expansion closer to the
16503 actual machine than just general_operand, which would allow 3 separate
16504 memory references (one output, two inputs) in a single insn. */
16505
16506 void
16507 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16508 rtx operands[])
16509 {
16510 rtx src1, src2, dst, op, clob;
16511
16512 dst = ix86_fixup_binary_operands (code, mode, operands);
16513 src1 = operands[1];
16514 src2 = operands[2];
16515
16516 /* Emit the instruction. */
16517
16518 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16519 if (reload_in_progress)
16520 {
16521 /* Reload doesn't know about the flags register, and doesn't know that
16522 it doesn't want to clobber it. We can only do this with PLUS. */
16523 gcc_assert (code == PLUS);
16524 emit_insn (op);
16525 }
16526 else if (reload_completed
16527 && code == PLUS
16528 && !rtx_equal_p (dst, src1))
16529 {
16530 /* This is going to be an LEA; avoid splitting it later. */
16531 emit_insn (op);
16532 }
16533 else
16534 {
16535 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16536 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16537 }
16538
16539 /* Fix up the destination if needed. */
16540 if (dst != operands[0])
16541 emit_move_insn (operands[0], dst);
16542 }
16543
16544 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16545 the given OPERANDS. */
16546
16547 void
16548 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16549 rtx operands[])
16550 {
16551 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16552 if (GET_CODE (operands[1]) == SUBREG)
16553 {
16554 op1 = operands[1];
16555 op2 = operands[2];
16556 }
16557 else if (GET_CODE (operands[2]) == SUBREG)
16558 {
16559 op1 = operands[2];
16560 op2 = operands[1];
16561 }
16562 /* Optimize (__m128i) d | (__m128i) e and similar code
16563 when d and e are float vectors into a float vector logical
16564 insn. In C/C++, without using intrinsics, there is no other way
16565 to express a vector logical operation on float vectors than
16566 to cast them temporarily to integer vectors. */
16567 if (op1
16568 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16569 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16570 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16571 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16572 && SUBREG_BYTE (op1) == 0
16573 && (GET_CODE (op2) == CONST_VECTOR
16574 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16575 && SUBREG_BYTE (op2) == 0))
16576 && can_create_pseudo_p ())
16577 {
16578 rtx dst;
16579 switch (GET_MODE (SUBREG_REG (op1)))
16580 {
16581 case V4SFmode:
16582 case V8SFmode:
16583 case V2DFmode:
16584 case V4DFmode:
16585 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16586 if (GET_CODE (op2) == CONST_VECTOR)
16587 {
16588 op2 = gen_lowpart (GET_MODE (dst), op2);
16589 op2 = force_reg (GET_MODE (dst), op2);
16590 }
16591 else
16592 {
16593 op1 = operands[1];
16594 op2 = SUBREG_REG (operands[2]);
16595 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16596 op2 = force_reg (GET_MODE (dst), op2);
16597 }
16598 op1 = SUBREG_REG (op1);
16599 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16600 op1 = force_reg (GET_MODE (dst), op1);
16601 emit_insn (gen_rtx_SET (VOIDmode, dst,
16602 gen_rtx_fmt_ee (code, GET_MODE (dst),
16603 op1, op2)));
16604 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16605 return;
16606 default:
16607 break;
16608 }
16609 }
16610 if (!nonimmediate_operand (operands[1], mode))
16611 operands[1] = force_reg (mode, operands[1]);
16612 if (!nonimmediate_operand (operands[2], mode))
16613 operands[2] = force_reg (mode, operands[2]);
16614 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16615 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16616 gen_rtx_fmt_ee (code, mode, operands[1],
16617 operands[2])));
16618 }
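
/* An illustrative example (not from the original source) of the cast
   pattern that ix86_expand_vector_logical_operator above looks for,
   written with GCC vector extensions:

     typedef float v4sf __attribute__ ((vector_size (16)));
     typedef long long v2di __attribute__ ((vector_size (16)));

     v4sf a, b;
     v2di r = (v2di) a | (v2di) b;

   Without the transformation the OR would be emitted as an integer-vector
   instruction on the reinterpreted values; with it, the logical operation
   is done directly on the float vectors. */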
16619
16620 /* Return TRUE or FALSE depending on whether the binary operator meets the
16621 appropriate constraints. */
16622
16623 bool
16624 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16625 rtx operands[3])
16626 {
16627 rtx dst = operands[0];
16628 rtx src1 = operands[1];
16629 rtx src2 = operands[2];
16630
16631 /* Both source operands cannot be in memory. */
16632 if (MEM_P (src1) && MEM_P (src2))
16633 return false;
16634
16635 /* Canonicalize operand order for commutative operators. */
16636 if (ix86_swap_binary_operands_p (code, mode, operands))
16637 {
16638 rtx temp = src1;
16639 src1 = src2;
16640 src2 = temp;
16641 }
16642
16643 /* If the destination is memory, we must have a matching source operand. */
16644 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16645 return false;
16646
16647 /* Source 1 cannot be a constant. */
16648 if (CONSTANT_P (src1))
16649 return false;
16650
16651 /* Source 1 cannot be a non-matching memory. */
16652 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16653 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16654 return (code == AND
16655 && (mode == HImode
16656 || mode == SImode
16657 || (TARGET_64BIT && mode == DImode))
16658 && satisfies_constraint_L (src2));
16659
16660 return true;
16661 }
16662
16663 /* Attempt to expand a unary operator. Make the expansion closer to the
16664 actual machine than just general_operand, which would allow 2 separate
16665 memory references (one output, one input) in a single insn. */
16666
16667 void
16668 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16669 rtx operands[])
16670 {
16671 int matching_memory;
16672 rtx src, dst, op, clob;
16673
16674 dst = operands[0];
16675 src = operands[1];
16676
16677 /* If the destination is memory, and we do not have matching source
16678 operands, do things in registers. */
16679 matching_memory = 0;
16680 if (MEM_P (dst))
16681 {
16682 if (rtx_equal_p (dst, src))
16683 matching_memory = 1;
16684 else
16685 dst = gen_reg_rtx (mode);
16686 }
16687
16688 /* When source operand is memory, destination must match. */
16689 if (MEM_P (src) && !matching_memory)
16690 src = force_reg (mode, src);
16691
16692 /* Emit the instruction. */
16693
16694 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16695 if (reload_in_progress || code == NOT)
16696 {
16697 /* Reload doesn't know about the flags register, and doesn't know that
16698 it doesn't want to clobber it. */
16699 gcc_assert (code == NOT);
16700 emit_insn (op);
16701 }
16702 else
16703 {
16704 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16705 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16706 }
16707
16708 /* Fix up the destination if needed. */
16709 if (dst != operands[0])
16710 emit_move_insn (operands[0], dst);
16711 }
16712
16713 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16714 divisor are within the range [0-255]. */
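/* An illustrative sketch of the control flow expanded below (assumed
   wording, derived from the code):

     if (((dividend | divisor) & ~0xff) == 0)
       use the 8-bit unsigned divide: AX / divisor -> AL = quotient,
                                                      AH = remainder;
     else
       use the original full-width signed/unsigned divide;

   The conditional jump is given a 50% branch probability since neither
   outcome is assumed to dominate. */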
16715
16716 void
16717 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16718 bool signed_p)
16719 {
16720 rtx end_label, qimode_label;
16721 rtx insn, div, mod;
16722 rtx scratch, tmp0, tmp1, tmp2;
16723 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16724 rtx (*gen_zero_extend) (rtx, rtx);
16725 rtx (*gen_test_ccno_1) (rtx, rtx);
16726
16727 switch (mode)
16728 {
16729 case SImode:
16730 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16731 gen_test_ccno_1 = gen_testsi_ccno_1;
16732 gen_zero_extend = gen_zero_extendqisi2;
16733 break;
16734 case DImode:
16735 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16736 gen_test_ccno_1 = gen_testdi_ccno_1;
16737 gen_zero_extend = gen_zero_extendqidi2;
16738 break;
16739 default:
16740 gcc_unreachable ();
16741 }
16742
16743 end_label = gen_label_rtx ();
16744 qimode_label = gen_label_rtx ();
16745
16746 scratch = gen_reg_rtx (mode);
16747
16748 /* Use 8bit unsigned divmod if dividend and divisor are within
16749 the range [0-255]. */
16750 emit_move_insn (scratch, operands[2]);
16751 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16752 scratch, 1, OPTAB_DIRECT);
16753 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16754 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16755 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16756 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16757 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16758 pc_rtx);
16759 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16760 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16761 JUMP_LABEL (insn) = qimode_label;
16762
16763 /* Generate the original signed/unsigned divmod. */
16764 div = gen_divmod4_1 (operands[0], operands[1],
16765 operands[2], operands[3]);
16766 emit_insn (div);
16767
16768 /* Branch to the end. */
16769 emit_jump_insn (gen_jump (end_label));
16770 emit_barrier ();
16771
16772 /* Generate 8bit unsigned divide. */
16773 emit_label (qimode_label);
16774 /* Don't use operands[0] for result of 8bit divide since not all
16775 registers support QImode ZERO_EXTRACT. */
16776 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16777 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16778 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16779 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16780
16781 if (signed_p)
16782 {
16783 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16784 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16785 }
16786 else
16787 {
16788 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16789 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16790 }
16791
16792 /* Extract remainder from AH. */
16793 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16794 if (REG_P (operands[1]))
16795 insn = emit_move_insn (operands[1], tmp1);
16796 else
16797 {
16798 /* Need a new scratch register since the old one has result
16799 of 8bit divide. */
16800 scratch = gen_reg_rtx (mode);
16801 emit_move_insn (scratch, tmp1);
16802 insn = emit_move_insn (operands[1], scratch);
16803 }
16804 set_unique_reg_note (insn, REG_EQUAL, mod);
16805
16806 /* Zero extend quotient from AL. */
16807 tmp1 = gen_lowpart (QImode, tmp0);
16808 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16809 set_unique_reg_note (insn, REG_EQUAL, div);
16810
16811 emit_label (end_label);
16812 }
16813
16814 #define LEA_MAX_STALL (3)
16815 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16816
16817 /* Increase the given DISTANCE in half-cycles according to
16818 dependencies between PREV and NEXT instructions.
16819 Add 1 half-cycle if there is no dependency and
16820 go to the next cycle if there is some dependency. */
16821
16822 static unsigned int
16823 increase_distance (rtx prev, rtx next, unsigned int distance)
16824 {
16825 df_ref *use_rec;
16826 df_ref *def_rec;
16827
16828 if (!prev || !next)
16829 return distance + (distance & 1) + 2;
16830
16831 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16832 return distance + 1;
16833
16834 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16835 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16836 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16837 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16838 return distance + (distance & 1) + 2;
16839
16840 return distance + 1;
16841 }
16842
16843 /* Check whether instruction INSN defines register number
16844 REGNO1 or REGNO2. */
16845
16846 static bool
16847 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16848 rtx insn)
16849 {
16850 df_ref *def_rec;
16851
16852 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16853 if (DF_REF_REG_DEF_P (*def_rec)
16854 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16855 && (regno1 == DF_REF_REGNO (*def_rec)
16856 || regno2 == DF_REF_REGNO (*def_rec)))
16857 {
16858 return true;
16859 }
16860
16861 return false;
16862 }
16863
16864 /* Check whether instruction INSN uses register number
16865 REGNO as part of an address expression. */
16866
16867 static bool
16868 insn_uses_reg_mem (unsigned int regno, rtx insn)
16869 {
16870 df_ref *use_rec;
16871
16872 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16873 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16874 return true;
16875
16876 return false;
16877 }
16878
16879 /* Search backward for a non-AGU definition of register number REGNO1
16880 or register number REGNO2 in the basic block, starting from instruction
16881 START up to the head of the basic block or instruction INSN.
16882
16883 Set *FOUND to true if a definition was found and to false
16884 otherwise.
16885
16886 The distance in half-cycles between START and the found instruction or
16887 the head of the BB is added to DISTANCE and returned. */
16888
16889 static int
16890 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16891 rtx insn, int distance,
16892 rtx start, bool *found)
16893 {
16894 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16895 rtx prev = start;
16896 rtx next = NULL;
16897
16898 *found = false;
16899
16900 while (prev
16901 && prev != insn
16902 && distance < LEA_SEARCH_THRESHOLD)
16903 {
16904 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16905 {
16906 distance = increase_distance (prev, next, distance);
16907 if (insn_defines_reg (regno1, regno2, prev))
16908 {
16909 if (recog_memoized (prev) < 0
16910 || get_attr_type (prev) != TYPE_LEA)
16911 {
16912 *found = true;
16913 return distance;
16914 }
16915 }
16916
16917 next = prev;
16918 }
16919 if (prev == BB_HEAD (bb))
16920 break;
16921
16922 prev = PREV_INSN (prev);
16923 }
16924
16925 return distance;
16926 }
16927
16928 /* Search backward for a non-AGU definition of register number REGNO1
16929 or register number REGNO2 in INSN's basic block until
16930 1. LEA_SEARCH_THRESHOLD instructions have been passed, or
16931 2. a neighbouring BB's boundary is reached, or
16932 3. an AGU definition is reached.
16933 Returns the distance between the non-AGU definition point and INSN.
16934 If there is no definition point, returns -1. */
16935
16936 static int
16937 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16938 rtx insn)
16939 {
16940 basic_block bb = BLOCK_FOR_INSN (insn);
16941 int distance = 0;
16942 bool found = false;
16943
16944 if (insn != BB_HEAD (bb))
16945 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16946 distance, PREV_INSN (insn),
16947 &found);
16948
16949 if (!found && distance < LEA_SEARCH_THRESHOLD)
16950 {
16951 edge e;
16952 edge_iterator ei;
16953 bool simple_loop = false;
16954
16955 FOR_EACH_EDGE (e, ei, bb->preds)
16956 if (e->src == bb)
16957 {
16958 simple_loop = true;
16959 break;
16960 }
16961
16962 if (simple_loop)
16963 distance = distance_non_agu_define_in_bb (regno1, regno2,
16964 insn, distance,
16965 BB_END (bb), &found);
16966 else
16967 {
16968 int shortest_dist = -1;
16969 bool found_in_bb = false;
16970
16971 FOR_EACH_EDGE (e, ei, bb->preds)
16972 {
16973 int bb_dist
16974 = distance_non_agu_define_in_bb (regno1, regno2,
16975 insn, distance,
16976 BB_END (e->src),
16977 &found_in_bb);
16978 if (found_in_bb)
16979 {
16980 if (shortest_dist < 0)
16981 shortest_dist = bb_dist;
16982 else if (bb_dist > 0)
16983 shortest_dist = MIN (bb_dist, shortest_dist);
16984
16985 found = true;
16986 }
16987 }
16988
16989 distance = shortest_dist;
16990 }
16991 }
16992
16993 /* get_attr_type may modify recog data. We want to make sure
16994 that recog data is valid for instruction INSN, on which
16995 distance_non_agu_define is called. INSN is unchanged here. */
16996 extract_insn_cached (insn);
16997
16998 if (!found)
16999 return -1;
17000
17001 return distance >> 1;
17002 }
17003
17004 /* Return the distance in half-cycles between INSN and the next
17005 insn that uses register number REGNO in a memory address, added
17006 to DISTANCE. Return -1 if REGNO is set.
17007
17008 Set *FOUND to true if a register use was found and to
17009 false otherwise.
17010 Set *REDEFINED to true if a register redefinition was
17011 found and to false otherwise. */
17012
17013 static int
17014 distance_agu_use_in_bb (unsigned int regno,
17015 rtx insn, int distance, rtx start,
17016 bool *found, bool *redefined)
17017 {
17018 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17019 rtx next = start;
17020 rtx prev = NULL;
17021
17022 *found = false;
17023 *redefined = false;
17024
17025 while (next
17026 && next != insn
17027 && distance < LEA_SEARCH_THRESHOLD)
17028 {
17029 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17030 {
17031 distance = increase_distance(prev, next, distance);
17032 if (insn_uses_reg_mem (regno, next))
17033 {
17034 /* Return DISTANCE if OP0 is used in memory
17035 address in NEXT. */
17036 *found = true;
17037 return distance;
17038 }
17039
17040 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17041 {
17042 /* Return -1 if OP0 is set in NEXT. */
17043 *redefined = true;
17044 return -1;
17045 }
17046
17047 prev = next;
17048 }
17049
17050 if (next == BB_END (bb))
17051 break;
17052
17053 next = NEXT_INSN (next);
17054 }
17055
17056 return distance;
17057 }
17058
17059 /* Return the distance between INSN and the next insn that uses
17060 register number REGNO0 in a memory address. Return -1 if no such
17061 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17062
17063 static int
17064 distance_agu_use (unsigned int regno0, rtx insn)
17065 {
17066 basic_block bb = BLOCK_FOR_INSN (insn);
17067 int distance = 0;
17068 bool found = false;
17069 bool redefined = false;
17070
17071 if (insn != BB_END (bb))
17072 distance = distance_agu_use_in_bb (regno0, insn, distance,
17073 NEXT_INSN (insn),
17074 &found, &redefined);
17075
17076 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17077 {
17078 edge e;
17079 edge_iterator ei;
17080 bool simple_loop = false;
17081
17082 FOR_EACH_EDGE (e, ei, bb->succs)
17083 if (e->dest == bb)
17084 {
17085 simple_loop = true;
17086 break;
17087 }
17088
17089 if (simple_loop)
17090 distance = distance_agu_use_in_bb (regno0, insn,
17091 distance, BB_HEAD (bb),
17092 &found, &redefined);
17093 else
17094 {
17095 int shortest_dist = -1;
17096 bool found_in_bb = false;
17097 bool redefined_in_bb = false;
17098
17099 FOR_EACH_EDGE (e, ei, bb->succs)
17100 {
17101 int bb_dist
17102 = distance_agu_use_in_bb (regno0, insn,
17103 distance, BB_HEAD (e->dest),
17104 &found_in_bb, &redefined_in_bb);
17105 if (found_in_bb)
17106 {
17107 if (shortest_dist < 0)
17108 shortest_dist = bb_dist;
17109 else if (bb_dist > 0)
17110 shortest_dist = MIN (bb_dist, shortest_dist);
17111
17112 found = true;
17113 }
17114 }
17115
17116 distance = shortest_dist;
17117 }
17118 }
17119
17120 if (!found || redefined)
17121 return -1;
17122
17123 return distance >> 1;
17124 }
17125
17126 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17127 there is a choice between LEA and ADD:
17128 Negative value: ADD is preferred over LEA
17129 Zero: Neutral
17130 Positive value: LEA is preferred over ADD */
17131 #define IX86_LEA_PRIORITY 0
17132
17133 /* Return true if using the lea INSN has a performance advantage
17134 over a sequence of instructions. The instruction sequence has
17135 SPLIT_COST cycles higher latency than the lea latency. */
17136
17137 static bool
17138 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17139 unsigned int regno2, int split_cost)
17140 {
17141 int dist_define, dist_use;
17142
17143 dist_define = distance_non_agu_define (regno1, regno2, insn);
17144 dist_use = distance_agu_use (regno0, insn);
17145
17146 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17147 {
17148 /* If there is no non-AGU operand definition, no AGU
17149 operand usage and the split cost is 0, then both the lea
17150 and non-lea variants have the same priority. Currently
17151 we prefer lea for 64-bit code and non-lea for 32-bit
17152 code. */
17153 if (dist_use < 0 && split_cost == 0)
17154 return TARGET_64BIT || IX86_LEA_PRIORITY;
17155 else
17156 return true;
17157 }
17158
17159 /* With a longer definition distance, lea is preferable.
17160 Here we adjust it to take into account the splitting cost and
17161 lea priority. */
17162 dist_define += split_cost + IX86_LEA_PRIORITY;
17163
17164 /* If there is no use in a memory address then we just check
17165 that the split cost exceeds the AGU stall. */
17166 if (dist_use < 0)
17167 return dist_define > LEA_MAX_STALL;
17168
17169 /* If this insn has both backward non-agu dependence and forward
17170 agu dependence, the one with short distance takes effect. */
17171 return dist_define >= dist_use;
17172 }
17173
17174 /* Return true if it is legal to clobber flags by INSN and
17175 false otherwise. */
17176
17177 static bool
17178 ix86_ok_to_clobber_flags (rtx insn)
17179 {
17180 basic_block bb = BLOCK_FOR_INSN (insn);
17181 df_ref *use;
17182 bitmap live;
17183
17184 while (insn)
17185 {
17186 if (NONDEBUG_INSN_P (insn))
17187 {
17188 for (use = DF_INSN_USES (insn); *use; use++)
17189 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17190 return false;
17191
17192 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17193 return true;
17194 }
17195
17196 if (insn == BB_END (bb))
17197 break;
17198
17199 insn = NEXT_INSN (insn);
17200 }
17201
17202 live = df_get_live_out(bb);
17203 return !REGNO_REG_SET_P (live, FLAGS_REG);
17204 }
17205
17206 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17207 move and add to avoid AGU stalls. */
17208
17209 bool
17210 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17211 {
17212 unsigned int regno0, regno1, regno2;
17213
17214 /* Check if we need to optimize. */
17215 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17216 return false;
17217
17218 /* Check it is correct to split here. */
17219 if (!ix86_ok_to_clobber_flags(insn))
17220 return false;
17221
17222 regno0 = true_regnum (operands[0]);
17223 regno1 = true_regnum (operands[1]);
17224 regno2 = true_regnum (operands[2]);
17225
17226 /* We need to split only adds with a non-destructive
17227 destination operand. */
17228 if (regno0 == regno1 || regno0 == regno2)
17229 return false;
17230 else
17231 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17232 }
17233
17234 /* Return true if we should emit lea instruction instead of mov
17235 instruction. */
17236
17237 bool
17238 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17239 {
17240 unsigned int regno0, regno1;
17241
17242 /* Check if we need to optimize. */
17243 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17244 return false;
17245
17246 /* Use lea for reg to reg moves only. */
17247 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17248 return false;
17249
17250 regno0 = true_regnum (operands[0]);
17251 regno1 = true_regnum (operands[1]);
17252
17253 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17254 }
17255
17256 /* Return true if we need to split lea into a sequence of
17257 instructions to avoid AGU stalls. */
17258
17259 bool
17260 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17261 {
17262 unsigned int regno0, regno1, regno2;
17263 int split_cost;
17264 struct ix86_address parts;
17265 int ok;
17266
17267 /* Check we need to optimize. */
17268 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17269 return false;
17270
17271 /* Check it is correct to split here. */
17272 if (!ix86_ok_to_clobber_flags(insn))
17273 return false;
17274
17275 ok = ix86_decompose_address (operands[1], &parts);
17276 gcc_assert (ok);
17277
17278 /* There should be at least two components in the address. */
17279 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17280 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17281 return false;
17282
17283 /* We should not split into add if a non-legitimate PIC
17284 operand is used as the displacement. */
17285 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17286 return false;
17287
17288 regno0 = true_regnum (operands[0]) ;
17289 regno1 = INVALID_REGNUM;
17290 regno2 = INVALID_REGNUM;
17291
17292 if (parts.base)
17293 regno1 = true_regnum (parts.base);
17294 if (parts.index)
17295 regno2 = true_regnum (parts.index);
17296
17297 split_cost = 0;
17298
17299 /* Compute how many cycles we will add to the execution time
17300 if we split the lea into a sequence of instructions. */
17301 if (parts.base || parts.index)
17302 {
17303 /* Have to use a mov instruction if the non-destructive
17304 destination form is used. */
17305 if (regno1 != regno0 && regno2 != regno0)
17306 split_cost += 1;
17307
17308 /* Have to add index to base if both exist. */
17309 if (parts.base && parts.index)
17310 split_cost += 1;
17311
17312 /* Have to use shift and adds if scale is 2 or greater. */
17313 if (parts.scale > 1)
17314 {
17315 if (regno0 != regno1)
17316 split_cost += 1;
17317 else if (regno2 == regno0)
17318 split_cost += 4;
17319 else
17320 split_cost += parts.scale;
17321 }
17322
17323 /* Have to use an add instruction with an immediate if
17324 disp is nonzero. */
17325 if (parts.disp && parts.disp != const0_rtx)
17326 split_cost += 1;
17327
17328 /* Subtract the price of lea. */
17329 split_cost -= 1;
17330 }
17331
17332 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17333 }
17334
17335 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17336 matches the destination. The RTX includes a clobber of FLAGS_REG. */
17337
17338 static void
17339 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17340 rtx dst, rtx src)
17341 {
17342 rtx op, clob;
17343
17344 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17345 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17346
17347 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17348 }
17349
17350 /* Return true if the definition of REGNO1 is closer to INSN than that of REGNO2. */
17351
17352 static bool
17353 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17354 {
17355 rtx prev = insn;
17356 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17357
17358 if (insn == start)
17359 return false;
17360 while (prev && prev != start)
17361 {
17362 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17363 {
17364 prev = PREV_INSN (prev);
17365 continue;
17366 }
17367 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17368 return true;
17369 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17370 return false;
17371 prev = PREV_INSN (prev);
17372 }
17373
17374 /* None of the regs is defined in the bb. */
17375 return false;
17376 }
17377
17378 /* Split a lea instruction into a sequence of instructions
17379 which are executed on the ALU to avoid AGU stalls.
17380 It is assumed that it is allowed to clobber the flags register
17381 at the lea position. */
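/* An illustrative, assumed example following the logic below (not taken
   from the original sources): splitting

     lea 0x8(%rbx,%rcx,4), %rax

   where the destination differs from both base and index would produce
   roughly

     mov %rcx, %rax   # copy the index
     shl $2, %rax     # apply the scale
     add %rbx, %rax   # add the base
     add $0x8, %rax   # add the displacement

   all of which execute on the ALU rather than the AGU. */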
17382
17383 void
17384 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17385 {
17386 unsigned int regno0, regno1, regno2;
17387 struct ix86_address parts;
17388 rtx target, tmp;
17389 int ok, adds;
17390
17391 ok = ix86_decompose_address (operands[1], &parts);
17392 gcc_assert (ok);
17393
17394 target = gen_lowpart (mode, operands[0]);
17395
17396 regno0 = true_regnum (target);
17397 regno1 = INVALID_REGNUM;
17398 regno2 = INVALID_REGNUM;
17399
17400 if (parts.base)
17401 {
17402 parts.base = gen_lowpart (mode, parts.base);
17403 regno1 = true_regnum (parts.base);
17404 }
17405
17406 if (parts.index)
17407 {
17408 parts.index = gen_lowpart (mode, parts.index);
17409 regno2 = true_regnum (parts.index);
17410 }
17411
17412 if (parts.disp)
17413 parts.disp = gen_lowpart (mode, parts.disp);
17414
17415 if (parts.scale > 1)
17416 {
17417 /* Case r1 = r1 + ... */
17418 if (regno1 == regno0)
17419 {
17420 /* If we have the case r1 = r1 + C * r1 then we
17421 would have to use multiplication, which is very
17422 expensive. Assume the cost model is wrong if we
17423 have such a case here. */
17424 gcc_assert (regno2 != regno0);
17425
17426 for (adds = parts.scale; adds > 0; adds--)
17427 ix86_emit_binop (PLUS, mode, target, parts.index);
17428 }
17429 else
17430 {
17431 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17432 if (regno0 != regno2)
17433 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17434
17435 /* Use shift for scaling. */
17436 ix86_emit_binop (ASHIFT, mode, target,
17437 GEN_INT (exact_log2 (parts.scale)));
17438
17439 if (parts.base)
17440 ix86_emit_binop (PLUS, mode, target, parts.base);
17441
17442 if (parts.disp && parts.disp != const0_rtx)
17443 ix86_emit_binop (PLUS, mode, target, parts.disp);
17444 }
17445 }
17446 else if (!parts.base && !parts.index)
17447 {
17448 gcc_assert(parts.disp);
17449 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17450 }
17451 else
17452 {
17453 if (!parts.base)
17454 {
17455 if (regno0 != regno2)
17456 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17457 }
17458 else if (!parts.index)
17459 {
17460 if (regno0 != regno1)
17461 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17462 }
17463 else
17464 {
17465 if (regno0 == regno1)
17466 tmp = parts.index;
17467 else if (regno0 == regno2)
17468 tmp = parts.base;
17469 else
17470 {
17471 rtx tmp1;
17472
17473 /* Find better operand for SET instruction, depending
17474 on which definition is farther from the insn. */
17475 if (find_nearest_reg_def (insn, regno1, regno2))
17476 tmp = parts.index, tmp1 = parts.base;
17477 else
17478 tmp = parts.base, tmp1 = parts.index;
17479
17480 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17481
17482 if (parts.disp && parts.disp != const0_rtx)
17483 ix86_emit_binop (PLUS, mode, target, parts.disp);
17484
17485 ix86_emit_binop (PLUS, mode, target, tmp1);
17486 return;
17487 }
17488
17489 ix86_emit_binop (PLUS, mode, target, tmp);
17490 }
17491
17492 if (parts.disp && parts.disp != const0_rtx)
17493 ix86_emit_binop (PLUS, mode, target, parts.disp);
17494 }
17495 }
17496
17497 /* Return true if it is ok to optimize an ADD operation to a LEA
17498 operation to avoid flag register consumption. For most processors,
17499 ADD is faster than LEA. For processors like Atom, if the
17500 destination register of the LEA holds an actual address which will be
17501 used soon, LEA is better; otherwise ADD is better. */
17502
17503 bool
17504 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17505 {
17506 unsigned int regno0 = true_regnum (operands[0]);
17507 unsigned int regno1 = true_regnum (operands[1]);
17508 unsigned int regno2 = true_regnum (operands[2]);
17509
17510 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17511 if (regno0 != regno1 && regno0 != regno2)
17512 return true;
17513
17514 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17515 return false;
17516
17517 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17518 }
17519
17520 /* Return true if destination reg of SET_BODY is shift count of
17521 USE_BODY. */
17522
17523 static bool
17524 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17525 {
17526 rtx set_dest;
17527 rtx shift_rtx;
17528 int i;
17529
17530 /* Retrieve destination of SET_BODY. */
17531 switch (GET_CODE (set_body))
17532 {
17533 case SET:
17534 set_dest = SET_DEST (set_body);
17535 if (!set_dest || !REG_P (set_dest))
17536 return false;
17537 break;
17538 case PARALLEL:
17539 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17540 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17541 use_body))
17542 return true;
17543 default:
17544 return false;
17545 break;
17546 }
17547
17548 /* Retrieve shift count of USE_BODY. */
17549 switch (GET_CODE (use_body))
17550 {
17551 case SET:
17552 shift_rtx = XEXP (use_body, 1);
17553 break;
17554 case PARALLEL:
17555 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17556 if (ix86_dep_by_shift_count_body (set_body,
17557 XVECEXP (use_body, 0, i)))
17558 return true;
17559 default:
17560 return false;
17561 break;
17562 }
17563
17564 if (shift_rtx
17565 && (GET_CODE (shift_rtx) == ASHIFT
17566 || GET_CODE (shift_rtx) == LSHIFTRT
17567 || GET_CODE (shift_rtx) == ASHIFTRT
17568 || GET_CODE (shift_rtx) == ROTATE
17569 || GET_CODE (shift_rtx) == ROTATERT))
17570 {
17571 rtx shift_count = XEXP (shift_rtx, 1);
17572
17573 /* Return true if shift count is dest of SET_BODY. */
17574 if (REG_P (shift_count))
17575 {
17576 /* Add check since it can be invoked before register
17577 allocation in pre-reload schedule. */
17578 if (reload_completed
17579 && true_regnum (set_dest) == true_regnum (shift_count))
17580 return true;
17581 else if (REGNO(set_dest) == REGNO(shift_count))
17582 return true;
17583 }
17584 }
17585
17586 return false;
17587 }
17588
17589 /* Return true if destination reg of SET_INSN is shift count of
17590 USE_INSN. */
17591
17592 bool
17593 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17594 {
17595 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17596 PATTERN (use_insn));
17597 }
17598
17599 /* Return TRUE or FALSE depending on whether the unary operator meets the
17600 appropriate constraints. */
17601
17602 bool
17603 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17604 enum machine_mode mode ATTRIBUTE_UNUSED,
17605 rtx operands[2] ATTRIBUTE_UNUSED)
17606 {
17607 /* If one of operands is memory, source and destination must match. */
17608 if ((MEM_P (operands[0])
17609 || MEM_P (operands[1]))
17610 && ! rtx_equal_p (operands[0], operands[1]))
17611 return false;
17612 return true;
17613 }
17614
17615 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17616 are ok, keeping in mind the possible movddup alternative. */
17617
17618 bool
17619 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17620 {
17621 if (MEM_P (operands[0]))
17622 return rtx_equal_p (operands[0], operands[1 + high]);
17623 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17624 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17625 return true;
17626 }
17627
17628 /* Post-reload splitter for converting an SF or DFmode value in an
17629 SSE register into an unsigned SImode. */
17630
17631 void
17632 ix86_split_convert_uns_si_sse (rtx operands[])
17633 {
17634 enum machine_mode vecmode;
17635 rtx value, large, zero_or_two31, input, two31, x;
17636
17637 large = operands[1];
17638 zero_or_two31 = operands[2];
17639 input = operands[3];
17640 two31 = operands[4];
17641 vecmode = GET_MODE (large);
17642 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17643
17644 /* Load up the value into the low element. We must ensure that the other
17645 elements are valid floats -- zero is the easiest such value. */
17646 if (MEM_P (input))
17647 {
17648 if (vecmode == V4SFmode)
17649 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17650 else
17651 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17652 }
17653 else
17654 {
17655 input = gen_rtx_REG (vecmode, REGNO (input));
17656 emit_move_insn (value, CONST0_RTX (vecmode));
17657 if (vecmode == V4SFmode)
17658 emit_insn (gen_sse_movss (value, value, input));
17659 else
17660 emit_insn (gen_sse2_movsd (value, value, input));
17661 }
17662
17663 emit_move_insn (large, two31);
17664 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17665
17666 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17667 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17668
17669 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17670 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17671
17672 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17673 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17674
17675 large = gen_rtx_REG (V4SImode, REGNO (large));
17676 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17677
17678 x = gen_rtx_REG (V4SImode, REGNO (value));
17679 if (vecmode == V4SFmode)
17680 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17681 else
17682 emit_insn (gen_sse2_cvttpd2dq (x, value));
17683 value = x;
17684
17685 emit_insn (gen_xorv4si3 (value, value, large));
17686 }
17687
17688 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17689 Expects the 64-bit DImode to be supplied in a pair of integral
17690 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17691 -mfpmath=sse, !optimize_size only. */
17692
17693 void
17694 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17695 {
17696 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17697 rtx int_xmm, fp_xmm;
17698 rtx biases, exponents;
17699 rtx x;
17700
17701 int_xmm = gen_reg_rtx (V4SImode);
17702 if (TARGET_INTER_UNIT_MOVES)
17703 emit_insn (gen_movdi_to_sse (int_xmm, input));
17704 else if (TARGET_SSE_SPLIT_REGS)
17705 {
17706 emit_clobber (int_xmm);
17707 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17708 }
17709 else
17710 {
17711 x = gen_reg_rtx (V2DImode);
17712 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17713 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17714 }
17715
17716 x = gen_rtx_CONST_VECTOR (V4SImode,
17717 gen_rtvec (4, GEN_INT (0x43300000UL),
17718 GEN_INT (0x45300000UL),
17719 const0_rtx, const0_rtx));
17720 exponents = validize_mem (force_const_mem (V4SImode, x));
17721
17722 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17723 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17724
17725 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17726 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17727 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17728 (0x1.0p84 + double(fp_value_hi_xmm)).
17729 Note these exponents differ by 32. */
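/* A worked example (illustrative, not part of the original comment): for
   input 0x0000000100000002 (2^32 + 2) the low and high halves are 2 and 1.
   After the interleave the two doubles have bit patterns
   0x4330000000000002 (= 0x1.0p52 + 2) and 0x4530000000000001
   (= 0x1.0p84 + 2^32), so subtracting the biases leaves 2.0 and
   4294967296.0, whose sum is the expected 4294967298.0. */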
17730
17731 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17732
17733 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17734 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17735 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17736 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17737 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17738 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17739 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17740 biases = validize_mem (force_const_mem (V2DFmode, biases));
17741 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17742
17743 /* Add the upper and lower DFmode values together. */
17744 if (TARGET_SSE3)
17745 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17746 else
17747 {
17748 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17749 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17750 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17751 }
17752
17753 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17754 }
17755
17756 /* Not used, but eases macroization of patterns. */
17757 void
17758 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17759 rtx input ATTRIBUTE_UNUSED)
17760 {
17761 gcc_unreachable ();
17762 }
17763
17764 /* Convert an unsigned SImode value into a DFmode. Only currently used
17765 for SSE, but applicable anywhere. */
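/* A worked example (illustrative, not part of the original comment): for
   input 0xffffffff the biased value computed below is
   0xffffffff + 0x80000000 (mod 2^32) = 0x7fffffff, which converts to
   2147483647.0; adding 0x1.0p31 back yields 4294967295.0, exact in
   DFmode. */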
17766
17767 void
17768 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17769 {
17770 REAL_VALUE_TYPE TWO31r;
17771 rtx x, fp;
17772
17773 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17774 NULL, 1, OPTAB_DIRECT);
17775
17776 fp = gen_reg_rtx (DFmode);
17777 emit_insn (gen_floatsidf2 (fp, x));
17778
17779 real_ldexp (&TWO31r, &dconst1, 31);
17780 x = const_double_from_real_value (TWO31r, DFmode);
17781
17782 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17783 if (x != target)
17784 emit_move_insn (target, x);
17785 }
17786
17787 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17788 32-bit mode; otherwise we have a direct convert instruction. */
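/* A worked example (illustrative, not part of the original comment): for
   input -1 (0xffffffffffffffff) the high part converts to -1.0 and is
   scaled by 0x1.0p32 to -4294967296.0, while the unsigned low part
   converts to 4294967295.0; their sum is the expected -1.0. */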
17789
17790 void
17791 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17792 {
17793 REAL_VALUE_TYPE TWO32r;
17794 rtx fp_lo, fp_hi, x;
17795
17796 fp_lo = gen_reg_rtx (DFmode);
17797 fp_hi = gen_reg_rtx (DFmode);
17798
17799 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17800
17801 real_ldexp (&TWO32r, &dconst1, 32);
17802 x = const_double_from_real_value (TWO32r, DFmode);
17803 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17804
17805 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17806
17807 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17808 0, OPTAB_DIRECT);
17809 if (x != target)
17810 emit_move_insn (target, x);
17811 }
17812
17813 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17814 For x86_32, -mfpmath=sse, !optimize_size only. */
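/* A worked example (illustrative, not part of the original comment): for
   input 0x00010002 the split gives int_hi = 1 and int_lo = 2, so fp_hi
   becomes 65536.0 and the result 65538.0. Each 16-bit half converts
   exactly and only the final single-precision addition rounds, matching
   a direct unsigned-to-float conversion. */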
17815 void
17816 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17817 {
17818 REAL_VALUE_TYPE ONE16r;
17819 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17820
17821 real_ldexp (&ONE16r, &dconst1, 16);
17822 x = const_double_from_real_value (ONE16r, SFmode);
17823 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17824 NULL, 0, OPTAB_DIRECT);
17825 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17826 NULL, 0, OPTAB_DIRECT);
17827 fp_hi = gen_reg_rtx (SFmode);
17828 fp_lo = gen_reg_rtx (SFmode);
17829 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17830 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17831 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17832 0, OPTAB_DIRECT);
17833 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17834 0, OPTAB_DIRECT);
17835 if (!rtx_equal_p (target, fp_hi))
17836 emit_move_insn (target, fp_hi);
17837 }
17838
17839 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17840 a vector of unsigned ints VAL to vector of floats TARGET. */
17841
17842 void
17843 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17844 {
17845 rtx tmp[8];
17846 REAL_VALUE_TYPE TWO16r;
17847 enum machine_mode intmode = GET_MODE (val);
17848 enum machine_mode fltmode = GET_MODE (target);
17849 rtx (*cvt) (rtx, rtx);
17850
17851 if (intmode == V4SImode)
17852 cvt = gen_floatv4siv4sf2;
17853 else
17854 cvt = gen_floatv8siv8sf2;
17855 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17856 tmp[0] = force_reg (intmode, tmp[0]);
17857 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17858 OPTAB_DIRECT);
17859 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17860 NULL_RTX, 1, OPTAB_DIRECT);
17861 tmp[3] = gen_reg_rtx (fltmode);
17862 emit_insn (cvt (tmp[3], tmp[1]));
17863 tmp[4] = gen_reg_rtx (fltmode);
17864 emit_insn (cvt (tmp[4], tmp[2]));
17865 real_ldexp (&TWO16r, &dconst1, 16);
17866 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17867 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17868 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17869 OPTAB_DIRECT);
17870 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17871 OPTAB_DIRECT);
17872 if (tmp[7] != target)
17873 emit_move_insn (target, tmp[7]);
17874 }
17875
17876 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17877 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17878 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17879 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
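/* A worked example (illustrative, not part of the original comment): for
   an element with value 3500000000.0 (>= 0x1p31) the adjusted value
   returned below is 3500000000.0 - 0x1p31 = 1352516352.0; the signed
   fix_trunc of that is 1352516352, and xoring in the 0x80000000 bit from
   *XORP restores the correct unsigned result 3500000000. */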
17880
17881 rtx
17882 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17883 {
17884 REAL_VALUE_TYPE TWO31r;
17885 rtx two31r, tmp[4];
17886 enum machine_mode mode = GET_MODE (val);
17887 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17888 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17889 rtx (*cmp) (rtx, rtx, rtx, rtx);
17890 int i;
17891
17892 for (i = 0; i < 3; i++)
17893 tmp[i] = gen_reg_rtx (mode);
17894 real_ldexp (&TWO31r, &dconst1, 31);
17895 two31r = const_double_from_real_value (TWO31r, scalarmode);
17896 two31r = ix86_build_const_vector (mode, 1, two31r);
17897 two31r = force_reg (mode, two31r);
17898 switch (mode)
17899 {
17900 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17901 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17902 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17903 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17904 default: gcc_unreachable ();
17905 }
17906 tmp[3] = gen_rtx_LE (mode, two31r, val);
17907 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17908 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17909 0, OPTAB_DIRECT);
17910 if (intmode == V4SImode || TARGET_AVX2)
17911 *xorp = expand_simple_binop (intmode, ASHIFT,
17912 gen_lowpart (intmode, tmp[0]),
17913 GEN_INT (31), NULL_RTX, 0,
17914 OPTAB_DIRECT);
17915 else
17916 {
17917 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17918 two31 = ix86_build_const_vector (intmode, 1, two31);
17919 *xorp = expand_simple_binop (intmode, AND,
17920 gen_lowpart (intmode, tmp[0]),
17921 two31, NULL_RTX, 0,
17922 OPTAB_DIRECT);
17923 }
17924 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17925 0, OPTAB_DIRECT);
17926 }
17927
17928 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17929 then replicate the value for all elements of the vector
17930 register. */
17931
17932 rtx
17933 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17934 {
17935 int i, n_elt;
17936 rtvec v;
17937 enum machine_mode scalar_mode;
17938
17939 switch (mode)
17940 {
17941 case V32QImode:
17942 case V16QImode:
17943 case V16HImode:
17944 case V8HImode:
17945 case V8SImode:
17946 case V4SImode:
17947 case V4DImode:
17948 case V2DImode:
17949 gcc_assert (vect);
17950 case V8SFmode:
17951 case V4SFmode:
17952 case V4DFmode:
17953 case V2DFmode:
17954 n_elt = GET_MODE_NUNITS (mode);
17955 v = rtvec_alloc (n_elt);
17956 scalar_mode = GET_MODE_INNER (mode);
17957
17958 RTVEC_ELT (v, 0) = value;
17959
17960 for (i = 1; i < n_elt; ++i)
17961 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17962
17963 return gen_rtx_CONST_VECTOR (mode, v);
17964
17965 default:
17966 gcc_unreachable ();
17967 }
17968 }
17969
17970 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17971 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17972 for an SSE register. If VECT is true, then replicate the mask for
17973 all elements of the vector register. If INVERT is true, then create
17974 a mask excluding the sign bit. */
17975
17976 rtx
17977 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17978 {
17979 enum machine_mode vec_mode, imode;
17980 HOST_WIDE_INT hi, lo;
17981 int shift = 63;
17982 rtx v;
17983 rtx mask;
17984
17985 /* Find the sign bit, sign extended to 2*HWI. */
17986 switch (mode)
17987 {
17988 case V8SImode:
17989 case V4SImode:
17990 case V8SFmode:
17991 case V4SFmode:
17992 vec_mode = mode;
17993 mode = GET_MODE_INNER (mode);
17994 imode = SImode;
17995 lo = 0x80000000, hi = lo < 0;
17996 break;
17997
17998 case V4DImode:
17999 case V2DImode:
18000 case V4DFmode:
18001 case V2DFmode:
18002 vec_mode = mode;
18003 mode = GET_MODE_INNER (mode);
18004 imode = DImode;
18005 if (HOST_BITS_PER_WIDE_INT >= 64)
18006 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18007 else
18008 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18009 break;
18010
18011 case TImode:
18012 case TFmode:
18013 vec_mode = VOIDmode;
18014 if (HOST_BITS_PER_WIDE_INT >= 64)
18015 {
18016 imode = TImode;
18017 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18018 }
18019 else
18020 {
18021 rtvec vec;
18022
18023 imode = DImode;
18024 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18025
18026 if (invert)
18027 {
18028 lo = ~lo, hi = ~hi;
18029 v = constm1_rtx;
18030 }
18031 else
18032 v = const0_rtx;
18033
18034 mask = immed_double_const (lo, hi, imode);
18035
18036 vec = gen_rtvec (2, v, mask);
18037 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18038 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18039
18040 return v;
18041 }
18042 break;
18043
18044 default:
18045 gcc_unreachable ();
18046 }
18047
18048 if (invert)
18049 lo = ~lo, hi = ~hi;
18050
18051 /* Force this value into the low part of a fp vector constant. */
18052 mask = immed_double_const (lo, hi, imode);
18053 mask = gen_lowpart (mode, mask);
18054
18055 if (vec_mode == VOIDmode)
18056 return force_reg (mode, mask);
18057
18058 v = ix86_build_const_vector (vec_mode, vect, mask);
18059 return force_reg (vec_mode, v);
18060 }
18061
18062 /* Generate code for floating point ABS or NEG. */
18063
18064 void
18065 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18066 rtx operands[])
18067 {
18068 rtx mask, set, dst, src;
18069 bool use_sse = false;
18070 bool vector_mode = VECTOR_MODE_P (mode);
18071 enum machine_mode vmode = mode;
18072
18073 if (vector_mode)
18074 use_sse = true;
18075 else if (mode == TFmode)
18076 use_sse = true;
18077 else if (TARGET_SSE_MATH)
18078 {
18079 use_sse = SSE_FLOAT_MODE_P (mode);
18080 if (mode == SFmode)
18081 vmode = V4SFmode;
18082 else if (mode == DFmode)
18083 vmode = V2DFmode;
18084 }
18085
18086 /* NEG and ABS performed with SSE use bitwise mask operations.
18087 Create the appropriate mask now. */
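/* With these masks the corresponding patterns in the machine description
   compute, roughly, for DFmode:
     ABS: dst = src & 0x7fffffffffffffff   (clear the sign bit)
     NEG: dst = src ^ 0x8000000000000000   (flip the sign bit)  */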
18088 if (use_sse)
18089 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18090 else
18091 mask = NULL_RTX;
18092
18093 dst = operands[0];
18094 src = operands[1];
18095
18096 set = gen_rtx_fmt_e (code, mode, src);
18097 set = gen_rtx_SET (VOIDmode, dst, set);
18098
18099 if (mask)
18100 {
18101 rtx use, clob;
18102 rtvec par;
18103
18104 use = gen_rtx_USE (VOIDmode, mask);
18105 if (vector_mode)
18106 par = gen_rtvec (2, set, use);
18107 else
18108 {
18109 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18110 par = gen_rtvec (3, set, use, clob);
18111 }
18112 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18113 }
18114 else
18115 emit_insn (set);
18116 }
18117
18118 /* Expand a copysign operation. Special case operand 0 being a constant. */
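/* In either case the expansion boils down to the bit-level identity
     copysign (x, y) = (x & ~sign_mask) | (y & sign_mask)
   e.g. copysign (3.0, -1.0) == -3.0.  */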
18119
18120 void
18121 ix86_expand_copysign (rtx operands[])
18122 {
18123 enum machine_mode mode, vmode;
18124 rtx dest, op0, op1, mask, nmask;
18125
18126 dest = operands[0];
18127 op0 = operands[1];
18128 op1 = operands[2];
18129
18130 mode = GET_MODE (dest);
18131
18132 if (mode == SFmode)
18133 vmode = V4SFmode;
18134 else if (mode == DFmode)
18135 vmode = V2DFmode;
18136 else
18137 vmode = mode;
18138
18139 if (GET_CODE (op0) == CONST_DOUBLE)
18140 {
18141 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18142
18143 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18144 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18145
18146 if (mode == SFmode || mode == DFmode)
18147 {
18148 if (op0 == CONST0_RTX (mode))
18149 op0 = CONST0_RTX (vmode);
18150 else
18151 {
18152 rtx v = ix86_build_const_vector (vmode, false, op0);
18153
18154 op0 = force_reg (vmode, v);
18155 }
18156 }
18157 else if (op0 != CONST0_RTX (mode))
18158 op0 = force_reg (mode, op0);
18159
18160 mask = ix86_build_signbit_mask (vmode, 0, 0);
18161
18162 if (mode == SFmode)
18163 copysign_insn = gen_copysignsf3_const;
18164 else if (mode == DFmode)
18165 copysign_insn = gen_copysigndf3_const;
18166 else
18167 copysign_insn = gen_copysigntf3_const;
18168
18169 emit_insn (copysign_insn (dest, op0, op1, mask));
18170 }
18171 else
18172 {
18173 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18174
18175 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18176 mask = ix86_build_signbit_mask (vmode, 0, 0);
18177
18178 if (mode == SFmode)
18179 copysign_insn = gen_copysignsf3_var;
18180 else if (mode == DFmode)
18181 copysign_insn = gen_copysigndf3_var;
18182 else
18183 copysign_insn = gen_copysigntf3_var;
18184
18185 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18186 }
18187 }
18188
18189 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18190 be a constant, and so has already been expanded into a vector constant. */
18191
18192 void
18193 ix86_split_copysign_const (rtx operands[])
18194 {
18195 enum machine_mode mode, vmode;
18196 rtx dest, op0, mask, x;
18197
18198 dest = operands[0];
18199 op0 = operands[1];
18200 mask = operands[3];
18201
18202 mode = GET_MODE (dest);
18203 vmode = GET_MODE (mask);
18204
18205 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18206 x = gen_rtx_AND (vmode, dest, mask);
18207 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18208
18209 if (op0 != CONST0_RTX (vmode))
18210 {
18211 x = gen_rtx_IOR (vmode, dest, op0);
18212 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18213 }
18214 }
18215
18216 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18217 so we have to do two masks. */
18218
18219 void
18220 ix86_split_copysign_var (rtx operands[])
18221 {
18222 enum machine_mode mode, vmode;
18223 rtx dest, scratch, op0, op1, mask, nmask, x;
18224
18225 dest = operands[0];
18226 scratch = operands[1];
18227 op0 = operands[2];
18228 op1 = operands[3];
18229 nmask = operands[4];
18230 mask = operands[5];
18231
18232 mode = GET_MODE (dest);
18233 vmode = GET_MODE (mask);
18234
18235 if (rtx_equal_p (op0, op1))
18236 {
18237 /* Shouldn't happen often (it's useless, obviously), but when it does
18238 we'd generate incorrect code if we continue below. */
18239 emit_move_insn (dest, op0);
18240 return;
18241 }
18242
18243 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18244 {
18245 gcc_assert (REGNO (op1) == REGNO (scratch));
18246
18247 x = gen_rtx_AND (vmode, scratch, mask);
18248 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18249
18250 dest = mask;
18251 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18252 x = gen_rtx_NOT (vmode, dest);
18253 x = gen_rtx_AND (vmode, x, op0);
18254 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18255 }
18256 else
18257 {
18258 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18259 {
18260 x = gen_rtx_AND (vmode, scratch, mask);
18261 }
18262 else /* alternative 2,4 */
18263 {
18264 gcc_assert (REGNO (mask) == REGNO (scratch));
18265 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18266 x = gen_rtx_AND (vmode, scratch, op1);
18267 }
18268 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18269
18270 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18271 {
18272 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18273 x = gen_rtx_AND (vmode, dest, nmask);
18274 }
18275 else /* alternative 3,4 */
18276 {
18277 gcc_assert (REGNO (nmask) == REGNO (dest));
18278 dest = nmask;
18279 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18280 x = gen_rtx_AND (vmode, dest, op0);
18281 }
18282 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18283 }
18284
18285 x = gen_rtx_IOR (vmode, dest, scratch);
18286 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18287 }
18288
18289 /* Return TRUE or FALSE depending on whether the first SET in INSN
18290 has source and destination with matching CC modes, and that the
18291 CC mode is at least as constrained as REQ_MODE. */
18292
18293 bool
18294 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18295 {
18296 rtx set;
18297 enum machine_mode set_mode;
18298
18299 set = PATTERN (insn);
18300 if (GET_CODE (set) == PARALLEL)
18301 set = XVECEXP (set, 0, 0);
18302 gcc_assert (GET_CODE (set) == SET);
18303 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18304
18305 set_mode = GET_MODE (SET_DEST (set));
18306 switch (set_mode)
18307 {
18308 case CCNOmode:
18309 if (req_mode != CCNOmode
18310 && (req_mode != CCmode
18311 || XEXP (SET_SRC (set), 1) != const0_rtx))
18312 return false;
18313 break;
18314 case CCmode:
18315 if (req_mode == CCGCmode)
18316 return false;
18317 /* FALLTHRU */
18318 case CCGCmode:
18319 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18320 return false;
18321 /* FALLTHRU */
18322 case CCGOCmode:
18323 if (req_mode == CCZmode)
18324 return false;
18325 /* FALLTHRU */
18326 case CCZmode:
18327 break;
18328
18329 case CCAmode:
18330 case CCCmode:
18331 case CCOmode:
18332 case CCSmode:
18333 if (set_mode != req_mode)
18334 return false;
18335 break;
18336
18337 default:
18338 gcc_unreachable ();
18339 }
18340
18341 return GET_MODE (SET_SRC (set)) == set_mode;
18342 }
18343
18344 /* Generate insn patterns to do an integer compare of OPERANDS. */
18345
18346 static rtx
18347 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18348 {
18349 enum machine_mode cmpmode;
18350 rtx tmp, flags;
18351
18352 cmpmode = SELECT_CC_MODE (code, op0, op1);
18353 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18354
18355 /* This is very simple, but making the interface the same as in the
18356 FP case makes the rest of the code easier. */
18357 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18358 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18359
18360 /* Return the test that should be put into the flags user, i.e.
18361 the bcc, scc, or cmov instruction. */
18362 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18363 }
18364
18365 /* Figure out whether to use ordered or unordered fp comparisons.
18366 Return the appropriate mode to use. */
18367
18368 enum machine_mode
18369 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18370 {
18371 /* ??? In order to make all comparisons reversible, we do all comparisons
18372 non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18373 between all forms of trapping and nontrapping comparisons, we can make
18374 inequality comparisons trapping again, since that results in better code
18375 when using FCOM based compares.  */
18376 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18377 }
18378
18379 enum machine_mode
18380 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18381 {
18382 enum machine_mode mode = GET_MODE (op0);
18383
18384 if (SCALAR_FLOAT_MODE_P (mode))
18385 {
18386 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18387 return ix86_fp_compare_mode (code);
18388 }
18389
18390 switch (code)
18391 {
18392 /* Only zero flag is needed. */
18393 case EQ: /* ZF=0 */
18394 case NE: /* ZF!=0 */
18395 return CCZmode;
18396 /* Codes needing carry flag. */
18397 case GEU: /* CF=0 */
18398 case LTU: /* CF=1 */
18399 /* Detect overflow checks. They need just the carry flag. */
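/* E.g. the common overflow idiom

     sum = a + b;
     if (sum < a)   ... carry out ...

   appears here as (ltu (plus a b) a), so only the carry flag matters.  */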
18400 if (GET_CODE (op0) == PLUS
18401 && rtx_equal_p (op1, XEXP (op0, 0)))
18402 return CCCmode;
18403 else
18404 return CCmode;
18405 case GTU: /* CF=0 & ZF=0 */
18406 case LEU: /* CF=1 | ZF=1 */
18407 /* Detect overflow checks. They need just the carry flag. */
18408 if (GET_CODE (op0) == MINUS
18409 && rtx_equal_p (op1, XEXP (op0, 0)))
18410 return CCCmode;
18411 else
18412 return CCmode;
18413 /* Codes possibly doable only with sign flag when
18414 comparing against zero. */
18415 case GE: /* SF=OF or SF=0 */
18416 case LT: /* SF<>OF or SF=1 */
18417 if (op1 == const0_rtx)
18418 return CCGOCmode;
18419 else
18420 /* For other cases Carry flag is not required. */
18421 return CCGCmode;
18422 /* Codes doable only with the sign flag when comparing
18423 against zero, but for which we miss the jump instruction,
18424 so we need to use relational tests against overflow,
18425 which thus needs to be zero. */
18426 case GT: /* ZF=0 & SF=OF */
18427 case LE: /* ZF=1 | SF<>OF */
18428 if (op1 == const0_rtx)
18429 return CCNOmode;
18430 else
18431 return CCGCmode;
18432 /* The strcmp pattern does (use flags), and combine may ask us for the proper
18433 mode. */
18434 case USE:
18435 return CCmode;
18436 default:
18437 gcc_unreachable ();
18438 }
18439 }
18440
18441 /* Return the fixed registers used for condition codes. */
18442
18443 static bool
18444 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18445 {
18446 *p1 = FLAGS_REG;
18447 *p2 = FPSR_REG;
18448 return true;
18449 }
18450
18451 /* If two condition code modes are compatible, return a condition code
18452 mode which is compatible with both. Otherwise, return
18453 VOIDmode. */
18454
18455 static enum machine_mode
18456 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18457 {
18458 if (m1 == m2)
18459 return m1;
18460
18461 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18462 return VOIDmode;
18463
18464 if ((m1 == CCGCmode && m2 == CCGOCmode)
18465 || (m1 == CCGOCmode && m2 == CCGCmode))
18466 return CCGCmode;
18467
18468 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18469 return m2;
18470 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18471 return m1;
18472
18473 switch (m1)
18474 {
18475 default:
18476 gcc_unreachable ();
18477
18478 case CCmode:
18479 case CCGCmode:
18480 case CCGOCmode:
18481 case CCNOmode:
18482 case CCAmode:
18483 case CCCmode:
18484 case CCOmode:
18485 case CCSmode:
18486 case CCZmode:
18487 switch (m2)
18488 {
18489 default:
18490 return VOIDmode;
18491
18492 case CCmode:
18493 case CCGCmode:
18494 case CCGOCmode:
18495 case CCNOmode:
18496 case CCAmode:
18497 case CCCmode:
18498 case CCOmode:
18499 case CCSmode:
18500 case CCZmode:
18501 return CCmode;
18502 }
18503
18504 case CCFPmode:
18505 case CCFPUmode:
18506 /* These are only compatible with themselves, which we already
18507 checked above. */
18508 return VOIDmode;
18509 }
18510 }
18511
18512
18513 /* Return a comparison that we can do and that is equivalent to
18514 swap_condition (code), apart possibly from orderedness.
18515 But never change orderedness if TARGET_IEEE_FP, returning
18516 UNKNOWN in that case if necessary. */
18517
18518 static enum rtx_code
18519 ix86_fp_swap_condition (enum rtx_code code)
18520 {
18521 switch (code)
18522 {
18523 case GT: /* GTU - CF=0 & ZF=0 */
18524 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18525 case GE: /* GEU - CF=0 */
18526 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18527 case UNLT: /* LTU - CF=1 */
18528 return TARGET_IEEE_FP ? UNKNOWN : GT;
18529 case UNLE: /* LEU - CF=1 | ZF=1 */
18530 return TARGET_IEEE_FP ? UNKNOWN : GE;
18531 default:
18532 return swap_condition (code);
18533 }
18534 }
18535
18536 /* Return the cost of comparison CODE using the best strategy for performance.
18537 All following functions use the number of instructions as a cost metric.
18538 In the future this should be tweaked to compute bytes for optimize_size and
18539 to take into account the performance of various instructions on various CPUs. */
18540
18541 static int
18542 ix86_fp_comparison_cost (enum rtx_code code)
18543 {
18544 int arith_cost;
18545
18546 /* The cost of code using bit-twiddling on %ah. */
18547 switch (code)
18548 {
18549 case UNLE:
18550 case UNLT:
18551 case LTGT:
18552 case GT:
18553 case GE:
18554 case UNORDERED:
18555 case ORDERED:
18556 case UNEQ:
18557 arith_cost = 4;
18558 break;
18559 case LT:
18560 case NE:
18561 case EQ:
18562 case UNGE:
18563 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18564 break;
18565 case LE:
18566 case UNGT:
18567 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18568 break;
18569 default:
18570 gcc_unreachable ();
18571 }
18572
18573 switch (ix86_fp_comparison_strategy (code))
18574 {
18575 case IX86_FPCMP_COMI:
18576 return arith_cost > 4 ? 3 : 2;
18577 case IX86_FPCMP_SAHF:
18578 return arith_cost > 4 ? 4 : 3;
18579 default:
18580 return arith_cost;
18581 }
18582 }
18583
18584 /* Return strategy to use for floating-point. We assume that fcomi is always
18585 preferable where available, since that is also true when looking at size
18586 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
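/* As a reminder, the three strategies correspond roughly to:
     IX86_FPCMP_COMI:   fcomi                 (flags set directly)
     IX86_FPCMP_SAHF:   fcom(p); fnstsw %ax; sahf
     IX86_FPCMP_ARITH:  fcom(p); fnstsw %ax; test/and on %ah  */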
18587
18588 enum ix86_fpcmp_strategy
18589 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18590 {
18591 /* Do fcomi/sahf based test when profitable. */
18592
18593 if (TARGET_CMOVE)
18594 return IX86_FPCMP_COMI;
18595
18596 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18597 return IX86_FPCMP_SAHF;
18598
18599 return IX86_FPCMP_ARITH;
18600 }
18601
18602 /* Swap, force into registers, or otherwise massage the two operands
18603 to a fp comparison. The operands are updated in place; the new
18604 comparison code is returned. */
18605
18606 static enum rtx_code
18607 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18608 {
18609 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18610 rtx op0 = *pop0, op1 = *pop1;
18611 enum machine_mode op_mode = GET_MODE (op0);
18612 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18613
18614 /* All of the unordered compare instructions only work on registers.
18615 The same is true of the fcomi compare instructions. The XFmode
18616 compare instructions require registers except when comparing
18617 against zero or when converting operand 1 from fixed point to
18618 floating point. */
18619
18620 if (!is_sse
18621 && (fpcmp_mode == CCFPUmode
18622 || (op_mode == XFmode
18623 && ! (standard_80387_constant_p (op0) == 1
18624 || standard_80387_constant_p (op1) == 1)
18625 && GET_CODE (op1) != FLOAT)
18626 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18627 {
18628 op0 = force_reg (op_mode, op0);
18629 op1 = force_reg (op_mode, op1);
18630 }
18631 else
18632 {
18633 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18634 things around if they appear profitable, otherwise force op0
18635 into a register. */
18636
18637 if (standard_80387_constant_p (op0) == 0
18638 || (MEM_P (op0)
18639 && ! (standard_80387_constant_p (op1) == 0
18640 || MEM_P (op1))))
18641 {
18642 enum rtx_code new_code = ix86_fp_swap_condition (code);
18643 if (new_code != UNKNOWN)
18644 {
18645 rtx tmp;
18646 tmp = op0, op0 = op1, op1 = tmp;
18647 code = new_code;
18648 }
18649 }
18650
18651 if (!REG_P (op0))
18652 op0 = force_reg (op_mode, op0);
18653
18654 if (CONSTANT_P (op1))
18655 {
18656 int tmp = standard_80387_constant_p (op1);
18657 if (tmp == 0)
18658 op1 = validize_mem (force_const_mem (op_mode, op1));
18659 else if (tmp == 1)
18660 {
18661 if (TARGET_CMOVE)
18662 op1 = force_reg (op_mode, op1);
18663 }
18664 else
18665 op1 = force_reg (op_mode, op1);
18666 }
18667 }
18668
18669 /* Try to rearrange the comparison to make it cheaper. */
18670 if (ix86_fp_comparison_cost (code)
18671 > ix86_fp_comparison_cost (swap_condition (code))
18672 && (REG_P (op1) || can_create_pseudo_p ()))
18673 {
18674 rtx tmp;
18675 tmp = op0, op0 = op1, op1 = tmp;
18676 code = swap_condition (code);
18677 if (!REG_P (op0))
18678 op0 = force_reg (op_mode, op0);
18679 }
18680
18681 *pop0 = op0;
18682 *pop1 = op1;
18683 return code;
18684 }
18685
18686 /* Convert comparison codes we use to represent FP comparison to integer
18687 code that will result in proper branch. Return UNKNOWN if no such code
18688 is available. */
18689
18690 enum rtx_code
18691 ix86_fp_compare_code_to_integer (enum rtx_code code)
18692 {
18693 switch (code)
18694 {
18695 case GT:
18696 return GTU;
18697 case GE:
18698 return GEU;
18699 case ORDERED:
18700 case UNORDERED:
18701 return code;
18703 case UNEQ:
18704 return EQ;
18706 case UNLT:
18707 return LTU;
18709 case UNLE:
18710 return LEU;
18712 case LTGT:
18713 return NE;
18715 default:
18716 return UNKNOWN;
18717 }
18718 }
18719
18720 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18721
18722 static rtx
18723 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18724 {
18725 enum machine_mode fpcmp_mode, intcmp_mode;
18726 rtx tmp, tmp2;
18727
18728 fpcmp_mode = ix86_fp_compare_mode (code);
18729 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18730
18731 /* Do fcomi/sahf based test when profitable. */
18732 switch (ix86_fp_comparison_strategy (code))
18733 {
18734 case IX86_FPCMP_COMI:
18735 intcmp_mode = fpcmp_mode;
18736 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18737 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18738 tmp);
18739 emit_insn (tmp);
18740 break;
18741
18742 case IX86_FPCMP_SAHF:
18743 intcmp_mode = fpcmp_mode;
18744 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18745 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18746 tmp);
18747
18748 if (!scratch)
18749 scratch = gen_reg_rtx (HImode);
18750 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18751 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18752 break;
18753
18754 case IX86_FPCMP_ARITH:
18755 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18756 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18757 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18758 if (!scratch)
18759 scratch = gen_reg_rtx (HImode);
18760 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18761
18762 /* In the unordered case, we have to check C2 for NaN's, which
18763 doesn't happen to work out to anything nice combination-wise.
18764 So do some bit twiddling on the value we've got in AH to come
18765 up with an appropriate set of condition codes. */
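/* After fnstsw, %ah holds the FPU condition bits at
     C0 = 0x01, C2 = 0x04, C3 = 0x40  (C1 = 0x02 is unused here);
   fcom sets C0 for "below", C3 for "equal" and C0|C2|C3 for
   "unordered", so masks such as 0x45 below select C0|C2|C3.  */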
18766
18767 intcmp_mode = CCNOmode;
18768 switch (code)
18769 {
18770 case GT:
18771 case UNGT:
18772 if (code == GT || !TARGET_IEEE_FP)
18773 {
18774 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18775 code = EQ;
18776 }
18777 else
18778 {
18779 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18780 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18781 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18782 intcmp_mode = CCmode;
18783 code = GEU;
18784 }
18785 break;
18786 case LT:
18787 case UNLT:
18788 if (code == LT && TARGET_IEEE_FP)
18789 {
18790 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18791 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18792 intcmp_mode = CCmode;
18793 code = EQ;
18794 }
18795 else
18796 {
18797 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18798 code = NE;
18799 }
18800 break;
18801 case GE:
18802 case UNGE:
18803 if (code == GE || !TARGET_IEEE_FP)
18804 {
18805 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18806 code = EQ;
18807 }
18808 else
18809 {
18810 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18811 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18812 code = NE;
18813 }
18814 break;
18815 case LE:
18816 case UNLE:
18817 if (code == LE && TARGET_IEEE_FP)
18818 {
18819 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18820 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18821 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18822 intcmp_mode = CCmode;
18823 code = LTU;
18824 }
18825 else
18826 {
18827 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18828 code = NE;
18829 }
18830 break;
18831 case EQ:
18832 case UNEQ:
18833 if (code == EQ && TARGET_IEEE_FP)
18834 {
18835 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18836 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18837 intcmp_mode = CCmode;
18838 code = EQ;
18839 }
18840 else
18841 {
18842 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18843 code = NE;
18844 }
18845 break;
18846 case NE:
18847 case LTGT:
18848 if (code == NE && TARGET_IEEE_FP)
18849 {
18850 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18851 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18852 GEN_INT (0x40)));
18853 code = NE;
18854 }
18855 else
18856 {
18857 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18858 code = EQ;
18859 }
18860 break;
18861
18862 case UNORDERED:
18863 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18864 code = NE;
18865 break;
18866 case ORDERED:
18867 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18868 code = EQ;
18869 break;
18870
18871 default:
18872 gcc_unreachable ();
18873 }
18874 break;
18875
18876 default:
18877 gcc_unreachable();
18878 }
18879
18880 /* Return the test that should be put into the flags user, i.e.
18881 the bcc, scc, or cmov instruction. */
18882 return gen_rtx_fmt_ee (code, VOIDmode,
18883 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18884 const0_rtx);
18885 }
18886
18887 static rtx
18888 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18889 {
18890 rtx ret;
18891
18892 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18893 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18894
18895 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18896 {
18897 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18898 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18899 }
18900 else
18901 ret = ix86_expand_int_compare (code, op0, op1);
18902
18903 return ret;
18904 }
18905
18906 void
18907 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18908 {
18909 enum machine_mode mode = GET_MODE (op0);
18910 rtx tmp;
18911
18912 switch (mode)
18913 {
18914 case SFmode:
18915 case DFmode:
18916 case XFmode:
18917 case QImode:
18918 case HImode:
18919 case SImode:
18920 simple:
18921 tmp = ix86_expand_compare (code, op0, op1);
18922 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18923 gen_rtx_LABEL_REF (VOIDmode, label),
18924 pc_rtx);
18925 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18926 return;
18927
18928 case DImode:
18929 if (TARGET_64BIT)
18930 goto simple;
18931 case TImode:
18932 /* Expand DImode branch into multiple compare+branch. */
18933 {
18934 rtx lo[2], hi[2], label2;
18935 enum rtx_code code1, code2, code3;
18936 enum machine_mode submode;
18937
18938 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18939 {
18940 tmp = op0, op0 = op1, op1 = tmp;
18941 code = swap_condition (code);
18942 }
18943
18944 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18945 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18946
18947 submode = mode == DImode ? SImode : DImode;
18948
18949 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18950 avoid two branches. This costs one extra insn, so disable when
18951 optimizing for size. */
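/* E.g. a DImode "a == b" on a 32-bit target becomes
     ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0
   which needs only one compare against zero and one branch.  */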
18952
18953 if ((code == EQ || code == NE)
18954 && (!optimize_insn_for_size_p ()
18955 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18956 {
18957 rtx xor0, xor1;
18958
18959 xor1 = hi[0];
18960 if (hi[1] != const0_rtx)
18961 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18962 NULL_RTX, 0, OPTAB_WIDEN);
18963
18964 xor0 = lo[0];
18965 if (lo[1] != const0_rtx)
18966 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18967 NULL_RTX, 0, OPTAB_WIDEN);
18968
18969 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18970 NULL_RTX, 0, OPTAB_WIDEN);
18971
18972 ix86_expand_branch (code, tmp, const0_rtx, label);
18973 return;
18974 }
18975
18976 /* Otherwise, if we are doing a less-than or greater-than-or-equal comparison,
18977 op1 is a constant and the low word is zero, then we can just
18978 examine the high word. Similarly for a low word of -1 with
18979 less-than-or-equal or greater-than. */
18980
18981 if (CONST_INT_P (hi[1]))
18982 switch (code)
18983 {
18984 case LT: case LTU: case GE: case GEU:
18985 if (lo[1] == const0_rtx)
18986 {
18987 ix86_expand_branch (code, hi[0], hi[1], label);
18988 return;
18989 }
18990 break;
18991 case LE: case LEU: case GT: case GTU:
18992 if (lo[1] == constm1_rtx)
18993 {
18994 ix86_expand_branch (code, hi[0], hi[1], label);
18995 return;
18996 }
18997 break;
18998 default:
18999 break;
19000 }
19001
19002 /* Otherwise, we need two or three jumps. */
19003
19004 label2 = gen_label_rtx ();
19005
19006 code1 = code;
19007 code2 = swap_condition (code);
19008 code3 = unsigned_condition (code);
19009
19010 switch (code)
19011 {
19012 case LT: case GT: case LTU: case GTU:
19013 break;
19014
19015 case LE: code1 = LT; code2 = GT; break;
19016 case GE: code1 = GT; code2 = LT; break;
19017 case LEU: code1 = LTU; code2 = GTU; break;
19018 case GEU: code1 = GTU; code2 = LTU; break;
19019
19020 case EQ: code1 = UNKNOWN; code2 = NE; break;
19021 case NE: code2 = UNKNOWN; break;
19022
19023 default:
19024 gcc_unreachable ();
19025 }
19026
19027 /*
19028 * a < b =>
19029 * if (hi(a) < hi(b)) goto true;
19030 * if (hi(a) > hi(b)) goto false;
19031 * if (lo(a) < lo(b)) goto true;
19032 * false:
19033 */
19034
19035 if (code1 != UNKNOWN)
19036 ix86_expand_branch (code1, hi[0], hi[1], label);
19037 if (code2 != UNKNOWN)
19038 ix86_expand_branch (code2, hi[0], hi[1], label2);
19039
19040 ix86_expand_branch (code3, lo[0], lo[1], label);
19041
19042 if (code2 != UNKNOWN)
19043 emit_label (label2);
19044 return;
19045 }
19046
19047 default:
19048 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19049 goto simple;
19050 }
19051 }
19052
19053 /* Split branch based on floating point condition. */
19054 void
19055 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19056 rtx target1, rtx target2, rtx tmp, rtx pushed)
19057 {
19058 rtx condition;
19059 rtx i;
19060
19061 if (target2 != pc_rtx)
19062 {
19063 rtx tmp = target2;
19064 code = reverse_condition_maybe_unordered (code);
19065 target2 = target1;
19066 target1 = tmp;
19067 }
19068
19069 condition = ix86_expand_fp_compare (code, op1, op2,
19070 tmp);
19071
19072 /* Remove pushed operand from stack. */
19073 if (pushed)
19074 ix86_free_from_memory (GET_MODE (pushed));
19075
19076 i = emit_jump_insn (gen_rtx_SET
19077 (VOIDmode, pc_rtx,
19078 gen_rtx_IF_THEN_ELSE (VOIDmode,
19079 condition, target1, target2)));
19080 if (split_branch_probability >= 0)
19081 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19082 }
19083
19084 void
19085 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19086 {
19087 rtx ret;
19088
19089 gcc_assert (GET_MODE (dest) == QImode);
19090
19091 ret = ix86_expand_compare (code, op0, op1);
19092 PUT_MODE (ret, QImode);
19093 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19094 }
19095
19096 /* Expand comparison setting or clearing carry flag. Return true when
19097 successful and set pop for the operation. */
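/* For example, "a == 0" is rewritten below as "(unsigned) a < 1":
   a single CMP then leaves the answer in the carry flag, which the
   caller can consume with SBB or ADC without any branch.  */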
19098 static bool
19099 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19100 {
19101 enum machine_mode mode =
19102 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19103
19104 /* Do not handle double-mode compares that go through special path. */
19105 if (mode == (TARGET_64BIT ? TImode : DImode))
19106 return false;
19107
19108 if (SCALAR_FLOAT_MODE_P (mode))
19109 {
19110 rtx compare_op, compare_seq;
19111
19112 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19113
19114 /* Shortcut: following common codes never translate
19115 into carry flag compares. */
19116 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19117 || code == ORDERED || code == UNORDERED)
19118 return false;
19119
19120 /* These comparisons require zero flag; swap operands so they won't. */
19121 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19122 && !TARGET_IEEE_FP)
19123 {
19124 rtx tmp = op0;
19125 op0 = op1;
19126 op1 = tmp;
19127 code = swap_condition (code);
19128 }
19129
19130 /* Try to expand the comparison and verify that we end up with
19131 carry flag based comparison. This fails to be true only when
19132 we decide to expand the comparison using arithmetic, which is
19133 not a common scenario. */
19134 start_sequence ();
19135 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19136 compare_seq = get_insns ();
19137 end_sequence ();
19138
19139 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19140 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19141 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19142 else
19143 code = GET_CODE (compare_op);
19144
19145 if (code != LTU && code != GEU)
19146 return false;
19147
19148 emit_insn (compare_seq);
19149 *pop = compare_op;
19150 return true;
19151 }
19152
19153 if (!INTEGRAL_MODE_P (mode))
19154 return false;
19155
19156 switch (code)
19157 {
19158 case LTU:
19159 case GEU:
19160 break;
19161
19162 /* Convert a==0 into (unsigned)a<1. */
19163 case EQ:
19164 case NE:
19165 if (op1 != const0_rtx)
19166 return false;
19167 op1 = const1_rtx;
19168 code = (code == EQ ? LTU : GEU);
19169 break;
19170
19171 /* Convert a>b into b<a or a>=b+1. */
19172 case GTU:
19173 case LEU:
19174 if (CONST_INT_P (op1))
19175 {
19176 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19177 /* Bail out on overflow. We still can swap operands but that
19178 would force loading of the constant into register. */
19179 if (op1 == const0_rtx
19180 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19181 return false;
19182 code = (code == GTU ? GEU : LTU);
19183 }
19184 else
19185 {
19186 rtx tmp = op1;
19187 op1 = op0;
19188 op0 = tmp;
19189 code = (code == GTU ? LTU : GEU);
19190 }
19191 break;
19192
19193 /* Convert a>=0 into (unsigned)a<0x80000000. */
19194 case LT:
19195 case GE:
19196 if (mode == DImode || op1 != const0_rtx)
19197 return false;
19198 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19199 code = (code == LT ? GEU : LTU);
19200 break;
19201 case LE:
19202 case GT:
19203 if (mode == DImode || op1 != constm1_rtx)
19204 return false;
19205 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19206 code = (code == LE ? GEU : LTU);
19207 break;
19208
19209 default:
19210 return false;
19211 }
19212 /* Swapping operands may cause constant to appear as first operand. */
19213 if (!nonimmediate_operand (op0, VOIDmode))
19214 {
19215 if (!can_create_pseudo_p ())
19216 return false;
19217 op0 = force_reg (mode, op0);
19218 }
19219 *pop = ix86_expand_compare (code, op0, op1);
19220 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19221 return true;
19222 }
19223
19224 bool
19225 ix86_expand_int_movcc (rtx operands[])
19226 {
19227 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19228 rtx compare_seq, compare_op;
19229 enum machine_mode mode = GET_MODE (operands[0]);
19230 bool sign_bit_compare_p = false;
19231 rtx op0 = XEXP (operands[1], 0);
19232 rtx op1 = XEXP (operands[1], 1);
19233
19234 if (GET_MODE (op0) == TImode
19235 || (GET_MODE (op0) == DImode
19236 && !TARGET_64BIT))
19237 return false;
19238
19239 start_sequence ();
19240 compare_op = ix86_expand_compare (code, op0, op1);
19241 compare_seq = get_insns ();
19242 end_sequence ();
19243
19244 compare_code = GET_CODE (compare_op);
19245
19246 if ((op1 == const0_rtx && (code == GE || code == LT))
19247 || (op1 == constm1_rtx && (code == GT || code == LE)))
19248 sign_bit_compare_p = true;
19249
19250 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19251 HImode insns, we'd be swallowed in word prefix ops. */
19252
19253 if ((mode != HImode || TARGET_FAST_PREFIX)
19254 && (mode != (TARGET_64BIT ? TImode : DImode))
19255 && CONST_INT_P (operands[2])
19256 && CONST_INT_P (operands[3]))
19257 {
19258 rtx out = operands[0];
19259 HOST_WIDE_INT ct = INTVAL (operands[2]);
19260 HOST_WIDE_INT cf = INTVAL (operands[3]);
19261 HOST_WIDE_INT diff;
19262
19263 diff = ct - cf;
19264 /* Sign bit compares are better done using shifts than we do by using
19265 sbb. */
19266 if (sign_bit_compare_p
19267 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19268 {
19269 /* Detect overlap between destination and compare sources. */
19270 rtx tmp = out;
19271
19272 if (!sign_bit_compare_p)
19273 {
19274 rtx flags;
19275 bool fpcmp = false;
19276
19277 compare_code = GET_CODE (compare_op);
19278
19279 flags = XEXP (compare_op, 0);
19280
19281 if (GET_MODE (flags) == CCFPmode
19282 || GET_MODE (flags) == CCFPUmode)
19283 {
19284 fpcmp = true;
19285 compare_code
19286 = ix86_fp_compare_code_to_integer (compare_code);
19287 }
19288
19289 /* To simplify rest of code, restrict to the GEU case. */
19290 if (compare_code == LTU)
19291 {
19292 HOST_WIDE_INT tmp = ct;
19293 ct = cf;
19294 cf = tmp;
19295 compare_code = reverse_condition (compare_code);
19296 code = reverse_condition (code);
19297 }
19298 else
19299 {
19300 if (fpcmp)
19301 PUT_CODE (compare_op,
19302 reverse_condition_maybe_unordered
19303 (GET_CODE (compare_op)));
19304 else
19305 PUT_CODE (compare_op,
19306 reverse_condition (GET_CODE (compare_op)));
19307 }
19308 diff = ct - cf;
19309
19310 if (reg_overlap_mentioned_p (out, op0)
19311 || reg_overlap_mentioned_p (out, op1))
19312 tmp = gen_reg_rtx (mode);
19313
19314 if (mode == DImode)
19315 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19316 else
19317 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19318 flags, compare_op));
19319 }
19320 else
19321 {
19322 if (code == GT || code == GE)
19323 code = reverse_condition (code);
19324 else
19325 {
19326 HOST_WIDE_INT tmp = ct;
19327 ct = cf;
19328 cf = tmp;
19329 diff = ct - cf;
19330 }
19331 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19332 }
19333
19334 if (diff == 1)
19335 {
19336 /*
19337 * cmpl op0,op1
19338 * sbbl dest,dest
19339 * [addl dest, ct]
19340 *
19341 * Size 5 - 8.
19342 */
19343 if (ct)
19344 tmp = expand_simple_binop (mode, PLUS,
19345 tmp, GEN_INT (ct),
19346 copy_rtx (tmp), 1, OPTAB_DIRECT);
19347 }
19348 else if (cf == -1)
19349 {
19350 /*
19351 * cmpl op0,op1
19352 * sbbl dest,dest
19353 * orl $ct, dest
19354 *
19355 * Size 8.
19356 */
19357 tmp = expand_simple_binop (mode, IOR,
19358 tmp, GEN_INT (ct),
19359 copy_rtx (tmp), 1, OPTAB_DIRECT);
19360 }
19361 else if (diff == -1 && ct)
19362 {
19363 /*
19364 * cmpl op0,op1
19365 * sbbl dest,dest
19366 * notl dest
19367 * [addl dest, cf]
19368 *
19369 * Size 8 - 11.
19370 */
19371 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19372 if (cf)
19373 tmp = expand_simple_binop (mode, PLUS,
19374 copy_rtx (tmp), GEN_INT (cf),
19375 copy_rtx (tmp), 1, OPTAB_DIRECT);
19376 }
19377 else
19378 {
19379 /*
19380 * cmpl op0,op1
19381 * sbbl dest,dest
19382 * [notl dest]
19383 * andl cf - ct, dest
19384 * [addl dest, ct]
19385 *
19386 * Size 8 - 11.
19387 */
19388
19389 if (cf == 0)
19390 {
19391 cf = ct;
19392 ct = 0;
19393 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19394 }
19395
19396 tmp = expand_simple_binop (mode, AND,
19397 copy_rtx (tmp),
19398 gen_int_mode (cf - ct, mode),
19399 copy_rtx (tmp), 1, OPTAB_DIRECT);
19400 if (ct)
19401 tmp = expand_simple_binop (mode, PLUS,
19402 copy_rtx (tmp), GEN_INT (ct),
19403 copy_rtx (tmp), 1, OPTAB_DIRECT);
19404 }
19405
19406 if (!rtx_equal_p (tmp, out))
19407 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19408
19409 return true;
19410 }
19411
19412 if (diff < 0)
19413 {
19414 enum machine_mode cmp_mode = GET_MODE (op0);
19415
19416 HOST_WIDE_INT tmp;
19417 tmp = ct, ct = cf, cf = tmp;
19418 diff = -diff;
19419
19420 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19421 {
19422 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19423
19424 /* We may be reversing unordered compare to normal compare, that
19425 is not valid in general (we may convert non-trapping condition
19426 to trapping one), however on i386 we currently emit all
19427 comparisons unordered. */
19428 compare_code = reverse_condition_maybe_unordered (compare_code);
19429 code = reverse_condition_maybe_unordered (code);
19430 }
19431 else
19432 {
19433 compare_code = reverse_condition (compare_code);
19434 code = reverse_condition (code);
19435 }
19436 }
19437
19438 compare_code = UNKNOWN;
19439 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19440 && CONST_INT_P (op1))
19441 {
19442 if (op1 == const0_rtx
19443 && (code == LT || code == GE))
19444 compare_code = code;
19445 else if (op1 == constm1_rtx)
19446 {
19447 if (code == LE)
19448 compare_code = LT;
19449 else if (code == GT)
19450 compare_code = GE;
19451 }
19452 }
19453
19454 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19455 if (compare_code != UNKNOWN
19456 && GET_MODE (op0) == GET_MODE (out)
19457 && (cf == -1 || ct == -1))
19458 {
19459 /* If lea code below could be used, only optimize
19460 if it results in a 2 insn sequence. */
19461
19462 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19463 || diff == 3 || diff == 5 || diff == 9)
19464 || (compare_code == LT && ct == -1)
19465 || (compare_code == GE && cf == -1))
19466 {
19467 /*
19468 * notl op1 (if necessary)
19469 * sarl $31, op1
19470 * orl cf, op1
19471 */
19472 if (ct != -1)
19473 {
19474 cf = ct;
19475 ct = -1;
19476 code = reverse_condition (code);
19477 }
19478
19479 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19480
19481 out = expand_simple_binop (mode, IOR,
19482 out, GEN_INT (cf),
19483 out, 1, OPTAB_DIRECT);
19484 if (out != operands[0])
19485 emit_move_insn (operands[0], out);
19486
19487 return true;
19488 }
19489 }
19490
19491
19492 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19493 || diff == 3 || diff == 5 || diff == 9)
19494 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19495 && (mode != DImode
19496 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19497 {
19498 /*
19499 * xorl dest,dest
19500 * cmpl op1,op2
19501 * setcc dest
19502 * lea cf(dest*(ct-cf)),dest
19503 *
19504 * Size 14.
19505 *
19506 * This also catches the degenerate setcc-only case.
19507 */
19508
19509 rtx tmp;
19510 int nops;
19511
19512 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19513
19514 nops = 0;
19515 /* On x86_64 the lea instruction operates on Pmode, so we need
19516 to get the arithmetic done in the proper mode to match. */
19517 if (diff == 1)
19518 tmp = copy_rtx (out);
19519 else
19520 {
19521 rtx out1;
19522 out1 = copy_rtx (out);
19523 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19524 nops++;
19525 if (diff & 1)
19526 {
19527 tmp = gen_rtx_PLUS (mode, tmp, out1);
19528 nops++;
19529 }
19530 }
19531 if (cf != 0)
19532 {
19533 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19534 nops++;
19535 }
19536 if (!rtx_equal_p (tmp, out))
19537 {
19538 if (nops == 1)
19539 out = force_operand (tmp, copy_rtx (out));
19540 else
19541 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19542 }
19543 if (!rtx_equal_p (out, operands[0]))
19544 emit_move_insn (operands[0], copy_rtx (out));
19545
19546 return true;
19547 }
19548
19549 /*
19550 * General case: Jumpful:
19551 * xorl dest,dest cmpl op1, op2
19552 * cmpl op1, op2 movl ct, dest
19553 * setcc dest jcc 1f
19554 * decl dest movl cf, dest
19555 * andl (cf-ct),dest 1:
19556 * addl ct,dest
19557 *
19558 * Size 20. Size 14.
19559 *
19560 * This is reasonably steep, but branch mispredict costs are
19561 * high on modern cpus, so consider failing only if optimizing
19562 * for space.
19563 */
19564
19565 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19566 && BRANCH_COST (optimize_insn_for_speed_p (),
19567 false) >= 2)
19568 {
19569 if (cf == 0)
19570 {
19571 enum machine_mode cmp_mode = GET_MODE (op0);
19572
19573 cf = ct;
19574 ct = 0;
19575
19576 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19577 {
19578 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19579
19580 /* We may be reversing unordered compare to normal compare,
19581 that is not valid in general (we may convert non-trapping
19582 condition to trapping one), however on i386 we currently
19583 emit all comparisons unordered. */
19584 code = reverse_condition_maybe_unordered (code);
19585 }
19586 else
19587 {
19588 code = reverse_condition (code);
19589 if (compare_code != UNKNOWN)
19590 compare_code = reverse_condition (compare_code);
19591 }
19592 }
19593
19594 if (compare_code != UNKNOWN)
19595 {
19596 /* notl op1 (if needed)
19597 sarl $31, op1
19598 andl (cf-ct), op1
19599 addl ct, op1
19600
19601 For x < 0 (resp. x <= -1) there will be no notl,
19602 so if possible swap the constants to get rid of the
19603 complement.
19604 True/false will be -1/0 while code below (store flag
19605 followed by decrement) is 0/-1, so the constants need
19606 to be exchanged once more. */
19607
19608 if (compare_code == GE || !cf)
19609 {
19610 code = reverse_condition (code);
19611 compare_code = LT;
19612 }
19613 else
19614 {
19615 HOST_WIDE_INT tmp = cf;
19616 cf = ct;
19617 ct = tmp;
19618 }
19619
19620 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19621 }
19622 else
19623 {
19624 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19625
19626 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19627 constm1_rtx,
19628 copy_rtx (out), 1, OPTAB_DIRECT);
19629 }
19630
19631 out = expand_simple_binop (mode, AND, copy_rtx (out),
19632 gen_int_mode (cf - ct, mode),
19633 copy_rtx (out), 1, OPTAB_DIRECT);
19634 if (ct)
19635 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19636 copy_rtx (out), 1, OPTAB_DIRECT);
19637 if (!rtx_equal_p (out, operands[0]))
19638 emit_move_insn (operands[0], copy_rtx (out));
19639
19640 return true;
19641 }
19642 }
19643
19644 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19645 {
19646 /* Try a few things more with specific constants and a variable. */
19647
19648 optab op;
19649 rtx var, orig_out, out, tmp;
19650
19651 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19652 return false;
19653
19654 /* If one of the two operands is an interesting constant, load a
19655 constant with the above and mask it in with a logical operation. */
19656
19657 if (CONST_INT_P (operands[2]))
19658 {
19659 var = operands[3];
19660 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19661 operands[3] = constm1_rtx, op = and_optab;
19662 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19663 operands[3] = const0_rtx, op = ior_optab;
19664 else
19665 return false;
19666 }
19667 else if (CONST_INT_P (operands[3]))
19668 {
19669 var = operands[2];
19670 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19671 operands[2] = constm1_rtx, op = and_optab;
19672 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19673 operands[2] = const0_rtx, op = ior_optab;
19674 else
19675 return false;
19676 }
19677 else
19678 return false;
19679
19680 orig_out = operands[0];
19681 tmp = gen_reg_rtx (mode);
19682 operands[0] = tmp;
19683
19684 /* Recurse to get the constant loaded. */
19685 if (ix86_expand_int_movcc (operands) == 0)
19686 return false;
19687
19688 /* Mask in the interesting variable. */
19689 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19690 OPTAB_WIDEN);
19691 if (!rtx_equal_p (out, orig_out))
19692 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19693
19694 return true;
19695 }
19696
19697 /*
19698 * For comparison with above,
19699 *
19700 * movl cf,dest
19701 * movl ct,tmp
19702 * cmpl op1,op2
19703 * cmovcc tmp,dest
19704 *
19705 * Size 15.
19706 */
19707
19708 if (! nonimmediate_operand (operands[2], mode))
19709 operands[2] = force_reg (mode, operands[2]);
19710 if (! nonimmediate_operand (operands[3], mode))
19711 operands[3] = force_reg (mode, operands[3]);
19712
19713 if (! register_operand (operands[2], VOIDmode)
19714 && (mode == QImode
19715 || ! register_operand (operands[3], VOIDmode)))
19716 operands[2] = force_reg (mode, operands[2]);
19717
19718 if (mode == QImode
19719 && ! register_operand (operands[3], VOIDmode))
19720 operands[3] = force_reg (mode, operands[3]);
19721
19722 emit_insn (compare_seq);
19723 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19724 gen_rtx_IF_THEN_ELSE (mode,
19725 compare_op, operands[2],
19726 operands[3])));
19727 return true;
19728 }
19729
19730 /* Swap, force into registers, or otherwise massage the two operands
19731 to an sse comparison with a mask result. Thus we differ a bit from
19732 ix86_prepare_fp_compare_args which expects to produce a flags result.
19733
19734 The DEST operand exists to help determine whether to commute commutative
19735 operators. The POP0/POP1 operands are updated in place. The new
19736 comparison code is returned, or UNKNOWN if not implementable. */
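/* For example, before AVX the SSE compare instructions only provide
   LT/LE (plus EQ/NE/ORDERED/UNORDERED) style predicates, so a GT
   comparison is handled below by swapping the operands and using LT.  */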
19737
19738 static enum rtx_code
19739 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19740 rtx *pop0, rtx *pop1)
19741 {
19742 rtx tmp;
19743
19744 switch (code)
19745 {
19746 case LTGT:
19747 case UNEQ:
19748 /* AVX supports all the needed comparisons. */
19749 if (TARGET_AVX)
19750 break;
19751 /* We have no LTGT as an operator. We could implement it with
19752 NE & ORDERED, but this requires an extra temporary. It's
19753 not clear that it's worth it. */
19754 return UNKNOWN;
19755
19756 case LT:
19757 case LE:
19758 case UNGT:
19759 case UNGE:
19760 /* These are supported directly. */
19761 break;
19762
19763 case EQ:
19764 case NE:
19765 case UNORDERED:
19766 case ORDERED:
19767 /* AVX has 3 operand comparisons, no need to swap anything. */
19768 if (TARGET_AVX)
19769 break;
19770 /* For commutative operators, try to canonicalize the destination
19771 operand to be first in the comparison - this helps reload to
19772 avoid extra moves. */
19773 if (!dest || !rtx_equal_p (dest, *pop1))
19774 break;
19775 /* FALLTHRU */
19776
19777 case GE:
19778 case GT:
19779 case UNLE:
19780 case UNLT:
19781 /* These are not supported directly before AVX, and furthermore
19782 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19783 comparison operands to transform into something that is
19784 supported. */
19785 tmp = *pop0;
19786 *pop0 = *pop1;
19787 *pop1 = tmp;
19788 code = swap_condition (code);
19789 break;
19790
19791 default:
19792 gcc_unreachable ();
19793 }
19794
19795 return code;
19796 }
19797
19798 /* Detect conditional moves that exactly match min/max operational
19799 semantics. Note that this is IEEE safe, as long as we don't
19800 interchange the operands.
19801
19802 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19803 and TRUE if the operation is successful and instructions are emitted. */
19804
19805 static bool
19806 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19807 rtx cmp_op1, rtx if_true, rtx if_false)
19808 {
19809 enum machine_mode mode;
19810 bool is_min;
19811 rtx tmp;
19812
19813 if (code == LT)
19814 ;
19815 else if (code == UNGE)
19816 {
19817 tmp = if_true;
19818 if_true = if_false;
19819 if_false = tmp;
19820 }
19821 else
19822 return false;
19823
19824 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19825 is_min = true;
19826 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19827 is_min = false;
19828 else
19829 return false;
19830
19831 mode = GET_MODE (dest);
19832
19833 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19834 but MODE may be a vector mode and thus not appropriate. */
19835 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19836 {
19837 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19838 rtvec v;
19839
19840 if_true = force_reg (mode, if_true);
19841 v = gen_rtvec (2, if_true, if_false);
19842 tmp = gen_rtx_UNSPEC (mode, v, u);
19843 }
19844 else
19845 {
19846 code = is_min ? SMIN : SMAX;
19847 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19848 }
19849
19850 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19851 return true;
19852 }
19853
19854 /* Expand an sse vector comparison. Return the register with the result. */
19855
19856 static rtx
19857 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19858 rtx op_true, rtx op_false)
19859 {
19860 enum machine_mode mode = GET_MODE (dest);
19861 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19862 rtx x;
19863
19864 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19865 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19866 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19867
19868 if (optimize
19869 || reg_overlap_mentioned_p (dest, op_true)
19870 || reg_overlap_mentioned_p (dest, op_false))
19871 dest = gen_reg_rtx (mode);
19872
19873 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19874 if (cmp_mode != mode)
19875 {
19876 x = force_reg (cmp_mode, x);
19877 convert_move (dest, x, false);
19878 }
19879 else
19880 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19881
19882 return dest;
19883 }
19884
19885 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19886 operations. This is used for both scalar and vector conditional moves. */
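/* Assuming CMP is an all-ones / all-zeros mask per element, the generic
   fallback below computes
     dest = (cmp & op_true) | (~cmp & op_false)
   while the special cases and blend instructions merely shortcut that.  */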
19887
19888 static void
19889 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19890 {
19891 enum machine_mode mode = GET_MODE (dest);
19892 rtx t2, t3, x;
19893
19894 if (vector_all_ones_operand (op_true, mode)
19895 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19896 {
19897 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19898 }
19899 else if (op_false == CONST0_RTX (mode))
19900 {
19901 op_true = force_reg (mode, op_true);
19902 x = gen_rtx_AND (mode, cmp, op_true);
19903 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19904 }
19905 else if (op_true == CONST0_RTX (mode))
19906 {
19907 op_false = force_reg (mode, op_false);
19908 x = gen_rtx_NOT (mode, cmp);
19909 x = gen_rtx_AND (mode, x, op_false);
19910 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19911 }
19912 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19913 {
19914 op_false = force_reg (mode, op_false);
19915 x = gen_rtx_IOR (mode, cmp, op_false);
19916 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19917 }
19918 else if (TARGET_XOP)
19919 {
19920 op_true = force_reg (mode, op_true);
19921
19922 if (!nonimmediate_operand (op_false, mode))
19923 op_false = force_reg (mode, op_false);
19924
19925 emit_insn (gen_rtx_SET (mode, dest,
19926 gen_rtx_IF_THEN_ELSE (mode, cmp,
19927 op_true,
19928 op_false)));
19929 }
19930 else
19931 {
19932 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19933
19934 if (!nonimmediate_operand (op_true, mode))
19935 op_true = force_reg (mode, op_true);
19936
19937 op_false = force_reg (mode, op_false);
19938
19939 switch (mode)
19940 {
19941 case V4SFmode:
19942 if (TARGET_SSE4_1)
19943 gen = gen_sse4_1_blendvps;
19944 break;
19945 case V2DFmode:
19946 if (TARGET_SSE4_1)
19947 gen = gen_sse4_1_blendvpd;
19948 break;
19949 case V16QImode:
19950 case V8HImode:
19951 case V4SImode:
19952 case V2DImode:
19953 if (TARGET_SSE4_1)
19954 {
19955 gen = gen_sse4_1_pblendvb;
19956 dest = gen_lowpart (V16QImode, dest);
19957 op_false = gen_lowpart (V16QImode, op_false);
19958 op_true = gen_lowpart (V16QImode, op_true);
19959 cmp = gen_lowpart (V16QImode, cmp);
19960 }
19961 break;
19962 case V8SFmode:
19963 if (TARGET_AVX)
19964 gen = gen_avx_blendvps256;
19965 break;
19966 case V4DFmode:
19967 if (TARGET_AVX)
19968 gen = gen_avx_blendvpd256;
19969 break;
19970 case V32QImode:
19971 case V16HImode:
19972 case V8SImode:
19973 case V4DImode:
19974 if (TARGET_AVX2)
19975 {
19976 gen = gen_avx2_pblendvb;
19977 dest = gen_lowpart (V32QImode, dest);
19978 op_false = gen_lowpart (V32QImode, op_false);
19979 op_true = gen_lowpart (V32QImode, op_true);
19980 cmp = gen_lowpart (V32QImode, cmp);
19981 }
19982 break;
19983 default:
19984 break;
19985 }
19986
19987 if (gen != NULL)
19988 emit_insn (gen (dest, op_false, op_true, cmp));
19989 else
19990 {
19991 op_true = force_reg (mode, op_true);
19992
19993 t2 = gen_reg_rtx (mode);
19994 if (optimize)
19995 t3 = gen_reg_rtx (mode);
19996 else
19997 t3 = dest;
19998
19999 x = gen_rtx_AND (mode, op_true, cmp);
20000 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20001
20002 x = gen_rtx_NOT (mode, cmp);
20003 x = gen_rtx_AND (mode, x, op_false);
20004 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20005
20006 x = gen_rtx_IOR (mode, t3, t2);
20007 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20008 }
20009 }
20010 }
20011
20012 /* Expand a floating-point conditional move. Return true if successful. */
20013
20014 bool
20015 ix86_expand_fp_movcc (rtx operands[])
20016 {
20017 enum machine_mode mode = GET_MODE (operands[0]);
20018 enum rtx_code code = GET_CODE (operands[1]);
20019 rtx tmp, compare_op;
20020 rtx op0 = XEXP (operands[1], 0);
20021 rtx op1 = XEXP (operands[1], 1);
20022
20023 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20024 {
20025 enum machine_mode cmode;
20026
20027 /* Since we've no cmove for sse registers, don't force bad register
20028 allocation just to gain access to it. Deny movcc when the
20029 comparison mode doesn't match the move mode. */
20030 cmode = GET_MODE (op0);
20031 if (cmode == VOIDmode)
20032 cmode = GET_MODE (op1);
20033 if (cmode != mode)
20034 return false;
20035
20036 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20037 if (code == UNKNOWN)
20038 return false;
20039
20040 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20041 operands[2], operands[3]))
20042 return true;
20043
20044 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20045 operands[2], operands[3]);
20046 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20047 return true;
20048 }
20049
20050 /* The floating point conditional move instructions don't directly
20051 support conditions resulting from a signed integer comparison. */
20052
20053 compare_op = ix86_expand_compare (code, op0, op1);
20054 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20055 {
20056 tmp = gen_reg_rtx (QImode);
20057 ix86_expand_setcc (tmp, code, op0, op1);
20058
20059 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20060 }
20061
20062 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20063 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20064 operands[2], operands[3])));
20065
20066 return true;
20067 }
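/* Editorial sketch (illustrative, not GCC code): when the comparison cannot
   be consumed directly by fcmov, the code above first materializes it as a
   0/1 byte with setcc and then cmoves on "byte != 0".  In scalar C terms,
   with hypothetical names:

     static double
     fp_movcc_fallback (int a, int b, double t, double f)
     {
       // setcc: reduce the signed comparison to a 0/1 value ...
       unsigned char c = (unsigned char) (a < b);
       // ... then select on the fcmov-friendly condition c != 0.
       return (c != 0) ? t : f;
     }
*/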
20068
20069 /* Expand a floating-point vector conditional move; a vcond operation
20070 rather than a movcc operation. */
20071
20072 bool
20073 ix86_expand_fp_vcond (rtx operands[])
20074 {
20075 enum rtx_code code = GET_CODE (operands[3]);
20076 rtx cmp;
20077
20078 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20079 &operands[4], &operands[5]);
20080 if (code == UNKNOWN)
20081 {
20082 rtx temp;
20083 switch (GET_CODE (operands[3]))
20084 {
20085 case LTGT:
20086 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20087 operands[5], operands[0], operands[0]);
20088 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20089 operands[5], operands[1], operands[2]);
20090 code = AND;
20091 break;
20092 case UNEQ:
20093 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20094 operands[5], operands[0], operands[0]);
20095 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20096 operands[5], operands[1], operands[2]);
20097 code = IOR;
20098 break;
20099 default:
20100 gcc_unreachable ();
20101 }
20102 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20103 OPTAB_DIRECT);
20104 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20105 return true;
20106 }
20107
20108 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20109 operands[5], operands[1], operands[2]))
20110 return true;
20111
20112 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20113 operands[1], operands[2]);
20114 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20115 return true;
20116 }
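/* Editorial sketch (illustrative, not GCC code): the UNKNOWN case above
   splits LTGT and UNEQ into two supported SSE compares combined with
   AND/IOR.  Modelled on scalar doubles (math.h provides isnan):

     #include <math.h>

     static int
     ltgt (double a, double b)
     {
       // LTGT == ORDERED (a, b) AND NE (a, b).
       return !isnan (a) && !isnan (b) && a != b;
     }

     static int
     uneq (double a, double b)
     {
       // UNEQ == UNORDERED (a, b) IOR EQ (a, b).
       return isnan (a) || isnan (b) || a == b;
     }
*/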
20117
20118 /* Expand a signed/unsigned integral vector conditional move. */
20119
20120 bool
20121 ix86_expand_int_vcond (rtx operands[])
20122 {
20123 enum machine_mode data_mode = GET_MODE (operands[0]);
20124 enum machine_mode mode = GET_MODE (operands[4]);
20125 enum rtx_code code = GET_CODE (operands[3]);
20126 bool negate = false;
20127 rtx x, cop0, cop1;
20128
20129 cop0 = operands[4];
20130 cop1 = operands[5];
20131
20132 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20133 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20134 if ((code == LT || code == GE)
20135 && data_mode == mode
20136 && cop1 == CONST0_RTX (mode)
20137 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20138 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20139 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20140 && (GET_MODE_SIZE (data_mode) == 16
20141 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20142 {
20143 rtx negop = operands[2 - (code == LT)];
20144 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20145 if (negop == CONST1_RTX (data_mode))
20146 {
20147 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20148 operands[0], 1, OPTAB_DIRECT);
20149 if (res != operands[0])
20150 emit_move_insn (operands[0], res);
20151 return true;
20152 }
20153 else if (GET_MODE_INNER (data_mode) != DImode
20154 && vector_all_ones_operand (negop, data_mode))
20155 {
20156 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20157 operands[0], 0, OPTAB_DIRECT);
20158 if (res != operands[0])
20159 emit_move_insn (operands[0], res);
20160 return true;
20161 }
20162 }
20163
20164 if (!nonimmediate_operand (cop1, mode))
20165 cop1 = force_reg (mode, cop1);
20166 if (!general_operand (operands[1], data_mode))
20167 operands[1] = force_reg (data_mode, operands[1]);
20168 if (!general_operand (operands[2], data_mode))
20169 operands[2] = force_reg (data_mode, operands[2]);
20170
20171 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20172 if (TARGET_XOP
20173 && (mode == V16QImode || mode == V8HImode
20174 || mode == V4SImode || mode == V2DImode))
20175 ;
20176 else
20177 {
20178 /* Canonicalize the comparison to EQ, GT, GTU. */
20179 switch (code)
20180 {
20181 case EQ:
20182 case GT:
20183 case GTU:
20184 break;
20185
20186 case NE:
20187 case LE:
20188 case LEU:
20189 code = reverse_condition (code);
20190 negate = true;
20191 break;
20192
20193 case GE:
20194 case GEU:
20195 code = reverse_condition (code);
20196 negate = true;
20197 /* FALLTHRU */
20198
20199 case LT:
20200 case LTU:
20201 code = swap_condition (code);
20202 x = cop0, cop0 = cop1, cop1 = x;
20203 break;
20204
20205 default:
20206 gcc_unreachable ();
20207 }
20208
20209 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20210 if (mode == V2DImode)
20211 {
20212 switch (code)
20213 {
20214 case EQ:
20215 /* SSE4.1 supports EQ. */
20216 if (!TARGET_SSE4_1)
20217 return false;
20218 break;
20219
20220 case GT:
20221 case GTU:
20222 /* SSE4.2 supports GT/GTU. */
20223 if (!TARGET_SSE4_2)
20224 return false;
20225 break;
20226
20227 default:
20228 gcc_unreachable ();
20229 }
20230 }
20231
20232 /* Unsigned parallel compare is not supported by the hardware.
20233 Play some tricks to turn this into a signed comparison
20234 against 0. */
20235 if (code == GTU)
20236 {
20237 cop0 = force_reg (mode, cop0);
20238
20239 switch (mode)
20240 {
20241 case V8SImode:
20242 case V4DImode:
20243 case V4SImode:
20244 case V2DImode:
20245 {
20246 rtx t1, t2, mask;
20247 rtx (*gen_sub3) (rtx, rtx, rtx);
20248
20249 switch (mode)
20250 {
20251 case V8SImode: gen_sub3 = gen_subv8si3; break;
20252 case V4DImode: gen_sub3 = gen_subv4di3; break;
20253 case V4SImode: gen_sub3 = gen_subv4si3; break;
20254 case V2DImode: gen_sub3 = gen_subv2di3; break;
20255 default:
20256 gcc_unreachable ();
20257 }
20258 /* Subtract (-(INT MAX) - 1) from both operands to make
20259 them signed. */
20260 mask = ix86_build_signbit_mask (mode, true, false);
20261 t1 = gen_reg_rtx (mode);
20262 emit_insn (gen_sub3 (t1, cop0, mask));
20263
20264 t2 = gen_reg_rtx (mode);
20265 emit_insn (gen_sub3 (t2, cop1, mask));
20266
20267 cop0 = t1;
20268 cop1 = t2;
20269 code = GT;
20270 }
20271 break;
20272
20273 case V32QImode:
20274 case V16HImode:
20275 case V16QImode:
20276 case V8HImode:
20277 /* Perform a parallel unsigned saturating subtraction. */
20278 x = gen_reg_rtx (mode);
20279 emit_insn (gen_rtx_SET (VOIDmode, x,
20280 gen_rtx_US_MINUS (mode, cop0, cop1)));
20281
20282 cop0 = x;
20283 cop1 = CONST0_RTX (mode);
20284 code = EQ;
20285 negate = !negate;
20286 break;
20287
20288 default:
20289 gcc_unreachable ();
20290 }
20291 }
20292 }
20293
20294 /* Allow the comparison to be done in one mode, but the movcc to
20295 happen in another mode. */
20296 if (data_mode == mode)
20297 {
20298 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20299 operands[1+negate], operands[2-negate]);
20300 }
20301 else
20302 {
20303 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20304 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20305 code, cop0, cop1,
20306 operands[1+negate], operands[2-negate]);
20307 x = gen_lowpart (data_mode, x);
20308 }
20309
20310 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20311 operands[2-negate]);
20312 return true;
20313 }
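/* Editorial sketch (illustrative, not GCC code): the GTU lowering above uses
   the identity (a >u b) == ((a ^ 0x80000000) >s (b ^ 0x80000000)) for 32-bit
   lanes -- flipping the sign bit (equivalently, subtracting INT_MIN) turns an
   unsigned compare into a signed one.  A scalar check of that identity:

     static int
     gtu_via_signed (unsigned int a, unsigned int b)
     {
       int sa = (int) (a ^ 0x80000000u);
       int sb = (int) (b ^ 0x80000000u);
       // Equals (a > b) evaluated on the unsigned inputs.
       return sa > sb;
     }

   For QImode/HImode lanes the code instead uses an unsigned saturating
   subtract: (a -us b) is nonzero exactly when a >u b, expressed above as an
   EQ against zero with the result negated.  */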
20314
20315 /* Expand a variable vector permutation. */
20316
20317 void
20318 ix86_expand_vec_perm (rtx operands[])
20319 {
20320 rtx target = operands[0];
20321 rtx op0 = operands[1];
20322 rtx op1 = operands[2];
20323 rtx mask = operands[3];
20324 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20325 enum machine_mode mode = GET_MODE (op0);
20326 enum machine_mode maskmode = GET_MODE (mask);
20327 int w, e, i;
20328 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20329
20330 /* Number of elements in the vector. */
20331 w = GET_MODE_NUNITS (mode);
20332 e = GET_MODE_UNIT_SIZE (mode);
20333 gcc_assert (w <= 32);
20334
20335 if (TARGET_AVX2)
20336 {
20337 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20338 {
20339 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20340 a constant shuffle operand. With a tiny bit of effort we can
20341 use VPERMD instead. A re-interpretation stall for V4DFmode is
20342 unfortunate but there's no avoiding it.
20343 Similarly for V16HImode we don't have instructions for variable
20344 shuffling, while for V32QImode we can, after preparing suitable
20345 masks, use vpshufb; vpshufb; vpermq; vpor. */
20346
20347 if (mode == V16HImode)
20348 {
20349 maskmode = mode = V32QImode;
20350 w = 32;
20351 e = 1;
20352 }
20353 else
20354 {
20355 maskmode = mode = V8SImode;
20356 w = 8;
20357 e = 4;
20358 }
20359 t1 = gen_reg_rtx (maskmode);
20360
20361 /* Replicate the low bits of the V4DImode mask into V8SImode:
20362 mask = { A B C D }
20363 t1 = { A A B B C C D D }. */
20364 for (i = 0; i < w / 2; ++i)
20365 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20366 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20367 vt = force_reg (maskmode, vt);
20368 mask = gen_lowpart (maskmode, mask);
20369 if (maskmode == V8SImode)
20370 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20371 else
20372 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20373
20374 /* Multiply the shuffle indices by two. */
20375 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20376 OPTAB_DIRECT);
20377
20378 /* Add one to the odd shuffle indices:
20379 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20380 for (i = 0; i < w / 2; ++i)
20381 {
20382 vec[i * 2] = const0_rtx;
20383 vec[i * 2 + 1] = const1_rtx;
20384 }
20385 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20386 vt = force_const_mem (maskmode, vt);
20387 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20388 OPTAB_DIRECT);
20389
20390 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20391 operands[3] = mask = t1;
20392 target = gen_lowpart (mode, target);
20393 op0 = gen_lowpart (mode, op0);
20394 op1 = gen_lowpart (mode, op1);
20395 }
20396
20397 switch (mode)
20398 {
20399 case V8SImode:
20400 /* The VPERMD and VPERMPS instructions already properly ignore
20401 the high bits of the shuffle elements. No need for us to
20402 perform an AND ourselves. */
20403 if (one_operand_shuffle)
20404 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20405 else
20406 {
20407 t1 = gen_reg_rtx (V8SImode);
20408 t2 = gen_reg_rtx (V8SImode);
20409 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20410 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20411 goto merge_two;
20412 }
20413 return;
20414
20415 case V8SFmode:
20416 mask = gen_lowpart (V8SFmode, mask);
20417 if (one_operand_shuffle)
20418 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20419 else
20420 {
20421 t1 = gen_reg_rtx (V8SFmode);
20422 t2 = gen_reg_rtx (V8SFmode);
20423 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20424 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20425 goto merge_two;
20426 }
20427 return;
20428
20429 case V4SImode:
20430 /* By combining the two 128-bit input vectors into one 256-bit
20431 input vector, we can use VPERMD and VPERMPS for the full
20432 two-operand shuffle. */
20433 t1 = gen_reg_rtx (V8SImode);
20434 t2 = gen_reg_rtx (V8SImode);
20435 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20436 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20437 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20438 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20439 return;
20440
20441 case V4SFmode:
20442 t1 = gen_reg_rtx (V8SFmode);
20443 t2 = gen_reg_rtx (V8SImode);
20444 mask = gen_lowpart (V4SImode, mask);
20445 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20446 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20447 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20448 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20449 return;
20450
20451 case V32QImode:
20452 t1 = gen_reg_rtx (V32QImode);
20453 t2 = gen_reg_rtx (V32QImode);
20454 t3 = gen_reg_rtx (V32QImode);
20455 vt2 = GEN_INT (128);
20456 for (i = 0; i < 32; i++)
20457 vec[i] = vt2;
20458 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20459 vt = force_reg (V32QImode, vt);
20460 for (i = 0; i < 32; i++)
20461 vec[i] = i < 16 ? vt2 : const0_rtx;
20462 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20463 vt2 = force_reg (V32QImode, vt2);
20464 /* From mask create two adjusted masks, which contain the same
20465 bits as mask in the low 7 bits of each vector element.
20466 The first mask will have the most significant bit clear
20467 if it requests element from the same 128-bit lane
20468 and MSB set if it requests element from the other 128-bit lane.
20469 The second mask will have the opposite values of the MSB,
20470 and additionally will have its 128-bit lanes swapped.
20471 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20472 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20473 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20474 stands for other 12 bytes. */
20475 /* The bit that tells whether an element comes from the same lane or
20476 the other lane is bit 4, so shift it up by 3 to the MSB position. */
20477 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20478 gen_lowpart (V4DImode, mask),
20479 GEN_INT (3)));
20480 /* Clear MSB bits from the mask just in case it had them set. */
20481 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20482 /* After this t1 will have MSB set for elements from other lane. */
20483 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20484 /* Clear bits other than MSB. */
20485 emit_insn (gen_andv32qi3 (t1, t1, vt));
20486 /* Or in the lower bits from mask into t3. */
20487 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20488 /* And invert MSB bits in t1, so MSB is set for elements from the same
20489 lane. */
20490 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20491 /* Swap 128-bit lanes in t3. */
20492 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20493 gen_lowpart (V4DImode, t3),
20494 const2_rtx, GEN_INT (3),
20495 const0_rtx, const1_rtx));
20496 /* And or in the lower bits from mask into t1. */
20497 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20498 if (one_operand_shuffle)
20499 {
20500 /* Each of these shuffles will put 0s in places where an
20501 element from the other 128-bit lane is needed; otherwise it
20502 will shuffle in the requested value. */
20503 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20504 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20505 /* For t3 the 128-bit lanes are swapped again. */
20506 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20507 gen_lowpart (V4DImode, t3),
20508 const2_rtx, GEN_INT (3),
20509 const0_rtx, const1_rtx));
20510 /* And oring both together leads to the result. */
20511 emit_insn (gen_iorv32qi3 (target, t1, t3));
20512 return;
20513 }
20514
20515 t4 = gen_reg_rtx (V32QImode);
20516 /* Similarly to the one_operand_shuffle code above, just repeated
20517 twice, once for each operand. The merge_two: code below will
20518 merge the two results together. */
20519 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20520 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20521 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20522 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20523 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20524 gen_lowpart (V4DImode, t4),
20525 const2_rtx, GEN_INT (3),
20526 const0_rtx, const1_rtx));
20527 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20528 gen_lowpart (V4DImode, t3),
20529 const2_rtx, GEN_INT (3),
20530 const0_rtx, const1_rtx));
20531 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20532 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20533 t1 = t4;
20534 t2 = t3;
20535 goto merge_two;
20536
20537 default:
20538 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20539 break;
20540 }
20541 }
20542
20543 if (TARGET_XOP)
20544 {
20545 /* The XOP VPPERM insn supports three inputs. By ignoring the
20546 one_operand_shuffle special case, we avoid creating another
20547 set of constant vectors in memory. */
20548 one_operand_shuffle = false;
20549
20550 /* mask = mask & {2*w-1, ...} */
20551 vt = GEN_INT (2*w - 1);
20552 }
20553 else
20554 {
20555 /* mask = mask & {w-1, ...} */
20556 vt = GEN_INT (w - 1);
20557 }
20558
20559 for (i = 0; i < w; i++)
20560 vec[i] = vt;
20561 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20562 mask = expand_simple_binop (maskmode, AND, mask, vt,
20563 NULL_RTX, 0, OPTAB_DIRECT);
20564
20565 /* For non-QImode operations, convert the word permutation control
20566 into a byte permutation control. */
20567 if (mode != V16QImode)
20568 {
20569 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20570 GEN_INT (exact_log2 (e)),
20571 NULL_RTX, 0, OPTAB_DIRECT);
20572
20573 /* Convert mask to vector of chars. */
20574 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20575
20576 /* Replicate each of the input bytes into byte positions:
20577 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20578 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20579 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20580 for (i = 0; i < 16; ++i)
20581 vec[i] = GEN_INT (i/e * e);
20582 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20583 vt = force_const_mem (V16QImode, vt);
20584 if (TARGET_XOP)
20585 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20586 else
20587 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20588
20589 /* Convert it into the byte positions by doing
20590 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}. */
20591 for (i = 0; i < 16; ++i)
20592 vec[i] = GEN_INT (i % e);
20593 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20594 vt = force_const_mem (V16QImode, vt);
20595 emit_insn (gen_addv16qi3 (mask, mask, vt));
20596 }
20597
20598 /* The actual shuffle operations all operate on V16QImode. */
20599 op0 = gen_lowpart (V16QImode, op0);
20600 op1 = gen_lowpart (V16QImode, op1);
20601 target = gen_lowpart (V16QImode, target);
20602
20603 if (TARGET_XOP)
20604 {
20605 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20606 }
20607 else if (one_operand_shuffle)
20608 {
20609 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20610 }
20611 else
20612 {
20613 rtx xops[6];
20614 bool ok;
20615
20616 /* Shuffle the two input vectors independently. */
20617 t1 = gen_reg_rtx (V16QImode);
20618 t2 = gen_reg_rtx (V16QImode);
20619 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20620 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20621
20622 merge_two:
20623 /* Then merge them together. The key is whether any given control
20624 element contained a bit set that indicates the second word. */
20625 mask = operands[3];
20626 vt = GEN_INT (w);
20627 if (maskmode == V2DImode && !TARGET_SSE4_1)
20628 {
20629 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20630 more shuffle to convert the V2DI input mask into a V4SI
20631 input mask. At that point the masking that
20632 ix86_expand_int_vcond performs will work as desired. */
20633 rtx t3 = gen_reg_rtx (V4SImode);
20634 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20635 const0_rtx, const0_rtx,
20636 const2_rtx, const2_rtx));
20637 mask = t3;
20638 maskmode = V4SImode;
20639 e = w = 4;
20640 }
20641
20642 for (i = 0; i < w; i++)
20643 vec[i] = vt;
20644 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20645 vt = force_reg (maskmode, vt);
20646 mask = expand_simple_binop (maskmode, AND, mask, vt,
20647 NULL_RTX, 0, OPTAB_DIRECT);
20648
20649 xops[0] = gen_lowpart (mode, operands[0]);
20650 xops[1] = gen_lowpart (mode, t2);
20651 xops[2] = gen_lowpart (mode, t1);
20652 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20653 xops[4] = mask;
20654 xops[5] = vt;
20655 ok = ix86_expand_int_vcond (xops);
20656 gcc_assert (ok);
20657 }
20658 }
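/* Editorial sketch (illustrative, not GCC code): in the pshufb-based path
   above, a word-granular permutation control is turned into a byte-granular
   one by multiplying each index by the element size E and then adding the
   byte offsets 0..E-1.  A scalar model for V4SI (E == 4), with hypothetical
   buffers:

     static void
     expand_si_control_to_bytes (const unsigned char idx[4],
                                 unsigned char bytes[16])
     {
       int i;
       for (i = 0; i < 16; i++)
         // Each dword index contributes four consecutive byte indices.
         bytes[i] = (unsigned char) (idx[i / 4] * 4 + i % 4);
     }

   This mirrors the ASHIFT by log2(E), the pshufb replication with
   {0,0,0,0, 4,4,4,4, ...} and the final add of {0,1,2,3, 0,1,2,3, ...}.  */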
20659
20660 /* Unpack SRC into DEST, whose mode is the next wider integer vector
20661 type. UNSIGNED_P is true if we should do zero extension, else sign
20662 extension. HIGH_P is true if we want the N/2 high elements, else
20663 the low elements. */
20663
20664 void
20665 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20666 {
20667 enum machine_mode imode = GET_MODE (src);
20668 rtx tmp;
20669
20670 if (TARGET_SSE4_1)
20671 {
20672 rtx (*unpack)(rtx, rtx);
20673 rtx (*extract)(rtx, rtx) = NULL;
20674 enum machine_mode halfmode = BLKmode;
20675
20676 switch (imode)
20677 {
20678 case V32QImode:
20679 if (unsigned_p)
20680 unpack = gen_avx2_zero_extendv16qiv16hi2;
20681 else
20682 unpack = gen_avx2_sign_extendv16qiv16hi2;
20683 halfmode = V16QImode;
20684 extract
20685 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20686 break;
20687 case V16HImode:
20688 if (unsigned_p)
20689 unpack = gen_avx2_zero_extendv8hiv8si2;
20690 else
20691 unpack = gen_avx2_sign_extendv8hiv8si2;
20692 halfmode = V8HImode;
20693 extract
20694 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20695 break;
20696 case V8SImode:
20697 if (unsigned_p)
20698 unpack = gen_avx2_zero_extendv4siv4di2;
20699 else
20700 unpack = gen_avx2_sign_extendv4siv4di2;
20701 halfmode = V4SImode;
20702 extract
20703 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20704 break;
20705 case V16QImode:
20706 if (unsigned_p)
20707 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20708 else
20709 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20710 break;
20711 case V8HImode:
20712 if (unsigned_p)
20713 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20714 else
20715 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20716 break;
20717 case V4SImode:
20718 if (unsigned_p)
20719 unpack = gen_sse4_1_zero_extendv2siv2di2;
20720 else
20721 unpack = gen_sse4_1_sign_extendv2siv2di2;
20722 break;
20723 default:
20724 gcc_unreachable ();
20725 }
20726
20727 if (GET_MODE_SIZE (imode) == 32)
20728 {
20729 tmp = gen_reg_rtx (halfmode);
20730 emit_insn (extract (tmp, src));
20731 }
20732 else if (high_p)
20733 {
20734 /* Shift higher 8 bytes to lower 8 bytes. */
20735 tmp = gen_reg_rtx (imode);
20736 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20737 gen_lowpart (V1TImode, src),
20738 GEN_INT (64)));
20739 }
20740 else
20741 tmp = src;
20742
20743 emit_insn (unpack (dest, tmp));
20744 }
20745 else
20746 {
20747 rtx (*unpack)(rtx, rtx, rtx);
20748
20749 switch (imode)
20750 {
20751 case V16QImode:
20752 if (high_p)
20753 unpack = gen_vec_interleave_highv16qi;
20754 else
20755 unpack = gen_vec_interleave_lowv16qi;
20756 break;
20757 case V8HImode:
20758 if (high_p)
20759 unpack = gen_vec_interleave_highv8hi;
20760 else
20761 unpack = gen_vec_interleave_lowv8hi;
20762 break;
20763 case V4SImode:
20764 if (high_p)
20765 unpack = gen_vec_interleave_highv4si;
20766 else
20767 unpack = gen_vec_interleave_lowv4si;
20768 break;
20769 default:
20770 gcc_unreachable ();
20771 }
20772
20773 if (unsigned_p)
20774 tmp = force_reg (imode, CONST0_RTX (imode));
20775 else
20776 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20777 src, pc_rtx, pc_rtx);
20778
20779 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20780 }
20781 }
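/* Editorial sketch (illustrative, not GCC code): without SSE4.1, sign
   extension is done by interleaving each element with a lane that is
   all-ones when the element is negative and zero otherwise (the GT compare
   against zero above).  Per element this amounts to:

     static int
     sign_extend_hi_to_si (short lo)
     {
       // High half is 0xffff for negative inputs, 0x0000 otherwise, which is
       // exactly what interleaving with (0 > lo ? -1 : 0) produces.
       unsigned int hi = (0 > lo) ? 0xffffu : 0u;
       return (int) (((unsigned int) (unsigned short) lo) | (hi << 16));
     }

   Zero extension interleaves with a constant-zero vector instead.  */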
20782
20783 /* Expand conditional increment or decrement using adc/sbb instructions.
20784 The default case using setcc followed by the conditional move can be
20785 done by generic code. */
20786 bool
20787 ix86_expand_int_addcc (rtx operands[])
20788 {
20789 enum rtx_code code = GET_CODE (operands[1]);
20790 rtx flags;
20791 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20792 rtx compare_op;
20793 rtx val = const0_rtx;
20794 bool fpcmp = false;
20795 enum machine_mode mode;
20796 rtx op0 = XEXP (operands[1], 0);
20797 rtx op1 = XEXP (operands[1], 1);
20798
20799 if (operands[3] != const1_rtx
20800 && operands[3] != constm1_rtx)
20801 return false;
20802 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20803 return false;
20804 code = GET_CODE (compare_op);
20805
20806 flags = XEXP (compare_op, 0);
20807
20808 if (GET_MODE (flags) == CCFPmode
20809 || GET_MODE (flags) == CCFPUmode)
20810 {
20811 fpcmp = true;
20812 code = ix86_fp_compare_code_to_integer (code);
20813 }
20814
20815 if (code != LTU)
20816 {
20817 val = constm1_rtx;
20818 if (fpcmp)
20819 PUT_CODE (compare_op,
20820 reverse_condition_maybe_unordered
20821 (GET_CODE (compare_op)));
20822 else
20823 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20824 }
20825
20826 mode = GET_MODE (operands[0]);
20827
20828 /* Construct either adc or sbb insn. */
20829 if ((code == LTU) == (operands[3] == constm1_rtx))
20830 {
20831 switch (mode)
20832 {
20833 case QImode:
20834 insn = gen_subqi3_carry;
20835 break;
20836 case HImode:
20837 insn = gen_subhi3_carry;
20838 break;
20839 case SImode:
20840 insn = gen_subsi3_carry;
20841 break;
20842 case DImode:
20843 insn = gen_subdi3_carry;
20844 break;
20845 default:
20846 gcc_unreachable ();
20847 }
20848 }
20849 else
20850 {
20851 switch (mode)
20852 {
20853 case QImode:
20854 insn = gen_addqi3_carry;
20855 break;
20856 case HImode:
20857 insn = gen_addhi3_carry;
20858 break;
20859 case SImode:
20860 insn = gen_addsi3_carry;
20861 break;
20862 case DImode:
20863 insn = gen_adddi3_carry;
20864 break;
20865 default:
20866 gcc_unreachable ();
20867 }
20868 }
20869 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20870
20871 return true;
20872 }
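/* Editorial sketch (illustrative, not GCC code): ix86_expand_int_addcc maps
   a conditional +/-1 onto adc/sbb by folding the condition into the carry
   flag.  Modelled on scalars, with CARRY standing in for CF:

     static unsigned int
     cond_increment (unsigned int x, int carry)
     {
       // adc x, 0: adds one exactly when the carry flag is set.
       return x + 0u + (unsigned int) (carry != 0);
     }

     static unsigned int
     cond_decrement (unsigned int x, int carry)
     {
       // sbb x, 0: subtracts one exactly when the carry flag is set.
       return x - 0u - (unsigned int) (carry != 0);
     }

   The val/constm1_rtx juggling above selects between the adc and sbb forms
   and reverses the comparison when needed so that CF encodes it.  */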
20873
20874
20875 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20876 but works for floating-point parameters and non-offsettable memories.
20877 For pushes, it returns just stack offsets; the values will be saved
20878 in the right order. At most four parts are generated. */
20879
20880 static int
20881 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20882 {
20883 int size;
20884
20885 if (!TARGET_64BIT)
20886 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20887 else
20888 size = (GET_MODE_SIZE (mode) + 4) / 8;
20889
20890 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20891 gcc_assert (size >= 2 && size <= 4);
20892
20893 /* Optimize constant pool reference to immediates. This is used by fp
20894 moves, that force all constants to memory to allow combining. */
20895 if (MEM_P (operand) && MEM_READONLY_P (operand))
20896 {
20897 rtx tmp = maybe_get_pool_constant (operand);
20898 if (tmp)
20899 operand = tmp;
20900 }
20901
20902 if (MEM_P (operand) && !offsettable_memref_p (operand))
20903 {
20904 /* The only non-offsettable memories we handle are pushes. */
20905 int ok = push_operand (operand, VOIDmode);
20906
20907 gcc_assert (ok);
20908
20909 operand = copy_rtx (operand);
20910 PUT_MODE (operand, word_mode);
20911 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20912 return size;
20913 }
20914
20915 if (GET_CODE (operand) == CONST_VECTOR)
20916 {
20917 enum machine_mode imode = int_mode_for_mode (mode);
20918 /* Caution: if we looked through a constant pool memory above,
20919 the operand may actually have a different mode now. That's
20920 ok, since we want to pun this all the way back to an integer. */
20921 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20922 gcc_assert (operand != NULL);
20923 mode = imode;
20924 }
20925
20926 if (!TARGET_64BIT)
20927 {
20928 if (mode == DImode)
20929 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20930 else
20931 {
20932 int i;
20933
20934 if (REG_P (operand))
20935 {
20936 gcc_assert (reload_completed);
20937 for (i = 0; i < size; i++)
20938 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20939 }
20940 else if (offsettable_memref_p (operand))
20941 {
20942 operand = adjust_address (operand, SImode, 0);
20943 parts[0] = operand;
20944 for (i = 1; i < size; i++)
20945 parts[i] = adjust_address (operand, SImode, 4 * i);
20946 }
20947 else if (GET_CODE (operand) == CONST_DOUBLE)
20948 {
20949 REAL_VALUE_TYPE r;
20950 long l[4];
20951
20952 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20953 switch (mode)
20954 {
20955 case TFmode:
20956 real_to_target (l, &r, mode);
20957 parts[3] = gen_int_mode (l[3], SImode);
20958 parts[2] = gen_int_mode (l[2], SImode);
20959 break;
20960 case XFmode:
20961 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20962 long double may not be 80-bit. */
20963 real_to_target (l, &r, mode);
20964 parts[2] = gen_int_mode (l[2], SImode);
20965 break;
20966 case DFmode:
20967 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20968 break;
20969 default:
20970 gcc_unreachable ();
20971 }
20972 parts[1] = gen_int_mode (l[1], SImode);
20973 parts[0] = gen_int_mode (l[0], SImode);
20974 }
20975 else
20976 gcc_unreachable ();
20977 }
20978 }
20979 else
20980 {
20981 if (mode == TImode)
20982 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20983 if (mode == XFmode || mode == TFmode)
20984 {
20985 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20986 if (REG_P (operand))
20987 {
20988 gcc_assert (reload_completed);
20989 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20990 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20991 }
20992 else if (offsettable_memref_p (operand))
20993 {
20994 operand = adjust_address (operand, DImode, 0);
20995 parts[0] = operand;
20996 parts[1] = adjust_address (operand, upper_mode, 8);
20997 }
20998 else if (GET_CODE (operand) == CONST_DOUBLE)
20999 {
21000 REAL_VALUE_TYPE r;
21001 long l[4];
21002
21003 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21004 real_to_target (l, &r, mode);
21005
21006 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21007 if (HOST_BITS_PER_WIDE_INT >= 64)
21008 parts[0]
21009 = gen_int_mode
21010 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21011 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21012 DImode);
21013 else
21014 parts[0] = immed_double_const (l[0], l[1], DImode);
21015
21016 if (upper_mode == SImode)
21017 parts[1] = gen_int_mode (l[2], SImode);
21018 else if (HOST_BITS_PER_WIDE_INT >= 64)
21019 parts[1]
21020 = gen_int_mode
21021 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21022 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21023 DImode);
21024 else
21025 parts[1] = immed_double_const (l[2], l[3], DImode);
21026 }
21027 else
21028 gcc_unreachable ();
21029 }
21030 }
21031
21032 return size;
21033 }
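/* Editorial sketch (illustrative, not GCC code): the part count computed at
   the top of the function is just the operand size divided by the word size,
   with XFmode pinned to three 4-byte parts on 32-bit targets.  A model of
   that computation, using hypothetical parameter names:

     static int
     nparts_for_size (int mode_size_bytes, int target_64bit, int is_xfmode)
     {
       if (!target_64bit)
         // 32-bit: XFmode becomes 3 SImode parts, other modes size/4 parts.
         return is_xfmode ? 3 : mode_size_bytes / 4;
       // 64-bit: DImode parts, with the 10/12-byte XFmode case bumped to 2.
       return (mode_size_bytes + 4) / 8;
     }
*/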
21034
21035 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21036 All required insns are emitted by this function itself; no normal
21037 moves are left for the caller. Operands 2-5 receive the destination
21038 parts and operands 6-9 the source parts, in the correct order. */
21039
21040 void
21041 ix86_split_long_move (rtx operands[])
21042 {
21043 rtx part[2][4];
21044 int nparts, i, j;
21045 int push = 0;
21046 int collisions = 0;
21047 enum machine_mode mode = GET_MODE (operands[0]);
21048 bool collisionparts[4];
21049
21050 /* The DFmode expanders may ask us to move a double.
21051 For a 64-bit target this is a single move. By hiding the fact
21052 here we simplify i386.md splitters. */
21053 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21054 {
21055 /* Optimize constant pool reference to immediates. This is used by
21056 fp moves, that force all constants to memory to allow combining. */
21057
21058 if (MEM_P (operands[1])
21059 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21060 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21061 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21062 if (push_operand (operands[0], VOIDmode))
21063 {
21064 operands[0] = copy_rtx (operands[0]);
21065 PUT_MODE (operands[0], word_mode);
21066 }
21067 else
21068 operands[0] = gen_lowpart (DImode, operands[0]);
21069 operands[1] = gen_lowpart (DImode, operands[1]);
21070 emit_move_insn (operands[0], operands[1]);
21071 return;
21072 }
21073
21074 /* The only non-offsettable memory we handle is a push. */
21075 if (push_operand (operands[0], VOIDmode))
21076 push = 1;
21077 else
21078 gcc_assert (!MEM_P (operands[0])
21079 || offsettable_memref_p (operands[0]));
21080
21081 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21082 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21083
21084 /* When emitting a push, watch out for source operands on the stack. */
21085 if (push && MEM_P (operands[1])
21086 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21087 {
21088 rtx src_base = XEXP (part[1][nparts - 1], 0);
21089
21090 /* Compensate for the stack decrement by 4. */
21091 if (!TARGET_64BIT && nparts == 3
21092 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21093 src_base = plus_constant (Pmode, src_base, 4);
21094
21095 /* src_base refers to the stack pointer and is
21096 automatically decreased by emitted push. */
21097 for (i = 0; i < nparts; i++)
21098 part[1][i] = change_address (part[1][i],
21099 GET_MODE (part[1][i]), src_base);
21100 }
21101
21102 /* We need to do the copy in the right order in case an address register
21103 of the source overlaps the destination. */
21104 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21105 {
21106 rtx tmp;
21107
21108 for (i = 0; i < nparts; i++)
21109 {
21110 collisionparts[i]
21111 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21112 if (collisionparts[i])
21113 collisions++;
21114 }
21115
21116 /* Collision in the middle part can be handled by reordering. */
21117 if (collisions == 1 && nparts == 3 && collisionparts [1])
21118 {
21119 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21120 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21121 }
21122 else if (collisions == 1
21123 && nparts == 4
21124 && (collisionparts [1] || collisionparts [2]))
21125 {
21126 if (collisionparts [1])
21127 {
21128 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21129 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21130 }
21131 else
21132 {
21133 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21134 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21135 }
21136 }
21137
21138 /* If there are more collisions, we can't handle it by reordering.
21139 Do an lea to the last part and use only one colliding move. */
21140 else if (collisions > 1)
21141 {
21142 rtx base;
21143
21144 collisions = 1;
21145
21146 base = part[0][nparts - 1];
21147
21148 /* Handle the case when the last part isn't valid for lea.
21149 Happens in 64-bit mode storing the 12-byte XFmode. */
21150 if (GET_MODE (base) != Pmode)
21151 base = gen_rtx_REG (Pmode, REGNO (base));
21152
21153 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21154 part[1][0] = replace_equiv_address (part[1][0], base);
21155 for (i = 1; i < nparts; i++)
21156 {
21157 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21158 part[1][i] = replace_equiv_address (part[1][i], tmp);
21159 }
21160 }
21161 }
21162
21163 if (push)
21164 {
21165 if (!TARGET_64BIT)
21166 {
21167 if (nparts == 3)
21168 {
21169 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21170 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21171 stack_pointer_rtx, GEN_INT (-4)));
21172 emit_move_insn (part[0][2], part[1][2]);
21173 }
21174 else if (nparts == 4)
21175 {
21176 emit_move_insn (part[0][3], part[1][3]);
21177 emit_move_insn (part[0][2], part[1][2]);
21178 }
21179 }
21180 else
21181 {
21182 /* In 64-bit mode we don't have a 32-bit push available. If this is a
21183 register, that is OK - we just use the larger counterpart. We also
21184 retype memory - this comes from an attempt to avoid a REX prefix on
21185 moving the second half of a TFmode value. */
21186 if (GET_MODE (part[1][1]) == SImode)
21187 {
21188 switch (GET_CODE (part[1][1]))
21189 {
21190 case MEM:
21191 part[1][1] = adjust_address (part[1][1], DImode, 0);
21192 break;
21193
21194 case REG:
21195 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21196 break;
21197
21198 default:
21199 gcc_unreachable ();
21200 }
21201
21202 if (GET_MODE (part[1][0]) == SImode)
21203 part[1][0] = part[1][1];
21204 }
21205 }
21206 emit_move_insn (part[0][1], part[1][1]);
21207 emit_move_insn (part[0][0], part[1][0]);
21208 return;
21209 }
21210
21211 /* Choose correct order to not overwrite the source before it is copied. */
21212 if ((REG_P (part[0][0])
21213 && REG_P (part[1][1])
21214 && (REGNO (part[0][0]) == REGNO (part[1][1])
21215 || (nparts == 3
21216 && REGNO (part[0][0]) == REGNO (part[1][2]))
21217 || (nparts == 4
21218 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21219 || (collisions > 0
21220 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21221 {
21222 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21223 {
21224 operands[2 + i] = part[0][j];
21225 operands[6 + i] = part[1][j];
21226 }
21227 }
21228 else
21229 {
21230 for (i = 0; i < nparts; i++)
21231 {
21232 operands[2 + i] = part[0][i];
21233 operands[6 + i] = part[1][i];
21234 }
21235 }
21236
21237 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21238 if (optimize_insn_for_size_p ())
21239 {
21240 for (j = 0; j < nparts - 1; j++)
21241 if (CONST_INT_P (operands[6 + j])
21242 && operands[6 + j] != const0_rtx
21243 && REG_P (operands[2 + j]))
21244 for (i = j; i < nparts - 1; i++)
21245 if (CONST_INT_P (operands[7 + i])
21246 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21247 operands[7 + i] = operands[2 + j];
21248 }
21249
21250 for (i = 0; i < nparts; i++)
21251 emit_move_insn (operands[2 + i], operands[6 + i]);
21252
21253 return;
21254 }
21255
21256 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21257 left shift by a constant, either using a single shift or
21258 a sequence of add instructions. */
21259
21260 static void
21261 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21262 {
21263 rtx (*insn)(rtx, rtx, rtx);
21264
21265 if (count == 1
21266 || (count * ix86_cost->add <= ix86_cost->shift_const
21267 && !optimize_insn_for_size_p ()))
21268 {
21269 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21270 while (count-- > 0)
21271 emit_insn (insn (operand, operand, operand));
21272 }
21273 else
21274 {
21275 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21276 emit_insn (insn (operand, operand, GEN_INT (count)));
21277 }
21278 }
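/* Editorial sketch (illustrative, not GCC code): the repeated-add form used
   above relies on x + x == x << 1, so a shift by COUNT becomes COUNT
   doublings when that is no costlier than the shift itself:

     static unsigned int
     shl_by_adds (unsigned int x, int count)
     {
       while (count-- > 0)
         x += x;                // each add doubles x, i.e. shifts left by 1.
       return x;
     }
*/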
21279
21280 void
21281 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21282 {
21283 rtx (*gen_ashl3)(rtx, rtx, rtx);
21284 rtx (*gen_shld)(rtx, rtx, rtx);
21285 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21286
21287 rtx low[2], high[2];
21288 int count;
21289
21290 if (CONST_INT_P (operands[2]))
21291 {
21292 split_double_mode (mode, operands, 2, low, high);
21293 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21294
21295 if (count >= half_width)
21296 {
21297 emit_move_insn (high[0], low[1]);
21298 emit_move_insn (low[0], const0_rtx);
21299
21300 if (count > half_width)
21301 ix86_expand_ashl_const (high[0], count - half_width, mode);
21302 }
21303 else
21304 {
21305 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21306
21307 if (!rtx_equal_p (operands[0], operands[1]))
21308 emit_move_insn (operands[0], operands[1]);
21309
21310 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21311 ix86_expand_ashl_const (low[0], count, mode);
21312 }
21313 return;
21314 }
21315
21316 split_double_mode (mode, operands, 1, low, high);
21317
21318 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21319
21320 if (operands[1] == const1_rtx)
21321 {
21322 /* Assuming we've chosen QImode-capable registers, 1 << N
21323 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21324 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21325 {
21326 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21327
21328 ix86_expand_clear (low[0]);
21329 ix86_expand_clear (high[0]);
21330 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21331
21332 d = gen_lowpart (QImode, low[0]);
21333 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21334 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21335 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21336
21337 d = gen_lowpart (QImode, high[0]);
21338 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21339 s = gen_rtx_NE (QImode, flags, const0_rtx);
21340 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21341 }
21342
21343 /* Otherwise, we can get the same results by manually performing
21344 a bit extract operation on bit 5/6, and then performing the two
21345 shifts. The two methods of getting 0/1 into low/high are exactly
21346 the same size. Avoiding the shift in the bit extract case helps
21347 pentium4 a bit; no one else seems to care much either way. */
21348 else
21349 {
21350 enum machine_mode half_mode;
21351 rtx (*gen_lshr3)(rtx, rtx, rtx);
21352 rtx (*gen_and3)(rtx, rtx, rtx);
21353 rtx (*gen_xor3)(rtx, rtx, rtx);
21354 HOST_WIDE_INT bits;
21355 rtx x;
21356
21357 if (mode == DImode)
21358 {
21359 half_mode = SImode;
21360 gen_lshr3 = gen_lshrsi3;
21361 gen_and3 = gen_andsi3;
21362 gen_xor3 = gen_xorsi3;
21363 bits = 5;
21364 }
21365 else
21366 {
21367 half_mode = DImode;
21368 gen_lshr3 = gen_lshrdi3;
21369 gen_and3 = gen_anddi3;
21370 gen_xor3 = gen_xordi3;
21371 bits = 6;
21372 }
21373
21374 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21375 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21376 else
21377 x = gen_lowpart (half_mode, operands[2]);
21378 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21379
21380 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21381 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21382 emit_move_insn (low[0], high[0]);
21383 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21384 }
21385
21386 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21387 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21388 return;
21389 }
21390
21391 if (operands[1] == constm1_rtx)
21392 {
21393 /* For -1 << N, we can avoid the shld instruction, because we
21394 know that we're shifting 0...31/63 ones into a -1. */
21395 emit_move_insn (low[0], constm1_rtx);
21396 if (optimize_insn_for_size_p ())
21397 emit_move_insn (high[0], low[0]);
21398 else
21399 emit_move_insn (high[0], constm1_rtx);
21400 }
21401 else
21402 {
21403 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21404
21405 if (!rtx_equal_p (operands[0], operands[1]))
21406 emit_move_insn (operands[0], operands[1]);
21407
21408 split_double_mode (mode, operands, 1, low, high);
21409 emit_insn (gen_shld (high[0], low[0], operands[2]));
21410 }
21411
21412 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21413
21414 if (TARGET_CMOVE && scratch)
21415 {
21416 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21417 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21418
21419 ix86_expand_clear (scratch);
21420 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21421 }
21422 else
21423 {
21424 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21425 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21426
21427 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21428 }
21429 }
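/* Editorial sketch (illustrative, not GCC code): for a 32-bit target the
   constant-count case above computes a 64-bit left shift from 32-bit halves
   as follows (N already masked to 0..63):

     static void
     dshift_left (unsigned int *lo, unsigned int *hi, int n)
     {
       if (n >= 32)
         {
           // Whole-word step: the low word moves up and the low word clears.
           *hi = *lo << (n - 32);
           *lo = 0;
         }
       else if (n > 0)
         {
           // shld: HI keeps its own shifted bits plus the bits that spill
           // out of the top of LO.
           *hi = (*hi << n) | (*lo >> (32 - n));
           *lo <<= n;
         }
     }

   The variable-count path emits the shld and the shift unconditionally and
   then fixes up the n >= 32 case with the x86_shift*_adj patterns.  */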
21430
21431 void
21432 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21433 {
21434 rtx (*gen_ashr3)(rtx, rtx, rtx)
21435 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21436 rtx (*gen_shrd)(rtx, rtx, rtx);
21437 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21438
21439 rtx low[2], high[2];
21440 int count;
21441
21442 if (CONST_INT_P (operands[2]))
21443 {
21444 split_double_mode (mode, operands, 2, low, high);
21445 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21446
21447 if (count == GET_MODE_BITSIZE (mode) - 1)
21448 {
21449 emit_move_insn (high[0], high[1]);
21450 emit_insn (gen_ashr3 (high[0], high[0],
21451 GEN_INT (half_width - 1)));
21452 emit_move_insn (low[0], high[0]);
21453
21454 }
21455 else if (count >= half_width)
21456 {
21457 emit_move_insn (low[0], high[1]);
21458 emit_move_insn (high[0], low[0]);
21459 emit_insn (gen_ashr3 (high[0], high[0],
21460 GEN_INT (half_width - 1)));
21461
21462 if (count > half_width)
21463 emit_insn (gen_ashr3 (low[0], low[0],
21464 GEN_INT (count - half_width)));
21465 }
21466 else
21467 {
21468 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21469
21470 if (!rtx_equal_p (operands[0], operands[1]))
21471 emit_move_insn (operands[0], operands[1]);
21472
21473 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21474 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21475 }
21476 }
21477 else
21478 {
21479 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21480
21481 if (!rtx_equal_p (operands[0], operands[1]))
21482 emit_move_insn (operands[0], operands[1]);
21483
21484 split_double_mode (mode, operands, 1, low, high);
21485
21486 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21487 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21488
21489 if (TARGET_CMOVE && scratch)
21490 {
21491 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21492 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21493
21494 emit_move_insn (scratch, high[0]);
21495 emit_insn (gen_ashr3 (scratch, scratch,
21496 GEN_INT (half_width - 1)));
21497 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21498 scratch));
21499 }
21500 else
21501 {
21502 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21503 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21504
21505 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21506 }
21507 }
21508 }
21509
21510 void
21511 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21512 {
21513 rtx (*gen_lshr3)(rtx, rtx, rtx)
21514 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21515 rtx (*gen_shrd)(rtx, rtx, rtx);
21516 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21517
21518 rtx low[2], high[2];
21519 int count;
21520
21521 if (CONST_INT_P (operands[2]))
21522 {
21523 split_double_mode (mode, operands, 2, low, high);
21524 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21525
21526 if (count >= half_width)
21527 {
21528 emit_move_insn (low[0], high[1]);
21529 ix86_expand_clear (high[0]);
21530
21531 if (count > half_width)
21532 emit_insn (gen_lshr3 (low[0], low[0],
21533 GEN_INT (count - half_width)));
21534 }
21535 else
21536 {
21537 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21538
21539 if (!rtx_equal_p (operands[0], operands[1]))
21540 emit_move_insn (operands[0], operands[1]);
21541
21542 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21543 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21544 }
21545 }
21546 else
21547 {
21548 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21549
21550 if (!rtx_equal_p (operands[0], operands[1]))
21551 emit_move_insn (operands[0], operands[1]);
21552
21553 split_double_mode (mode, operands, 1, low, high);
21554
21555 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21556 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21557
21558 if (TARGET_CMOVE && scratch)
21559 {
21560 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21561 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21562
21563 ix86_expand_clear (scratch);
21564 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21565 scratch));
21566 }
21567 else
21568 {
21569 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21570 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21571
21572 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21573 }
21574 }
21575 }
21576
21577 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21578 static void
21579 predict_jump (int prob)
21580 {
21581 rtx insn = get_last_insn ();
21582 gcc_assert (JUMP_P (insn));
21583 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21584 }
21585
21586 /* Helper function for the string operations below. Test whether VARIABLE
21587 is aligned to VALUE bytes. If so, jump to the label. */
21588 static rtx
21589 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21590 {
21591 rtx label = gen_label_rtx ();
21592 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21593 if (GET_MODE (variable) == DImode)
21594 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21595 else
21596 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21597 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21598 1, label);
21599 if (epilogue)
21600 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21601 else
21602 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21603 return label;
21604 }
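/* Editorial sketch (illustrative, not GCC code): the test emitted above is
   simply "jump to LABEL when (VARIABLE & VALUE) == 0"; e.g. with VALUE == 3
   the branch is taken when VARIABLE is a multiple of 4.  As plain C:

     static int
     aligned_p (unsigned long variable, unsigned long value)
     {
       // Nonzero when none of the bits selected by VALUE are set.
       return (variable & value) == 0;
     }
*/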
21605
21606 /* Decrease COUNTREG by VALUE. */
21607 static void
21608 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21609 {
21610 rtx (*gen_add)(rtx, rtx, rtx)
21611 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21612
21613 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21614 }
21615
21616 /* Zero-extend EXP (possibly SImode) into a Pmode register. */
21617 rtx
21618 ix86_zero_extend_to_Pmode (rtx exp)
21619 {
21620 if (GET_MODE (exp) != Pmode)
21621 exp = convert_to_mode (Pmode, exp, 1);
21622 return force_reg (Pmode, exp);
21623 }
21624
21625 /* Divide COUNTREG by SCALE. */
21626 static rtx
21627 scale_counter (rtx countreg, int scale)
21628 {
21629 rtx sc;
21630
21631 if (scale == 1)
21632 return countreg;
21633 if (CONST_INT_P (countreg))
21634 return GEN_INT (INTVAL (countreg) / scale);
21635 gcc_assert (REG_P (countreg));
21636
21637 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21638 GEN_INT (exact_log2 (scale)),
21639 NULL, 1, OPTAB_DIRECT);
21640 return sc;
21641 }
21642
21643 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21644 DImode for constant loop counts. */
21645
21646 static enum machine_mode
21647 counter_mode (rtx count_exp)
21648 {
21649 if (GET_MODE (count_exp) != VOIDmode)
21650 return GET_MODE (count_exp);
21651 if (!CONST_INT_P (count_exp))
21652 return Pmode;
21653 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21654 return DImode;
21655 return SImode;
21656 }
21657
21658 /* When SRCPTR is non-NULL, output a simple loop to move memory
21659 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21660 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21661 an equivalent loop that sets memory to VALUE (expected to be in MODE).
21662 
21663 The size is rounded down to a whole number of chunks moved at once.
21664 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21665
21666
21667 static void
21668 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21669 rtx destptr, rtx srcptr, rtx value,
21670 rtx count, enum machine_mode mode, int unroll,
21671 int expected_size)
21672 {
21673 rtx out_label, top_label, iter, tmp;
21674 enum machine_mode iter_mode = counter_mode (count);
21675 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21676 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21677 rtx size;
21678 rtx x_addr;
21679 rtx y_addr;
21680 int i;
21681
21682 top_label = gen_label_rtx ();
21683 out_label = gen_label_rtx ();
21684 iter = gen_reg_rtx (iter_mode);
21685
21686 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21687 NULL, 1, OPTAB_DIRECT);
21688 /* Those two should combine. */
21689 if (piece_size == const1_rtx)
21690 {
21691 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21692 true, out_label);
21693 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21694 }
21695 emit_move_insn (iter, const0_rtx);
21696
21697 emit_label (top_label);
21698
21699 tmp = convert_modes (Pmode, iter_mode, iter, true);
21700 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21701 destmem = change_address (destmem, mode, x_addr);
21702
21703 if (srcmem)
21704 {
21705 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21706 srcmem = change_address (srcmem, mode, y_addr);
21707
21708 /* When unrolling for chips that reorder memory reads and writes,
21709 we can save registers by using a single temporary.
21710 Also, using 4 temporaries is overkill in 32-bit mode. */
21711 if (!TARGET_64BIT && 0)
21712 {
21713 for (i = 0; i < unroll; i++)
21714 {
21715 if (i)
21716 {
21717 destmem =
21718 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21719 srcmem =
21720 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21721 }
21722 emit_move_insn (destmem, srcmem);
21723 }
21724 }
21725 else
21726 {
21727 rtx tmpreg[4];
21728 gcc_assert (unroll <= 4);
21729 for (i = 0; i < unroll; i++)
21730 {
21731 tmpreg[i] = gen_reg_rtx (mode);
21732 if (i)
21733 {
21734 srcmem =
21735 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21736 }
21737 emit_move_insn (tmpreg[i], srcmem);
21738 }
21739 for (i = 0; i < unroll; i++)
21740 {
21741 if (i)
21742 {
21743 destmem =
21744 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21745 }
21746 emit_move_insn (destmem, tmpreg[i]);
21747 }
21748 }
21749 }
21750 else
21751 for (i = 0; i < unroll; i++)
21752 {
21753 if (i)
21754 destmem =
21755 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21756 emit_move_insn (destmem, value);
21757 }
21758
21759 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21760 true, OPTAB_LIB_WIDEN);
21761 if (tmp != iter)
21762 emit_move_insn (iter, tmp);
21763
21764 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21765 true, top_label);
21766 if (expected_size != -1)
21767 {
21768 expected_size /= GET_MODE_SIZE (mode) * unroll;
21769 if (expected_size == 0)
21770 predict_jump (0);
21771 else if (expected_size > REG_BR_PROB_BASE)
21772 predict_jump (REG_BR_PROB_BASE - 1);
21773 else
21774 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21775 }
21776 else
21777 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21778 iter = ix86_zero_extend_to_Pmode (iter);
21779 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21780 true, OPTAB_LIB_WIDEN);
21781 if (tmp != destptr)
21782 emit_move_insn (destptr, tmp);
21783 if (srcptr)
21784 {
21785 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21786 true, OPTAB_LIB_WIDEN);
21787 if (tmp != srcptr)
21788 emit_move_insn (srcptr, tmp);
21789 }
21790 emit_label (out_label);
21791 }
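/* Editorial sketch (illustrative, not GCC code): the loop emitted above has
   roughly this shape for a copy of COUNT bytes in chunks of CHUNK bytes
   (CHUNK == GET_MODE_SIZE (mode) * unroll, assumed to be a power of two):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_loop (char *dest, const char *src, size_t count, size_t chunk)
     {
       // Round down to a whole number of chunks; the remainder is handled
       // by a separate epilogue elsewhere.
       size_t size = count & ~(chunk - 1);
       size_t iter = 0;
       while (iter < size)
         {
           memcpy (dest + iter, src + iter, chunk);   // the unrolled moves.
           iter += chunk;
         }
       // The real code then advances destptr/srcptr by ITER, as done above.
     }
*/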
21792
21793 /* Output a "rep; mov" instruction.
21794 Arguments have the same meaning as for the previous function. */
21795 static void
21796 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21797 rtx destptr, rtx srcptr,
21798 rtx count,
21799 enum machine_mode mode)
21800 {
21801 rtx destexp;
21802 rtx srcexp;
21803 rtx countreg;
21804 HOST_WIDE_INT rounded_count;
21805
21806 /* If the size is known, it is shorter to use rep movs. */
21807 if (mode == QImode && CONST_INT_P (count)
21808 && !(INTVAL (count) & 3))
21809 mode = SImode;
21810
21811 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21812 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21813 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21814 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21815 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21816 if (mode != QImode)
21817 {
21818 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21819 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21820 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21821 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21822 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21823 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21824 }
21825 else
21826 {
21827 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21828 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21829 }
21830 if (CONST_INT_P (count))
21831 {
21832 rounded_count = (INTVAL (count)
21833 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21834 destmem = shallow_copy_rtx (destmem);
21835 srcmem = shallow_copy_rtx (srcmem);
21836 set_mem_size (destmem, rounded_count);
21837 set_mem_size (srcmem, rounded_count);
21838 }
21839 else
21840 {
21841 if (MEM_SIZE_KNOWN_P (destmem))
21842 clear_mem_size (destmem);
21843 if (MEM_SIZE_KNOWN_P (srcmem))
21844 clear_mem_size (srcmem);
21845 }
21846 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21847 destexp, srcexp));
21848 }
21849
21850 /* Output "rep; stos" instruction.
21851 Arguments have the same meaning as for the previous function. */
21852 static void
21853 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21854 rtx count, enum machine_mode mode,
21855 rtx orig_value)
21856 {
21857 rtx destexp;
21858 rtx countreg;
21859 HOST_WIDE_INT rounded_count;
21860
21861 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21862 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21863 value = force_reg (mode, gen_lowpart (mode, value));
21864 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21865 if (mode != QImode)
21866 {
21867 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21868 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21869 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21870 }
21871 else
21872 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21873 if (orig_value == const0_rtx && CONST_INT_P (count))
21874 {
21875 rounded_count = (INTVAL (count)
21876 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21877 destmem = shallow_copy_rtx (destmem);
21878 set_mem_size (destmem, rounded_count);
21879 }
21880 else if (MEM_SIZE_KNOWN_P (destmem))
21881 clear_mem_size (destmem);
21882 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21883 }
21884
21885 static void
21886 emit_strmov (rtx destmem, rtx srcmem,
21887 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21888 {
21889 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21890 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21891 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21892 }
21893
21894 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21895 static void
21896 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21897 rtx destptr, rtx srcptr, rtx count, int max_size)
21898 {
21899 rtx src, dest;
21900 if (CONST_INT_P (count))
21901 {
21902 HOST_WIDE_INT countval = INTVAL (count);
21903 int offset = 0;
21904
21905 if ((countval & 0x10) && max_size > 16)
21906 {
21907 if (TARGET_64BIT)
21908 {
21909 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21910 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21911 }
21912 else
21913 gcc_unreachable ();
21914 offset += 16;
21915 }
21916 if ((countval & 0x08) && max_size > 8)
21917 {
21918 if (TARGET_64BIT)
21919 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21920 else
21921 {
21922 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21923 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21924 }
21925 offset += 8;
21926 }
21927 if ((countval & 0x04) && max_size > 4)
21928 {
21929 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21930 offset += 4;
21931 }
21932 if ((countval & 0x02) && max_size > 2)
21933 {
21934 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21935 offset += 2;
21936 }
21937 if ((countval & 0x01) && max_size > 1)
21938 {
21939 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21940 offset += 1;
21941 }
21942 return;
21943 }
21944 if (max_size > 8)
21945 {
21946 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21947 count, 1, OPTAB_DIRECT);
21948 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21949 count, QImode, 1, 4);
21950 return;
21951 }
21952
21953 /* When single stringop instructions are available, we can cheaply advance
21954 the dest and src pointers as a side effect.  Otherwise we save code size
21955 by maintaining an offset register (zero is readily available from the
21956 preceding rep operation) and using x86 addressing modes. */
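  /* A rough sketch of the two shapes (for the 4-byte step; the 2- and 1-byte
     steps are analogous): with TARGET_SINGLE_STRINGOP each step is a
     conditional "movs" whose pointer updates are implicit,

	test $4, count ; jz 1f ; movsl ; 1:

     while without it each step is an ordinary load/store pair addressed
     through SRCPTR/DESTPTR plus the OFFSET register, which is then advanced
     by the size of the chunk just copied.  */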
21957 if (TARGET_SINGLE_STRINGOP)
21958 {
21959 if (max_size > 4)
21960 {
21961 rtx label = ix86_expand_aligntest (count, 4, true);
21962 src = change_address (srcmem, SImode, srcptr);
21963 dest = change_address (destmem, SImode, destptr);
21964 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21965 emit_label (label);
21966 LABEL_NUSES (label) = 1;
21967 }
21968 if (max_size > 2)
21969 {
21970 rtx label = ix86_expand_aligntest (count, 2, true);
21971 src = change_address (srcmem, HImode, srcptr);
21972 dest = change_address (destmem, HImode, destptr);
21973 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21974 emit_label (label);
21975 LABEL_NUSES (label) = 1;
21976 }
21977 if (max_size > 1)
21978 {
21979 rtx label = ix86_expand_aligntest (count, 1, true);
21980 src = change_address (srcmem, QImode, srcptr);
21981 dest = change_address (destmem, QImode, destptr);
21982 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21983 emit_label (label);
21984 LABEL_NUSES (label) = 1;
21985 }
21986 }
21987 else
21988 {
21989 rtx offset = force_reg (Pmode, const0_rtx);
21990 rtx tmp;
21991
21992 if (max_size > 4)
21993 {
21994 rtx label = ix86_expand_aligntest (count, 4, true);
21995 src = change_address (srcmem, SImode, srcptr);
21996 dest = change_address (destmem, SImode, destptr);
21997 emit_move_insn (dest, src);
21998 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21999 true, OPTAB_LIB_WIDEN);
22000 if (tmp != offset)
22001 emit_move_insn (offset, tmp);
22002 emit_label (label);
22003 LABEL_NUSES (label) = 1;
22004 }
22005 if (max_size > 2)
22006 {
22007 rtx label = ix86_expand_aligntest (count, 2, true);
22008 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22009 src = change_address (srcmem, HImode, tmp);
22010 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22011 dest = change_address (destmem, HImode, tmp);
22012 emit_move_insn (dest, src);
22013 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22014 true, OPTAB_LIB_WIDEN);
22015 if (tmp != offset)
22016 emit_move_insn (offset, tmp);
22017 emit_label (label);
22018 LABEL_NUSES (label) = 1;
22019 }
22020 if (max_size > 1)
22021 {
22022 rtx label = ix86_expand_aligntest (count, 1, true);
22023 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22024 src = change_address (srcmem, QImode, tmp);
22025 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22026 dest = change_address (destmem, QImode, tmp);
22027 emit_move_insn (dest, src);
22028 emit_label (label);
22029 LABEL_NUSES (label) = 1;
22030 }
22031 }
22032 }
22033
22034 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22035 static void
22036 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22037 rtx count, int max_size)
22038 {
22039 count =
22040 expand_simple_binop (counter_mode (count), AND, count,
22041 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22042 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22043 gen_lowpart (QImode, value), count, QImode,
22044 1, max_size / 2);
22045 }
22046
22047 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22048 static void
22049 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22050 {
22051 rtx dest;
22052
22053 if (CONST_INT_P (count))
22054 {
22055 HOST_WIDE_INT countval = INTVAL (count);
22056 int offset = 0;
22057
22058 if ((countval & 0x10) && max_size > 16)
22059 {
22060 if (TARGET_64BIT)
22061 {
22062 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22063 emit_insn (gen_strset (destptr, dest, value));
22064 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22065 emit_insn (gen_strset (destptr, dest, value));
22066 }
22067 else
22068 gcc_unreachable ();
22069 offset += 16;
22070 }
22071 if ((countval & 0x08) && max_size > 8)
22072 {
22073 if (TARGET_64BIT)
22074 {
22075 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22076 emit_insn (gen_strset (destptr, dest, value));
22077 }
22078 else
22079 {
22080 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22081 emit_insn (gen_strset (destptr, dest, value));
22082 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22083 emit_insn (gen_strset (destptr, dest, value));
22084 }
22085 offset += 8;
22086 }
22087 if ((countval & 0x04) && max_size > 4)
22088 {
22089 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22090 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22091 offset += 4;
22092 }
22093 if ((countval & 0x02) && max_size > 2)
22094 {
22095 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22096 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22097 offset += 2;
22098 }
22099 if ((countval & 0x01) && max_size > 1)
22100 {
22101 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22102 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22103 offset += 1;
22104 }
22105 return;
22106 }
22107 if (max_size > 32)
22108 {
22109 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22110 return;
22111 }
22112 if (max_size > 16)
22113 {
22114 rtx label = ix86_expand_aligntest (count, 16, true);
22115 if (TARGET_64BIT)
22116 {
22117 dest = change_address (destmem, DImode, destptr);
22118 emit_insn (gen_strset (destptr, dest, value));
22119 emit_insn (gen_strset (destptr, dest, value));
22120 }
22121 else
22122 {
22123 dest = change_address (destmem, SImode, destptr);
22124 emit_insn (gen_strset (destptr, dest, value));
22125 emit_insn (gen_strset (destptr, dest, value));
22126 emit_insn (gen_strset (destptr, dest, value));
22127 emit_insn (gen_strset (destptr, dest, value));
22128 }
22129 emit_label (label);
22130 LABEL_NUSES (label) = 1;
22131 }
22132 if (max_size > 8)
22133 {
22134 rtx label = ix86_expand_aligntest (count, 8, true);
22135 if (TARGET_64BIT)
22136 {
22137 dest = change_address (destmem, DImode, destptr);
22138 emit_insn (gen_strset (destptr, dest, value));
22139 }
22140 else
22141 {
22142 dest = change_address (destmem, SImode, destptr);
22143 emit_insn (gen_strset (destptr, dest, value));
22144 emit_insn (gen_strset (destptr, dest, value));
22145 }
22146 emit_label (label);
22147 LABEL_NUSES (label) = 1;
22148 }
22149 if (max_size > 4)
22150 {
22151 rtx label = ix86_expand_aligntest (count, 4, true);
22152 dest = change_address (destmem, SImode, destptr);
22153 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22154 emit_label (label);
22155 LABEL_NUSES (label) = 1;
22156 }
22157 if (max_size > 2)
22158 {
22159 rtx label = ix86_expand_aligntest (count, 2, true);
22160 dest = change_address (destmem, HImode, destptr);
22161 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22162 emit_label (label);
22163 LABEL_NUSES (label) = 1;
22164 }
22165 if (max_size > 1)
22166 {
22167 rtx label = ix86_expand_aligntest (count, 1, true);
22168 dest = change_address (destmem, QImode, destptr);
22169 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22170 emit_label (label);
22171 LABEL_NUSES (label) = 1;
22172 }
22173 }
22174
22175 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned by
22176 ALIGN, to DESIRED_ALIGNMENT. */
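/* For example, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits up to
   three conditional copies: one byte if DESTPTR is odd, two bytes if bit 1
   of DESTPTR is set, and four bytes if bit 2 is set, so at most
   1 + 2 + 4 = 7 bytes are copied before the destination becomes 8-byte
   aligned; COUNT is decremented by the same amount along each taken path.  */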
22177 static void
22178 expand_movmem_prologue (rtx destmem, rtx srcmem,
22179 rtx destptr, rtx srcptr, rtx count,
22180 int align, int desired_alignment)
22181 {
22182 if (align <= 1 && desired_alignment > 1)
22183 {
22184 rtx label = ix86_expand_aligntest (destptr, 1, false);
22185 srcmem = change_address (srcmem, QImode, srcptr);
22186 destmem = change_address (destmem, QImode, destptr);
22187 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22188 ix86_adjust_counter (count, 1);
22189 emit_label (label);
22190 LABEL_NUSES (label) = 1;
22191 }
22192 if (align <= 2 && desired_alignment > 2)
22193 {
22194 rtx label = ix86_expand_aligntest (destptr, 2, false);
22195 srcmem = change_address (srcmem, HImode, srcptr);
22196 destmem = change_address (destmem, HImode, destptr);
22197 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22198 ix86_adjust_counter (count, 2);
22199 emit_label (label);
22200 LABEL_NUSES (label) = 1;
22201 }
22202 if (align <= 4 && desired_alignment > 4)
22203 {
22204 rtx label = ix86_expand_aligntest (destptr, 4, false);
22205 srcmem = change_address (srcmem, SImode, srcptr);
22206 destmem = change_address (destmem, SImode, destptr);
22207 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22208 ix86_adjust_counter (count, 4);
22209 emit_label (label);
22210 LABEL_NUSES (label) = 1;
22211 }
22212 gcc_assert (desired_alignment <= 8);
22213 }
22214
22215 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
22216 ALIGN_BYTES is how many bytes need to be copied. */
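/* For example, if ALIGN_BYTES == 5, the low bits select an unconditional
   1-byte copy followed by a 4-byte copy, after which DST and SRC are
   re-derived as BLKmode references 5 bytes past the original ones, with the
   stronger alignment (and reduced MEM_SIZE) recorded on them.  */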
22217 static rtx
22218 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22219 int desired_align, int align_bytes)
22220 {
22221 rtx src = *srcp;
22222 rtx orig_dst = dst;
22223 rtx orig_src = src;
22224 int off = 0;
22225 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22226 if (src_align_bytes >= 0)
22227 src_align_bytes = desired_align - src_align_bytes;
22228 if (align_bytes & 1)
22229 {
22230 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22231 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22232 off = 1;
22233 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22234 }
22235 if (align_bytes & 2)
22236 {
22237 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22238 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22239 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22240 set_mem_align (dst, 2 * BITS_PER_UNIT);
22241 if (src_align_bytes >= 0
22242 && (src_align_bytes & 1) == (align_bytes & 1)
22243 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22244 set_mem_align (src, 2 * BITS_PER_UNIT);
22245 off = 2;
22246 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22247 }
22248 if (align_bytes & 4)
22249 {
22250 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22251 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22252 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22253 set_mem_align (dst, 4 * BITS_PER_UNIT);
22254 if (src_align_bytes >= 0)
22255 {
22256 unsigned int src_align = 0;
22257 if ((src_align_bytes & 3) == (align_bytes & 3))
22258 src_align = 4;
22259 else if ((src_align_bytes & 1) == (align_bytes & 1))
22260 src_align = 2;
22261 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22262 set_mem_align (src, src_align * BITS_PER_UNIT);
22263 }
22264 off = 4;
22265 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22266 }
22267 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22268 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22269 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22270 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22271 if (src_align_bytes >= 0)
22272 {
22273 unsigned int src_align = 0;
22274 if ((src_align_bytes & 7) == (align_bytes & 7))
22275 src_align = 8;
22276 else if ((src_align_bytes & 3) == (align_bytes & 3))
22277 src_align = 4;
22278 else if ((src_align_bytes & 1) == (align_bytes & 1))
22279 src_align = 2;
22280 if (src_align > (unsigned int) desired_align)
22281 src_align = desired_align;
22282 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22283 set_mem_align (src, src_align * BITS_PER_UNIT);
22284 }
22285 if (MEM_SIZE_KNOWN_P (orig_dst))
22286 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22287 if (MEM_SIZE_KNOWN_P (orig_src))
22288 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22289 *srcp = src;
22290 return dst;
22291 }
22292
22293 /* Set enough bytes of DEST, known to be aligned by ALIGN, to align it to
22294 DESIRED_ALIGNMENT. */
22295 static void
22296 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22297 int align, int desired_alignment)
22298 {
22299 if (align <= 1 && desired_alignment > 1)
22300 {
22301 rtx label = ix86_expand_aligntest (destptr, 1, false);
22302 destmem = change_address (destmem, QImode, destptr);
22303 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22304 ix86_adjust_counter (count, 1);
22305 emit_label (label);
22306 LABEL_NUSES (label) = 1;
22307 }
22308 if (align <= 2 && desired_alignment > 2)
22309 {
22310 rtx label = ix86_expand_aligntest (destptr, 2, false);
22311 destmem = change_address (destmem, HImode, destptr);
22312 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22313 ix86_adjust_counter (count, 2);
22314 emit_label (label);
22315 LABEL_NUSES (label) = 1;
22316 }
22317 if (align <= 4 && desired_alignment > 4)
22318 {
22319 rtx label = ix86_expand_aligntest (destptr, 4, false);
22320 destmem = change_address (destmem, SImode, destptr);
22321 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22322 ix86_adjust_counter (count, 4);
22323 emit_label (label);
22324 LABEL_NUSES (label) = 1;
22325 }
22326 gcc_assert (desired_alignment <= 8);
22327 }
22328
22329 /* Set enough bytes of DST to align it to DESIRED_ALIGN.
22330 ALIGN_BYTES is how many bytes need to be stored. */
22331 static rtx
22332 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22333 int desired_align, int align_bytes)
22334 {
22335 int off = 0;
22336 rtx orig_dst = dst;
22337 if (align_bytes & 1)
22338 {
22339 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22340 off = 1;
22341 emit_insn (gen_strset (destreg, dst,
22342 gen_lowpart (QImode, value)));
22343 }
22344 if (align_bytes & 2)
22345 {
22346 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22347 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22348 set_mem_align (dst, 2 * BITS_PER_UNIT);
22349 off = 2;
22350 emit_insn (gen_strset (destreg, dst,
22351 gen_lowpart (HImode, value)));
22352 }
22353 if (align_bytes & 4)
22354 {
22355 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22356 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22357 set_mem_align (dst, 4 * BITS_PER_UNIT);
22358 off = 4;
22359 emit_insn (gen_strset (destreg, dst,
22360 gen_lowpart (SImode, value)));
22361 }
22362 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22363 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22364 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22365 if (MEM_SIZE_KNOWN_P (orig_dst))
22366 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22367 return dst;
22368 }
22369
22370 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22371 static enum stringop_alg
22372 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22373 int *dynamic_check)
22374 {
22375 const struct stringop_algs * algs;
22376 bool optimize_for_speed;
22377 /* Algorithms using the rep prefix want at least edi and ecx;
22378 additionally, memset wants eax and memcpy wants esi. Don't
22379 consider such algorithms if the user has appropriated those
22380 registers for their own purposes. */
22381 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22382 || (memset
22383 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22384
22385 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22386 || (alg != rep_prefix_1_byte \
22387 && alg != rep_prefix_4_byte \
22388 && alg != rep_prefix_8_byte))
22389 const struct processor_costs *cost;
22390
22391 /* Even if the string operation call is cold, we still might spend a lot
22392 of time processing large blocks. */
22393 if (optimize_function_for_size_p (cfun)
22394 || (optimize_insn_for_size_p ()
22395 && expected_size != -1 && expected_size < 256))
22396 optimize_for_speed = false;
22397 else
22398 optimize_for_speed = true;
22399
22400 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22401
22402 *dynamic_check = -1;
22403 if (memset)
22404 algs = &cost->memset[TARGET_64BIT != 0];
22405 else
22406 algs = &cost->memcpy[TARGET_64BIT != 0];
22407 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22408 return ix86_stringop_alg;
22409 /* rep; movq or rep; movl is the smallest variant. */
22410 else if (!optimize_for_speed)
22411 {
22412 if (!count || (count & 3))
22413 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22414 else
22415 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22416 }
22417 /* Very tiny blocks are best handled via the loop; REP is expensive to
22418 set up. */
22419 else if (expected_size != -1 && expected_size < 4)
22420 return loop_1_byte;
22421 else if (expected_size != -1)
22422 {
22423 unsigned int i;
22424 enum stringop_alg alg = libcall;
22425 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22426 {
22427 /* We get here if the algorithms that were not libcall-based
22428 were rep-prefix based and we are unable to use rep prefixes
22429 based on global register usage. Break out of the loop and
22430 use the heuristic below. */
22431 if (algs->size[i].max == 0)
22432 break;
22433 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22434 {
22435 enum stringop_alg candidate = algs->size[i].alg;
22436
22437 if (candidate != libcall && ALG_USABLE_P (candidate))
22438 alg = candidate;
22439 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22440 last non-libcall inline algorithm. */
22441 if (TARGET_INLINE_ALL_STRINGOPS)
22442 {
22443 /* When the current size is best to be copied by a libcall,
22444 but we are still forced to inline, run the heuristic below
22445 that will pick code for medium sized blocks. */
22446 if (alg != libcall)
22447 return alg;
22448 break;
22449 }
22450 else if (ALG_USABLE_P (candidate))
22451 return candidate;
22452 }
22453 }
22454 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22455 }
22456 /* When asked to inline the call anyway, try to pick a meaningful choice.
22457 We look for the maximal size of block that is faster to copy by hand and
22458 take blocks of at most that size, guessing that the average size will
22459 be roughly half of the block.
22460
22461 If this turns out to be bad, we might simply specify the preferred
22462 choice in ix86_costs. */
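  /* For instance, if the largest table entry mapping to a non-libcall
     algorithm has max == 4096, the recursive call below picks the best
     algorithm for an expected size of 2048, and with
     -minline-stringops-dynamically a runtime check against 4096 is emitted
     so that larger blocks still go through the library call.  */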
22463 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22464 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22465 {
22466 int max = -1;
22467 enum stringop_alg alg;
22468 int i;
22469 bool any_alg_usable_p = true;
22470
22471 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22472 {
22473 enum stringop_alg candidate = algs->size[i].alg;
22474 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22475
22476 if (candidate != libcall && candidate
22477 && ALG_USABLE_P (candidate))
22478 max = algs->size[i].max;
22479 }
22480 /* If there aren't any usable algorithms, then recursing on
22481 smaller sizes isn't going to find anything. Just return the
22482 simple byte-at-a-time copy loop. */
22483 if (!any_alg_usable_p)
22484 {
22485 /* Pick something reasonable. */
22486 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22487 *dynamic_check = 128;
22488 return loop_1_byte;
22489 }
22490 if (max == -1)
22491 max = 4096;
22492 alg = decide_alg (count, max / 2, memset, dynamic_check);
22493 gcc_assert (*dynamic_check == -1);
22494 gcc_assert (alg != libcall);
22495 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22496 *dynamic_check = max;
22497 return alg;
22498 }
22499 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22500 #undef ALG_USABLE_P
22501 }
22502
22503 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22504 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22505 static int
22506 decide_alignment (int align,
22507 enum stringop_alg alg,
22508 int expected_size)
22509 {
22510 int desired_align = 0;
22511 switch (alg)
22512 {
22513 case no_stringop:
22514 gcc_unreachable ();
22515 case loop:
22516 case unrolled_loop:
22517 desired_align = GET_MODE_SIZE (Pmode);
22518 break;
22519 case rep_prefix_8_byte:
22520 desired_align = 8;
22521 break;
22522 case rep_prefix_4_byte:
22523 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22524 copying a whole cache line at once. */
22525 if (TARGET_PENTIUMPRO)
22526 desired_align = 8;
22527 else
22528 desired_align = 4;
22529 break;
22530 case rep_prefix_1_byte:
22531 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22532 copying a whole cache line at once. */
22533 if (TARGET_PENTIUMPRO)
22534 desired_align = 8;
22535 else
22536 desired_align = 1;
22537 break;
22538 case loop_1_byte:
22539 desired_align = 1;
22540 break;
22541 case libcall:
22542 return 0;
22543 }
22544
22545 if (optimize_size)
22546 desired_align = 1;
22547 if (desired_align < align)
22548 desired_align = align;
22549 if (expected_size != -1 && expected_size < 4)
22550 desired_align = align;
22551 return desired_align;
22552 }
22553
22554 /* Return the smallest power of 2 greater than VAL. */
22555 static int
22556 smallest_pow2_greater_than (int val)
22557 {
22558 int ret = 1;
22559 while (ret <= val)
22560 ret <<= 1;
22561 return ret;
22562 }
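/* Note that the result is strictly greater than VAL even when VAL is itself
   a power of 2, e.g. smallest_pow2_greater_than (4) == 8 and
   smallest_pow2_greater_than (7) == 8.  The callers below use this to round
   EPILOGUE_SIZE_NEEDED (SIZE_NEEDED - 1 or DESIRED_ALIGN - ALIGN) back up
   to a power of 2.  */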
22563
22564 /* Expand string move (memcpy) operation. Use i386 string operations
22565 when profitable. expand_setmem contains similar code. The code
22566 depends upon architecture, block size and alignment, but always has
22567 the same overall structure:
22568
22569 1) Prologue guard: Conditional that jumps up to epilogues for small
22570 blocks that can be handled by epilogue alone. This is faster
22571 but also needed for correctness, since the prologue assumes the block
22572 is larger than the desired alignment.
22573
22574 Optional dynamic check for size and libcall for large
22575 blocks is emitted here too, with -minline-stringops-dynamically.
22576
22577 2) Prologue: copy the first few bytes in order to get the destination
22578 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22579 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22580 copied. We emit either a jump tree over power-of-two sized
22581 chunks, or a byte loop.
22582
22583 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22584 with specified algorithm.
22585
22586 4) Epilogue: code copying tail of the block that is too small to be
22587 handled by main body (or up to size guarded by prologue guard). */
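/* A rough sketch of the emitted control flow (details vary with the chosen
   algorithm and with whether COUNT is a compile-time constant):

	cmp	count, epilogue_size_needed
	jb	epilogue				; step 1
	<copy a few bytes until DST is aligned>		; step 2
     main_loop:
	<copy SIZE_NEEDED bytes and loop>		; step 3
     epilogue:
	<copy count & (EPILOGUE_SIZE_NEEDED - 1) bytes>	; step 4  */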
22588
22589 bool
22590 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22591 rtx expected_align_exp, rtx expected_size_exp)
22592 {
22593 rtx destreg;
22594 rtx srcreg;
22595 rtx label = NULL;
22596 rtx tmp;
22597 rtx jump_around_label = NULL;
22598 HOST_WIDE_INT align = 1;
22599 unsigned HOST_WIDE_INT count = 0;
22600 HOST_WIDE_INT expected_size = -1;
22601 int size_needed = 0, epilogue_size_needed;
22602 int desired_align = 0, align_bytes = 0;
22603 enum stringop_alg alg;
22604 int dynamic_check;
22605 bool need_zero_guard = false;
22606
22607 if (CONST_INT_P (align_exp))
22608 align = INTVAL (align_exp);
22609 /* i386 can do misaligned access at reasonably increased cost. */
22610 if (CONST_INT_P (expected_align_exp)
22611 && INTVAL (expected_align_exp) > align)
22612 align = INTVAL (expected_align_exp);
22613 /* ALIGN is the minimum of destination and source alignment, but we care here
22614 just about destination alignment. */
22615 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22616 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22617
22618 if (CONST_INT_P (count_exp))
22619 count = expected_size = INTVAL (count_exp);
22620 if (CONST_INT_P (expected_size_exp) && count == 0)
22621 expected_size = INTVAL (expected_size_exp);
22622
22623 /* Make sure we don't need to care about overflow later on. */
22624 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22625 return false;
22626
22627 /* Step 0: Decide on preferred algorithm, desired alignment and
22628 size of chunks to be copied by main loop. */
22629
22630 alg = decide_alg (count, expected_size, false, &dynamic_check);
22631 desired_align = decide_alignment (align, alg, expected_size);
22632
22633 if (!TARGET_ALIGN_STRINGOPS)
22634 align = desired_align;
22635
22636 if (alg == libcall)
22637 return false;
22638 gcc_assert (alg != no_stringop);
22639 if (!count)
22640 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22641 destreg = copy_addr_to_reg (XEXP (dst, 0));
22642 srcreg = copy_addr_to_reg (XEXP (src, 0));
22643 switch (alg)
22644 {
22645 case libcall:
22646 case no_stringop:
22647 gcc_unreachable ();
22648 case loop:
22649 need_zero_guard = true;
22650 size_needed = GET_MODE_SIZE (word_mode);
22651 break;
22652 case unrolled_loop:
22653 need_zero_guard = true;
22654 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22655 break;
22656 case rep_prefix_8_byte:
22657 size_needed = 8;
22658 break;
22659 case rep_prefix_4_byte:
22660 size_needed = 4;
22661 break;
22662 case rep_prefix_1_byte:
22663 size_needed = 1;
22664 break;
22665 case loop_1_byte:
22666 need_zero_guard = true;
22667 size_needed = 1;
22668 break;
22669 }
22670
22671 epilogue_size_needed = size_needed;
22672
22673 /* Step 1: Prologue guard. */
22674
22675 /* Alignment code needs count to be in register. */
22676 if (CONST_INT_P (count_exp) && desired_align > align)
22677 {
22678 if (INTVAL (count_exp) > desired_align
22679 && INTVAL (count_exp) > size_needed)
22680 {
22681 align_bytes
22682 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22683 if (align_bytes <= 0)
22684 align_bytes = 0;
22685 else
22686 align_bytes = desired_align - align_bytes;
22687 }
22688 if (align_bytes == 0)
22689 count_exp = force_reg (counter_mode (count_exp), count_exp);
22690 }
22691 gcc_assert (desired_align >= 1 && align >= 1);
22692
22693 /* Ensure that alignment prologue won't copy past end of block. */
22694 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22695 {
22696 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22697 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22698 Make sure it is a power of 2. */
22699 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22700
22701 if (count)
22702 {
22703 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22704 {
22705 /* If the main algorithm works on QImode, no epilogue is needed.
22706 For small sizes just don't align anything. */
22707 if (size_needed == 1)
22708 desired_align = align;
22709 else
22710 goto epilogue;
22711 }
22712 }
22713 else
22714 {
22715 label = gen_label_rtx ();
22716 emit_cmp_and_jump_insns (count_exp,
22717 GEN_INT (epilogue_size_needed),
22718 LTU, 0, counter_mode (count_exp), 1, label);
22719 if (expected_size == -1 || expected_size < epilogue_size_needed)
22720 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22721 else
22722 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22723 }
22724 }
22725
22726 /* Emit code to decide at runtime whether a library call or inline code
22727 should be used. */
22728 if (dynamic_check != -1)
22729 {
22730 if (CONST_INT_P (count_exp))
22731 {
22732 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22733 {
22734 emit_block_move_via_libcall (dst, src, count_exp, false);
22735 count_exp = const0_rtx;
22736 goto epilogue;
22737 }
22738 }
22739 else
22740 {
22741 rtx hot_label = gen_label_rtx ();
22742 jump_around_label = gen_label_rtx ();
22743 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22744 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22745 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22746 emit_block_move_via_libcall (dst, src, count_exp, false);
22747 emit_jump (jump_around_label);
22748 emit_label (hot_label);
22749 }
22750 }
22751
22752 /* Step 2: Alignment prologue. */
22753
22754 if (desired_align > align)
22755 {
22756 if (align_bytes == 0)
22757 {
22758 /* Except for the first move in the epilogue, we no longer know
22759 the constant offset in the aliasing info. It does not seem worth
22760 the pain to maintain it for the first move, so throw away
22761 the info early. */
22762 src = change_address (src, BLKmode, srcreg);
22763 dst = change_address (dst, BLKmode, destreg);
22764 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22765 desired_align);
22766 }
22767 else
22768 {
22769 /* If we know how many bytes need to be copied before dst is
22770 sufficiently aligned, maintain aliasing info accurately. */
22771 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22772 desired_align, align_bytes);
22773 count_exp = plus_constant (counter_mode (count_exp),
22774 count_exp, -align_bytes);
22775 count -= align_bytes;
22776 }
22777 if (need_zero_guard
22778 && (count < (unsigned HOST_WIDE_INT) size_needed
22779 || (align_bytes == 0
22780 && count < ((unsigned HOST_WIDE_INT) size_needed
22781 + desired_align - align))))
22782 {
22783 /* It is possible that we copied enough so the main loop will not
22784 execute. */
22785 gcc_assert (size_needed > 1);
22786 if (label == NULL_RTX)
22787 label = gen_label_rtx ();
22788 emit_cmp_and_jump_insns (count_exp,
22789 GEN_INT (size_needed),
22790 LTU, 0, counter_mode (count_exp), 1, label);
22791 if (expected_size == -1
22792 || expected_size < (desired_align - align) / 2 + size_needed)
22793 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22794 else
22795 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22796 }
22797 }
22798 if (label && size_needed == 1)
22799 {
22800 emit_label (label);
22801 LABEL_NUSES (label) = 1;
22802 label = NULL;
22803 epilogue_size_needed = 1;
22804 }
22805 else if (label == NULL_RTX)
22806 epilogue_size_needed = size_needed;
22807
22808 /* Step 3: Main loop. */
22809
22810 switch (alg)
22811 {
22812 case libcall:
22813 case no_stringop:
22814 gcc_unreachable ();
22815 case loop_1_byte:
22816 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22817 count_exp, QImode, 1, expected_size);
22818 break;
22819 case loop:
22820 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22821 count_exp, word_mode, 1, expected_size);
22822 break;
22823 case unrolled_loop:
22824 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22825 registers for 4 temporaries anyway. */
22826 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22827 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22828 expected_size);
22829 break;
22830 case rep_prefix_8_byte:
22831 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22832 DImode);
22833 break;
22834 case rep_prefix_4_byte:
22835 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22836 SImode);
22837 break;
22838 case rep_prefix_1_byte:
22839 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22840 QImode);
22841 break;
22842 }
22843 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22844 if (CONST_INT_P (count_exp))
22845 {
22846 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22847 (count / size_needed) * size_needed);
22848 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22849 (count / size_needed) * size_needed);
22850 }
22851 else
22852 {
22853 src = change_address (src, BLKmode, srcreg);
22854 dst = change_address (dst, BLKmode, destreg);
22855 }
22856
22857 /* Step 4: Epilogue to copy the remaining bytes. */
22858 epilogue:
22859 if (label)
22860 {
22861 /* When the main loop is done, COUNT_EXP might hold the original count,
22862 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22863 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22864 bytes. Compensate if needed. */
22865
22866 if (size_needed < epilogue_size_needed)
22867 {
22868 tmp =
22869 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22870 GEN_INT (size_needed - 1), count_exp, 1,
22871 OPTAB_DIRECT);
22872 if (tmp != count_exp)
22873 emit_move_insn (count_exp, tmp);
22874 }
22875 emit_label (label);
22876 LABEL_NUSES (label) = 1;
22877 }
22878
22879 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22880 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22881 epilogue_size_needed);
22882 if (jump_around_label)
22883 emit_label (jump_around_label);
22884 return true;
22885 }
22886
22887 /* Helper function for memset. For the QImode value 0xXY produce
22888 0xXYXYXYXY of the width specified by MODE. This is essentially
22889 a multiplication by 0x01010101, but we can do slightly better than
22890 synth_mult by unwinding the sequence by hand on CPUs with
22891 a slow multiply. */
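/* For example, promoting the SImode value 0x4D yields 0x4D4D4D4D, computed
   either as 0x4D * 0x01010101 when the multiply is cheap, or roughly as

	reg |= reg << 8;
	reg |= reg << 16;	(plus reg |= reg << 32 for DImode)

   when shifts and ORs are cheaper than the multiply.  */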
22892 static rtx
22893 promote_duplicated_reg (enum machine_mode mode, rtx val)
22894 {
22895 enum machine_mode valmode = GET_MODE (val);
22896 rtx tmp;
22897 int nops = mode == DImode ? 3 : 2;
22898
22899 gcc_assert (mode == SImode || mode == DImode);
22900 if (val == const0_rtx)
22901 return copy_to_mode_reg (mode, const0_rtx);
22902 if (CONST_INT_P (val))
22903 {
22904 HOST_WIDE_INT v = INTVAL (val) & 255;
22905
22906 v |= v << 8;
22907 v |= v << 16;
22908 if (mode == DImode)
22909 v |= (v << 16) << 16;
22910 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22911 }
22912
22913 if (valmode == VOIDmode)
22914 valmode = QImode;
22915 if (valmode != QImode)
22916 val = gen_lowpart (QImode, val);
22917 if (mode == QImode)
22918 return val;
22919 if (!TARGET_PARTIAL_REG_STALL)
22920 nops--;
22921 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22922 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22923 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22924 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22925 {
22926 rtx reg = convert_modes (mode, QImode, val, true);
22927 tmp = promote_duplicated_reg (mode, const1_rtx);
22928 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22929 OPTAB_DIRECT);
22930 }
22931 else
22932 {
22933 rtx reg = convert_modes (mode, QImode, val, true);
22934
22935 if (!TARGET_PARTIAL_REG_STALL)
22936 if (mode == SImode)
22937 emit_insn (gen_movsi_insv_1 (reg, reg));
22938 else
22939 emit_insn (gen_movdi_insv_1 (reg, reg));
22940 else
22941 {
22942 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22943 NULL, 1, OPTAB_DIRECT);
22944 reg =
22945 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22946 }
22947 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22948 NULL, 1, OPTAB_DIRECT);
22949 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22950 if (mode == SImode)
22951 return reg;
22952 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22953 NULL, 1, OPTAB_DIRECT);
22954 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22955 return reg;
22956 }
22957 }
22958
22959 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
22960 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
22961 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
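/* For example, a 64-bit memset with SIZE_NEEDED == 8 promotes VAL to a
   DImode register holding eight copies of the byte, while SIZE_NEEDED == 4
   with no extra alignment work only needs the SImode promotion.  */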
22962 static rtx
22963 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22964 {
22965 rtx promoted_val;
22966
22967 if (TARGET_64BIT
22968 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22969 promoted_val = promote_duplicated_reg (DImode, val);
22970 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22971 promoted_val = promote_duplicated_reg (SImode, val);
22972 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22973 promoted_val = promote_duplicated_reg (HImode, val);
22974 else
22975 promoted_val = val;
22976
22977 return promoted_val;
22978 }
22979
22980 /* Expand a string set operation (memset). Use i386 string operations when
22981 profitable. See the expand_movmem comment for an explanation of the
22982 individual steps performed. */
22983 bool
22984 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22985 rtx expected_align_exp, rtx expected_size_exp)
22986 {
22987 rtx destreg;
22988 rtx label = NULL;
22989 rtx tmp;
22990 rtx jump_around_label = NULL;
22991 HOST_WIDE_INT align = 1;
22992 unsigned HOST_WIDE_INT count = 0;
22993 HOST_WIDE_INT expected_size = -1;
22994 int size_needed = 0, epilogue_size_needed;
22995 int desired_align = 0, align_bytes = 0;
22996 enum stringop_alg alg;
22997 rtx promoted_val = NULL;
22998 bool force_loopy_epilogue = false;
22999 int dynamic_check;
23000 bool need_zero_guard = false;
23001
23002 if (CONST_INT_P (align_exp))
23003 align = INTVAL (align_exp);
23004 /* i386 can do misaligned access at reasonably increased cost. */
23005 if (CONST_INT_P (expected_align_exp)
23006 && INTVAL (expected_align_exp) > align)
23007 align = INTVAL (expected_align_exp);
23008 if (CONST_INT_P (count_exp))
23009 count = expected_size = INTVAL (count_exp);
23010 if (CONST_INT_P (expected_size_exp) && count == 0)
23011 expected_size = INTVAL (expected_size_exp);
23012
23013 /* Make sure we don't need to care about overflow later on. */
23014 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23015 return false;
23016
23017 /* Step 0: Decide on preferred algorithm, desired alignment and
23018 size of chunks to be copied by main loop. */
23019
23020 alg = decide_alg (count, expected_size, true, &dynamic_check);
23021 desired_align = decide_alignment (align, alg, expected_size);
23022
23023 if (!TARGET_ALIGN_STRINGOPS)
23024 align = desired_align;
23025
23026 if (alg == libcall)
23027 return false;
23028 gcc_assert (alg != no_stringop);
23029 if (!count)
23030 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23031 destreg = copy_addr_to_reg (XEXP (dst, 0));
23032 switch (alg)
23033 {
23034 case libcall:
23035 case no_stringop:
23036 gcc_unreachable ();
23037 case loop:
23038 need_zero_guard = true;
23039 size_needed = GET_MODE_SIZE (word_mode);
23040 break;
23041 case unrolled_loop:
23042 need_zero_guard = true;
23043 size_needed = GET_MODE_SIZE (word_mode) * 4;
23044 break;
23045 case rep_prefix_8_byte:
23046 size_needed = 8;
23047 break;
23048 case rep_prefix_4_byte:
23049 size_needed = 4;
23050 break;
23051 case rep_prefix_1_byte:
23052 size_needed = 1;
23053 break;
23054 case loop_1_byte:
23055 need_zero_guard = true;
23056 size_needed = 1;
23057 break;
23058 }
23059 epilogue_size_needed = size_needed;
23060
23061 /* Step 1: Prologue guard. */
23062
23063 /* Alignment code needs count to be in register. */
23064 if (CONST_INT_P (count_exp) && desired_align > align)
23065 {
23066 if (INTVAL (count_exp) > desired_align
23067 && INTVAL (count_exp) > size_needed)
23068 {
23069 align_bytes
23070 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23071 if (align_bytes <= 0)
23072 align_bytes = 0;
23073 else
23074 align_bytes = desired_align - align_bytes;
23075 }
23076 if (align_bytes == 0)
23077 {
23078 enum machine_mode mode = SImode;
23079 if (TARGET_64BIT && (count & ~0xffffffff))
23080 mode = DImode;
23081 count_exp = force_reg (mode, count_exp);
23082 }
23083 }
23084 /* Do the cheap promotion to allow better CSE across the
23085 main loop and epilogue (i.e. one load of the big constant in
23086 front of all the code). */
23087 if (CONST_INT_P (val_exp))
23088 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23089 desired_align, align);
23090 /* Ensure that alignment prologue won't copy past end of block. */
23091 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23092 {
23093 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23094 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23095 Make sure it is a power of 2. */
23096 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23097
23098 /* To improve performance of small blocks, we jump around the VAL
23099 promoting code. This means that if the promoted VAL is not a constant,
23100 we might not use it in the epilogue and have to use the byte
23101 loop variant. */
23102 if (epilogue_size_needed > 2 && !promoted_val)
23103 force_loopy_epilogue = true;
23104 if (count)
23105 {
23106 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23107 {
23108 /* If the main algorithm works on QImode, no epilogue is needed.
23109 For small sizes just don't align anything. */
23110 if (size_needed == 1)
23111 desired_align = align;
23112 else
23113 goto epilogue;
23114 }
23115 }
23116 else
23117 {
23118 label = gen_label_rtx ();
23119 emit_cmp_and_jump_insns (count_exp,
23120 GEN_INT (epilogue_size_needed),
23121 LTU, 0, counter_mode (count_exp), 1, label);
23122 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23123 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23124 else
23125 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23126 }
23127 }
23128 if (dynamic_check != -1)
23129 {
23130 rtx hot_label = gen_label_rtx ();
23131 jump_around_label = gen_label_rtx ();
23132 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23133 LEU, 0, counter_mode (count_exp), 1, hot_label);
23134 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23135 set_storage_via_libcall (dst, count_exp, val_exp, false);
23136 emit_jump (jump_around_label);
23137 emit_label (hot_label);
23138 }
23139
23140 /* Step 2: Alignment prologue. */
23141
23142 /* Do the expensive promotion once we branched off the small blocks. */
23143 if (!promoted_val)
23144 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23145 desired_align, align);
23146 gcc_assert (desired_align >= 1 && align >= 1);
23147
23148 if (desired_align > align)
23149 {
23150 if (align_bytes == 0)
23151 {
23152 /* Except for the first move in the epilogue, we no longer know
23153 the constant offset in the aliasing info. It does not seem worth
23154 the pain to maintain it for the first move, so throw away
23155 the info early. */
23156 dst = change_address (dst, BLKmode, destreg);
23157 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23158 desired_align);
23159 }
23160 else
23161 {
23162 /* If we know how many bytes need to be stored before dst is
23163 sufficiently aligned, maintain aliasing info accurately. */
23164 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23165 desired_align, align_bytes);
23166 count_exp = plus_constant (counter_mode (count_exp),
23167 count_exp, -align_bytes);
23168 count -= align_bytes;
23169 }
23170 if (need_zero_guard
23171 && (count < (unsigned HOST_WIDE_INT) size_needed
23172 || (align_bytes == 0
23173 && count < ((unsigned HOST_WIDE_INT) size_needed
23174 + desired_align - align))))
23175 {
23176 /* It is possible that we copied enough so the main loop will not
23177 execute. */
23178 gcc_assert (size_needed > 1);
23179 if (label == NULL_RTX)
23180 label = gen_label_rtx ();
23181 emit_cmp_and_jump_insns (count_exp,
23182 GEN_INT (size_needed),
23183 LTU, 0, counter_mode (count_exp), 1, label);
23184 if (expected_size == -1
23185 || expected_size < (desired_align - align) / 2 + size_needed)
23186 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23187 else
23188 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23189 }
23190 }
23191 if (label && size_needed == 1)
23192 {
23193 emit_label (label);
23194 LABEL_NUSES (label) = 1;
23195 label = NULL;
23196 promoted_val = val_exp;
23197 epilogue_size_needed = 1;
23198 }
23199 else if (label == NULL_RTX)
23200 epilogue_size_needed = size_needed;
23201
23202 /* Step 3: Main loop. */
23203
23204 switch (alg)
23205 {
23206 case libcall:
23207 case no_stringop:
23208 gcc_unreachable ();
23209 case loop_1_byte:
23210 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23211 count_exp, QImode, 1, expected_size);
23212 break;
23213 case loop:
23214 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23215 count_exp, word_mode, 1, expected_size);
23216 break;
23217 case unrolled_loop:
23218 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23219 count_exp, word_mode, 4, expected_size);
23220 break;
23221 case rep_prefix_8_byte:
23222 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23223 DImode, val_exp);
23224 break;
23225 case rep_prefix_4_byte:
23226 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23227 SImode, val_exp);
23228 break;
23229 case rep_prefix_1_byte:
23230 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23231 QImode, val_exp);
23232 break;
23233 }
23234 /* Properly adjust the offset of the dest memory for aliasing. */
23235 if (CONST_INT_P (count_exp))
23236 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23237 (count / size_needed) * size_needed);
23238 else
23239 dst = change_address (dst, BLKmode, destreg);
23240
23241 /* Step 4: Epilogue to copy the remaining bytes. */
23242
23243 if (label)
23244 {
23245 /* When the main loop is done, COUNT_EXP might hold the original count,
23246 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23247 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23248 bytes. Compensate if needed. */
23249
23250 if (size_needed < epilogue_size_needed)
23251 {
23252 tmp =
23253 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23254 GEN_INT (size_needed - 1), count_exp, 1,
23255 OPTAB_DIRECT);
23256 if (tmp != count_exp)
23257 emit_move_insn (count_exp, tmp);
23258 }
23259 emit_label (label);
23260 LABEL_NUSES (label) = 1;
23261 }
23262 epilogue:
23263 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23264 {
23265 if (force_loopy_epilogue)
23266 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23267 epilogue_size_needed);
23268 else
23269 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23270 epilogue_size_needed);
23271 }
23272 if (jump_around_label)
23273 emit_label (jump_around_label);
23274 return true;
23275 }
23276
23277 /* Expand the appropriate insns for doing strlen if not just doing
23278 repnz; scasb
23279
23280 out = result, initialized with the start address
23281 align_rtx = alignment of the address.
23282 scratch = scratch register, initialized with the start address when
23283 not aligned, otherwise undefined.
23284
23285 This is just the body. It needs the initializations mentioned above and
23286 some address computing at the end. These things are done in i386.md. */
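/* Roughly, the body emitted below behaves like:

	while ((out & 3) != 0)		-- at most 3 byte checks
	  { if (*out == 0) goto done; out++; }
	do {				-- 4 bytes per iteration
	  word = *(unsigned int *) out; out += 4;
	} while (((word - 0x01010101) & ~word & 0x80808080) == 0);
	<locate the zero byte within WORD and back OUT up accordingly>
     done:  */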
23287
23288 static void
23289 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23290 {
23291 int align;
23292 rtx tmp;
23293 rtx align_2_label = NULL_RTX;
23294 rtx align_3_label = NULL_RTX;
23295 rtx align_4_label = gen_label_rtx ();
23296 rtx end_0_label = gen_label_rtx ();
23297 rtx mem;
23298 rtx tmpreg = gen_reg_rtx (SImode);
23299 rtx scratch = gen_reg_rtx (SImode);
23300 rtx cmp;
23301
23302 align = 0;
23303 if (CONST_INT_P (align_rtx))
23304 align = INTVAL (align_rtx);
23305
23306 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23307
23308 /* Is there a known alignment and is it less than 4? */
23309 if (align < 4)
23310 {
23311 rtx scratch1 = gen_reg_rtx (Pmode);
23312 emit_move_insn (scratch1, out);
23313 /* Is there a known alignment and is it not 2? */
23314 if (align != 2)
23315 {
23316 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23317 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23318
23319 /* Leave just the 3 lower bits. */
23320 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23321 NULL_RTX, 0, OPTAB_WIDEN);
23322
23323 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23324 Pmode, 1, align_4_label);
23325 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23326 Pmode, 1, align_2_label);
23327 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23328 Pmode, 1, align_3_label);
23329 }
23330 else
23331 {
23332 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23333 check whether it is aligned to a 4-byte boundary. */
23334
23335 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23336 NULL_RTX, 0, OPTAB_WIDEN);
23337
23338 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23339 Pmode, 1, align_4_label);
23340 }
23341
23342 mem = change_address (src, QImode, out);
23343
23344 /* Now compare the bytes. */
23345
23346 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23347 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23348 QImode, 1, end_0_label);
23349
23350 /* Increment the address. */
23351 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23352
23353 /* Not needed with an alignment of 2 */
23354 if (align != 2)
23355 {
23356 emit_label (align_2_label);
23357
23358 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23359 end_0_label);
23360
23361 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23362
23363 emit_label (align_3_label);
23364 }
23365
23366 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23367 end_0_label);
23368
23369 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23370 }
23371
23372 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23373 align this loop: it only makes the program larger and does not help
23374 speed it up. */
23375 emit_label (align_4_label);
23376
23377 mem = change_address (src, SImode, out);
23378 emit_move_insn (scratch, mem);
23379 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23380
23381 /* This formula yields a nonzero result iff one of the bytes is zero.
23382 This saves three branches inside the loop and many cycles. */
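  /* Illustration of the test below, which computes
     (x - 0x01010101) & ~x & 0x80808080: for x = 0x11220033 (a zero in
     byte 1) this is 0x1020FF32 & 0xEEDDFFCC & 0x80808080 = 0x00008000,
     while for x = 0x11223344 it is 0x10213243 & 0xEEDDCCBB & 0x80808080
     = 0, so the loop below keeps going.  */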
23383
23384 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23385 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23386 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23387 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23388 gen_int_mode (0x80808080, SImode)));
23389 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23390 align_4_label);
23391
23392 if (TARGET_CMOVE)
23393 {
23394 rtx reg = gen_reg_rtx (SImode);
23395 rtx reg2 = gen_reg_rtx (Pmode);
23396 emit_move_insn (reg, tmpreg);
23397 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23398
23399 /* If zero is not in the first two bytes, move two bytes forward. */
23400 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23401 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23402 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23403 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23404 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23405 reg,
23406 tmpreg)));
23407 /* Emit lea manually to avoid clobbering of flags. */
23408 emit_insn (gen_rtx_SET (SImode, reg2,
23409 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23410
23411 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23412 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23413 emit_insn (gen_rtx_SET (VOIDmode, out,
23414 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23415 reg2,
23416 out)));
23417 }
23418 else
23419 {
23420 rtx end_2_label = gen_label_rtx ();
23421 /* Is zero in the first two bytes? */
23422
23423 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23424 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23425 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23426 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23427 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23428 pc_rtx);
23429 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23430 JUMP_LABEL (tmp) = end_2_label;
23431
23432 /* Not in the first two. Move two bytes forward. */
23433 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23434 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23435
23436 emit_label (end_2_label);
23437
23438 }
23439
23440 /* Avoid branch in fixing the byte. */
23441 tmpreg = gen_lowpart (QImode, tmpreg);
23442 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23443 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23444 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23445 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23446
23447 emit_label (end_0_label);
23448 }
23449
23450 /* Expand strlen. */
23451
23452 bool
23453 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23454 {
23455 rtx addr, scratch1, scratch2, scratch3, scratch4;
23456
23457 /* The generic case of the strlen expander is long. Avoid expanding it
23458 unless TARGET_INLINE_ALL_STRINGOPS. */
23459
23460 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23461 && !TARGET_INLINE_ALL_STRINGOPS
23462 && !optimize_insn_for_size_p ()
23463 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23464 return false;
23465
23466 addr = force_reg (Pmode, XEXP (src, 0));
23467 scratch1 = gen_reg_rtx (Pmode);
23468
23469 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23470 && !optimize_insn_for_size_p ())
23471 {
23472 /* Well it seems that some optimizer does not combine a call like
23473 foo(strlen(bar), strlen(bar));
23474 when the move and the subtraction are done here. It does calculate
23475 the length just once when these instructions are done inside of
23476 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23477 often used and I use one fewer register for the lifetime of
23478 output_strlen_unroll() this is better. */
23479
23480 emit_move_insn (out, addr);
23481
23482 ix86_expand_strlensi_unroll_1 (out, src, align);
23483
23484 /* strlensi_unroll_1 returns the address of the zero at the end of
23485 the string, like memchr(), so compute the length by subtracting
23486 the start address. */
23487 emit_insn (ix86_gen_sub3 (out, out, addr));
23488 }
23489 else
23490 {
23491 rtx unspec;
23492
23493 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23494 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23495 return false;
23496
23497 scratch2 = gen_reg_rtx (Pmode);
23498 scratch3 = gen_reg_rtx (Pmode);
23499 scratch4 = force_reg (Pmode, constm1_rtx);
23500
23501 emit_move_insn (scratch3, addr);
23502 eoschar = force_reg (QImode, eoschar);
23503
23504 src = replace_equiv_address_nv (src, scratch3);
23505
23506 /* If .md starts supporting :P, this can be done in .md. */
23507 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23508 scratch4), UNSPEC_SCAS);
23509 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23510 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23511 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23512 }
23513 return true;
23514 }
23515
23516 /* For a given symbol (function), construct code to compute the address of its
23517 PLT entry in the large x86-64 PIC model. */
23518 static rtx
23519 construct_plt_address (rtx symbol)
23520 {
23521 rtx tmp, unspec;
23522
23523 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23524 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23525 gcc_assert (Pmode == DImode);
23526
23527 tmp = gen_reg_rtx (Pmode);
23528 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23529
23530 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23531 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23532 return tmp;
23533 }
23534
23535 rtx
23536 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23537 rtx callarg2,
23538 rtx pop, bool sibcall)
23539 {
23540 /* We need to represent that SI, DI and the SSE registers XMM6-XMM15
23541 are clobbered by SYSV calls made from MS ABI code. */
23542 static int clobbered_registers[] = {
23543 XMM6_REG, XMM7_REG, XMM8_REG,
23544 XMM9_REG, XMM10_REG, XMM11_REG,
23545 XMM12_REG, XMM13_REG, XMM14_REG,
23546 XMM15_REG, SI_REG, DI_REG
23547 };
23548 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23549 rtx use = NULL, call;
23550 unsigned int vec_len;
23551
23552 if (pop == const0_rtx)
23553 pop = NULL;
23554 gcc_assert (!TARGET_64BIT || !pop);
23555
23556 if (TARGET_MACHO && !TARGET_64BIT)
23557 {
23558 #if TARGET_MACHO
23559 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23560 fnaddr = machopic_indirect_call_target (fnaddr);
23561 #endif
23562 }
23563 else
23564 {
23565 /* Static functions and indirect calls don't need the pic register. */
23566 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23567 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23568 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23569 use_reg (&use, pic_offset_table_rtx);
23570 }
23571
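/* In 64-bit mode a nonnegative CALLARG2 is the number of SSE registers
   used by a varargs call; the ABI expects that count in AL. */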
23572 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23573 {
23574 rtx al = gen_rtx_REG (QImode, AX_REG);
23575 emit_move_insn (al, callarg2);
23576 use_reg (&use, al);
23577 }
23578
23579 if (ix86_cmodel == CM_LARGE_PIC
23580 && MEM_P (fnaddr)
23581 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23582 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23583 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23584 else if (sibcall
23585 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23586 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23587 {
23588 fnaddr = XEXP (fnaddr, 0);
23589 if (GET_MODE (fnaddr) != word_mode)
23590 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23591 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23592 }
23593
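/* Assemble the call pattern. Any extra components (stack pointer
   adjustment for POP, MS ABI to SYSV clobbers, vzeroupper marker) are
   collected in VEC and, if present, wrapped together with the call in a
   PARALLEL below. */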
23594 vec_len = 0;
23595 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23596 if (retval)
23597 call = gen_rtx_SET (VOIDmode, retval, call);
23598 vec[vec_len++] = call;
23599
23600 if (pop)
23601 {
23602 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23603 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23604 vec[vec_len++] = pop;
23605 }
23606
23607 if (TARGET_64BIT_MS_ABI
23608 && (!callarg2 || INTVAL (callarg2) != -2))
23609 {
23610 unsigned i;
23611
23612 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23613 UNSPEC_MS_TO_SYSV_CALL);
23614
23615 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23616 vec[vec_len++]
23617 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23618 ? TImode : DImode,
23619 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23620 ? TImode : DImode,
23621 clobbered_registers[i]));
23622 }
23623
23624 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23625 if (TARGET_VZEROUPPER)
23626 {
23627 int avx256;
23628 if (cfun->machine->callee_pass_avx256_p)
23629 {
23630 if (cfun->machine->callee_return_avx256_p)
23631 avx256 = callee_return_pass_avx256;
23632 else
23633 avx256 = callee_pass_avx256;
23634 }
23635 else if (cfun->machine->callee_return_avx256_p)
23636 avx256 = callee_return_avx256;
23637 else
23638 avx256 = call_no_avx256;
23639
23640 if (reload_completed)
23641 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23642 else
23643 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23644 gen_rtvec (1, GEN_INT (avx256)),
23645 UNSPEC_CALL_NEEDS_VZEROUPPER);
23646 }
23647
23648 if (vec_len > 1)
23649 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23650 call = emit_call_insn (call);
23651 if (use)
23652 CALL_INSN_FUNCTION_USAGE (call) = use;
23653
23654 return call;
23655 }
23656
23657 void
23658 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23659 {
23660 rtx pat = PATTERN (insn);
23661 rtvec vec = XVEC (pat, 0);
23662 int len = GET_NUM_ELEM (vec) - 1;
23663
23664 /* Strip off the last entry of the parallel. */
23665 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23666 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23667 if (len == 1)
23668 pat = RTVEC_ELT (vec, 0);
23669 else
23670 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23671
23672 emit_insn (gen_avx_vzeroupper (vzeroupper));
23673 emit_call_insn (pat);
23674 }
23675
23676 /* Output the assembly for a call instruction. */
23677
23678 const char *
23679 ix86_output_call_insn (rtx insn, rtx call_op)
23680 {
23681 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23682 bool seh_nop_p = false;
23683 const char *xasm;
23684
23685 if (SIBLING_CALL_P (insn))
23686 {
23687 if (direct_p)
23688 xasm = "jmp\t%P0";
23689 /* SEH epilogue detection requires the indirect branch case
23690 to include REX.W. */
23691 else if (TARGET_SEH)
23692 xasm = "rex.W jmp %A0";
23693 else
23694 xasm = "jmp\t%A0";
23695
23696 output_asm_insn (xasm, &call_op);
23697 return "";
23698 }
23699
23700 /* SEH unwinding can require an extra nop to be emitted in several
23701 circumstances. Determine if we have one of those. */
23702 if (TARGET_SEH)
23703 {
23704 rtx i;
23705
23706 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23707 {
23708 /* If we get to another real insn, we don't need the nop. */
23709 if (INSN_P (i))
23710 break;
23711
23712 /* If we get to the epilogue note, prevent a catch region from
23713 being adjacent to the standard epilogue sequence. If non-
23714 call-exceptions, we'll have done this during epilogue emission. */
23715 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23716 && !flag_non_call_exceptions
23717 && !can_throw_internal (insn))
23718 {
23719 seh_nop_p = true;
23720 break;
23721 }
23722 }
23723
23724 /* If we didn't find a real insn following the call, prevent the
23725 unwinder from looking into the next function. */
23726 if (i == NULL)
23727 seh_nop_p = true;
23728 }
23729
23730 if (direct_p)
23731 xasm = "call\t%P0";
23732 else
23733 xasm = "call\t%A0";
23734
23735 output_asm_insn (xasm, &call_op);
23736
23737 if (seh_nop_p)
23738 return "nop";
23739
23740 return "";
23741 }
23742 \f
23743 /* Clear stack slot assignments remembered from previous functions.
23744 This is called from INIT_EXPANDERS once before RTL is emitted for each
23745 function. */
23746
23747 static struct machine_function *
23748 ix86_init_machine_status (void)
23749 {
23750 struct machine_function *f;
23751
23752 f = ggc_alloc_cleared_machine_function ();
23753 f->use_fast_prologue_epilogue_nregs = -1;
23754 f->tls_descriptor_call_expanded_p = 0;
23755 f->call_abi = ix86_abi;
23756
23757 return f;
23758 }
23759
23760 /* Return a MEM corresponding to a stack slot with mode MODE.
23761 Allocate a new slot if necessary.
23762
23763 The RTL for a function can have several slots available: N is
23764 which slot to use. */
23765
23766 rtx
23767 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23768 {
23769 struct stack_local_entry *s;
23770
23771 gcc_assert (n < MAX_386_STACK_LOCALS);
23772
23773 /* Virtual slot is valid only before vregs are instantiated. */
23774 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23775
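/* Reuse an existing slot with the same mode and purpose, if one has
   already been allocated for this function. */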
23776 for (s = ix86_stack_locals; s; s = s->next)
23777 if (s->mode == mode && s->n == n)
23778 return validize_mem (copy_rtx (s->rtl));
23779
23780 s = ggc_alloc_stack_local_entry ();
23781 s->n = n;
23782 s->mode = mode;
23783 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23784
23785 s->next = ix86_stack_locals;
23786 ix86_stack_locals = s;
23787 return validize_mem (s->rtl);
23788 }
23789 \f
23790 /* Calculate the length of the memory address in the instruction encoding.
23791 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23792 or other prefixes. We never generate addr32 prefix for LEA insn. */
23793
23794 int
23795 memory_address_length (rtx addr, bool lea)
23796 {
23797 struct ix86_address parts;
23798 rtx base, index, disp;
23799 int len;
23800 int ok;
23801
23802 if (GET_CODE (addr) == PRE_DEC
23803 || GET_CODE (addr) == POST_INC
23804 || GET_CODE (addr) == PRE_MODIFY
23805 || GET_CODE (addr) == POST_MODIFY)
23806 return 0;
23807
23808 ok = ix86_decompose_address (addr, &parts);
23809 gcc_assert (ok);
23810
23811 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23812
23813 /* If this is not LEA instruction, add the length of addr32 prefix. */
23814 if (TARGET_64BIT && !lea
23815 && (SImode_address_operand (addr, VOIDmode)
23816 || (parts.base && GET_MODE (parts.base) == SImode)
23817 || (parts.index && GET_MODE (parts.index) == SImode)))
23818 len++;
23819
23820 base = parts.base;
23821 index = parts.index;
23822 disp = parts.disp;
23823
23824 if (base && GET_CODE (base) == SUBREG)
23825 base = SUBREG_REG (base);
23826 if (index && GET_CODE (index) == SUBREG)
23827 index = SUBREG_REG (index);
23828
23829 gcc_assert (base == NULL_RTX || REG_P (base));
23830 gcc_assert (index == NULL_RTX || REG_P (index));
23831
23832 /* Rule of thumb:
23833 - esp as the base always wants an index,
23834 - ebp as the base always wants a displacement,
23835 - r12 as the base always wants an index,
23836 - r13 as the base always wants a displacement. */
23837
23838 /* Register Indirect. */
23839 if (base && !index && !disp)
23840 {
23841 /* esp (for its index) and ebp (for its displacement) need
23842 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23843 code. */
23844 if (base == arg_pointer_rtx
23845 || base == frame_pointer_rtx
23846 || REGNO (base) == SP_REG
23847 || REGNO (base) == BP_REG
23848 || REGNO (base) == R12_REG
23849 || REGNO (base) == R13_REG)
23850 len++;
23851 }
23852
23853 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23854 is not disp32, but disp32(%rip), so for disp32
23855 SIB byte is needed, unless print_operand_address
23856 optimizes it into disp32(%rip) or (%rip) is implied
23857 by UNSPEC. */
23858 else if (disp && !base && !index)
23859 {
23860 len += 4;
23861 if (TARGET_64BIT)
23862 {
23863 rtx symbol = disp;
23864
23865 if (GET_CODE (disp) == CONST)
23866 symbol = XEXP (disp, 0);
23867 if (GET_CODE (symbol) == PLUS
23868 && CONST_INT_P (XEXP (symbol, 1)))
23869 symbol = XEXP (symbol, 0);
23870
23871 if (GET_CODE (symbol) != LABEL_REF
23872 && (GET_CODE (symbol) != SYMBOL_REF
23873 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23874 && (GET_CODE (symbol) != UNSPEC
23875 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23876 && XINT (symbol, 1) != UNSPEC_PCREL
23877 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23878 len++;
23879 }
23880 }
23881 else
23882 {
23883 /* Find the length of the displacement constant. */
23884 if (disp)
23885 {
23886 if (base && satisfies_constraint_K (disp))
23887 len += 1;
23888 else
23889 len += 4;
23890 }
23891 /* ebp always wants a displacement. Similarly r13. */
23892 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23893 len++;
23894
23895 /* An index requires the two-byte modrm form.... */
23896 if (index
23897 /* ...like esp (or r12), which always wants an index. */
23898 || base == arg_pointer_rtx
23899 || base == frame_pointer_rtx
23900 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23901 len++;
23902 }
23903
23904 return len;
23905 }
23906
23907 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23908 is set, expect that the insn has an 8-bit immediate alternative. */
23909 int
23910 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23911 {
23912 int len = 0;
23913 int i;
23914 extract_insn_cached (insn);
23915 for (i = recog_data.n_operands - 1; i >= 0; --i)
23916 if (CONSTANT_P (recog_data.operand[i]))
23917 {
23918 enum attr_mode mode = get_attr_mode (insn);
23919
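/* At most one immediate operand is expected; LEN must still be zero here. */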
23920 gcc_assert (!len);
23921 if (shortform && CONST_INT_P (recog_data.operand[i]))
23922 {
23923 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23924 switch (mode)
23925 {
23926 case MODE_QI:
23927 len = 1;
23928 continue;
23929 case MODE_HI:
23930 ival = trunc_int_for_mode (ival, HImode);
23931 break;
23932 case MODE_SI:
23933 ival = trunc_int_for_mode (ival, SImode);
23934 break;
23935 default:
23936 break;
23937 }
23938 if (IN_RANGE (ival, -128, 127))
23939 {
23940 len = 1;
23941 continue;
23942 }
23943 }
23944 switch (mode)
23945 {
23946 case MODE_QI:
23947 len = 1;
23948 break;
23949 case MODE_HI:
23950 len = 2;
23951 break;
23952 case MODE_SI:
23953 len = 4;
23954 break;
23955 /* Immediates for DImode instructions are encoded
23956 as 32bit sign extended values. */
23957 case MODE_DI:
23958 len = 4;
23959 break;
23960 default:
23961 fatal_insn ("unknown insn mode", insn);
23962 }
23963 }
23964 return len;
23965 }
23966
23967 /* Compute default value for "length_address" attribute. */
23968 int
23969 ix86_attr_length_address_default (rtx insn)
23970 {
23971 int i;
23972
23973 if (get_attr_type (insn) == TYPE_LEA)
23974 {
23975 rtx set = PATTERN (insn), addr;
23976
23977 if (GET_CODE (set) == PARALLEL)
23978 set = XVECEXP (set, 0, 0);
23979
23980 gcc_assert (GET_CODE (set) == SET);
23981
23982 addr = SET_SRC (set);
23983
23984 return memory_address_length (addr, true);
23985 }
23986
23987 extract_insn_cached (insn);
23988 for (i = recog_data.n_operands - 1; i >= 0; --i)
23989 if (MEM_P (recog_data.operand[i]))
23990 {
23991 constrain_operands_cached (reload_completed);
23992 if (which_alternative != -1)
23993 {
23994 const char *constraints = recog_data.constraints[i];
23995 int alt = which_alternative;
23996
23997 while (*constraints == '=' || *constraints == '+')
23998 constraints++;
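/* Step over the constraint strings of earlier alternatives to reach
   the one for the matched alternative. */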
23999 while (alt-- > 0)
24000 while (*constraints++ != ',')
24001 ;
24002 /* Skip ignored operands. */
24003 if (*constraints == 'X')
24004 continue;
24005 }
24006 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24007 }
24008 return 0;
24009 }
24010
24011 /* Compute default value for "length_vex" attribute. It includes
24012 2 or 3 byte VEX prefix and 1 opcode byte. */
24013
24014 int
24015 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24016 {
24017 int i;
24018
24019 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
24020 requires the 3-byte VEX prefix. */
24021 if (!has_0f_opcode || has_vex_w)
24022 return 3 + 1;
24023
24024 /* We can always use 2 byte VEX prefix in 32bit. */
24025 if (!TARGET_64BIT)
24026 return 2 + 1;
24027
24028 extract_insn_cached (insn);
24029
24030 for (i = recog_data.n_operands - 1; i >= 0; --i)
24031 if (REG_P (recog_data.operand[i]))
24032 {
24033 /* REX.W bit uses 3 byte VEX prefix. */
24034 if (GET_MODE (recog_data.operand[i]) == DImode
24035 && GENERAL_REG_P (recog_data.operand[i]))
24036 return 3 + 1;
24037 }
24038 else
24039 {
24040 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24041 if (MEM_P (recog_data.operand[i])
24042 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24043 return 3 + 1;
24044 }
24045
24046 return 2 + 1;
24047 }
24048 \f
24049 /* Return the maximum number of instructions a cpu can issue. */
24050
24051 static int
24052 ix86_issue_rate (void)
24053 {
24054 switch (ix86_tune)
24055 {
24056 case PROCESSOR_PENTIUM:
24057 case PROCESSOR_ATOM:
24058 case PROCESSOR_K6:
24059 case PROCESSOR_BTVER2:
24060 return 2;
24061
24062 case PROCESSOR_PENTIUMPRO:
24063 case PROCESSOR_PENTIUM4:
24064 case PROCESSOR_CORE2_32:
24065 case PROCESSOR_CORE2_64:
24066 case PROCESSOR_COREI7_32:
24067 case PROCESSOR_COREI7_64:
24068 case PROCESSOR_ATHLON:
24069 case PROCESSOR_K8:
24070 case PROCESSOR_AMDFAM10:
24071 case PROCESSOR_NOCONA:
24072 case PROCESSOR_GENERIC32:
24073 case PROCESSOR_GENERIC64:
24074 case PROCESSOR_BDVER1:
24075 case PROCESSOR_BDVER2:
24076 case PROCESSOR_BTVER1:
24077 return 3;
24078
24079 default:
24080 return 1;
24081 }
24082 }
24083
24084 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24085 set by DEP_INSN but nothing else set by DEP_INSN. */
24086
24087 static bool
24088 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24089 {
24090 rtx set, set2;
24091
24092 /* Simplify the test for uninteresting insns. */
24093 if (insn_type != TYPE_SETCC
24094 && insn_type != TYPE_ICMOV
24095 && insn_type != TYPE_FCMOV
24096 && insn_type != TYPE_IBR)
24097 return false;
24098
24099 if ((set = single_set (dep_insn)) != 0)
24100 {
24101 set = SET_DEST (set);
24102 set2 = NULL_RTX;
24103 }
24104 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24105 && XVECLEN (PATTERN (dep_insn), 0) == 2
24106 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24107 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24108 {
24109 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24110 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24111 }
24112 else
24113 return false;
24114
24115 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24116 return false;
24117
24118 /* This test is true if the dependent insn reads the flags but
24119 not any other potentially set register. */
24120 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24121 return false;
24122
24123 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24124 return false;
24125
24126 return true;
24127 }
24128
24129 /* Return true iff USE_INSN has a memory address with operands set by
24130 SET_INSN. */
24131
24132 bool
24133 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24134 {
24135 int i;
24136 extract_insn_cached (use_insn);
24137 for (i = recog_data.n_operands - 1; i >= 0; --i)
24138 if (MEM_P (recog_data.operand[i]))
24139 {
24140 rtx addr = XEXP (recog_data.operand[i], 0);
24141 return modified_in_p (addr, set_insn) != 0;
24142 }
24143 return false;
24144 }
24145
24146 static int
24147 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24148 {
24149 enum attr_type insn_type, dep_insn_type;
24150 enum attr_memory memory;
24151 rtx set, set2;
24152 int dep_insn_code_number;
24153
24154 /* Anti and output dependencies have zero cost on all CPUs. */
24155 if (REG_NOTE_KIND (link) != 0)
24156 return 0;
24157
24158 dep_insn_code_number = recog_memoized (dep_insn);
24159
24160 /* If we can't recognize the insns, we can't really do anything. */
24161 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24162 return cost;
24163
24164 insn_type = get_attr_type (insn);
24165 dep_insn_type = get_attr_type (dep_insn);
24166
24167 switch (ix86_tune)
24168 {
24169 case PROCESSOR_PENTIUM:
24170 /* Address Generation Interlock adds a cycle of latency. */
24171 if (insn_type == TYPE_LEA)
24172 {
24173 rtx addr = PATTERN (insn);
24174
24175 if (GET_CODE (addr) == PARALLEL)
24176 addr = XVECEXP (addr, 0, 0);
24177
24178 gcc_assert (GET_CODE (addr) == SET);
24179
24180 addr = SET_SRC (addr);
24181 if (modified_in_p (addr, dep_insn))
24182 cost += 1;
24183 }
24184 else if (ix86_agi_dependent (dep_insn, insn))
24185 cost += 1;
24186
24187 /* ??? Compares pair with jump/setcc. */
24188 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24189 cost = 0;
24190
24191 /* Floating point stores require the value to be ready one cycle earlier. */
24192 if (insn_type == TYPE_FMOV
24193 && get_attr_memory (insn) == MEMORY_STORE
24194 && !ix86_agi_dependent (dep_insn, insn))
24195 cost += 1;
24196 break;
24197
24198 case PROCESSOR_PENTIUMPRO:
24199 memory = get_attr_memory (insn);
24200
24201 /* INT->FP conversion is expensive. */
24202 if (get_attr_fp_int_src (dep_insn))
24203 cost += 5;
24204
24205 /* There is one cycle extra latency between an FP op and a store. */
24206 if (insn_type == TYPE_FMOV
24207 && (set = single_set (dep_insn)) != NULL_RTX
24208 && (set2 = single_set (insn)) != NULL_RTX
24209 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24210 && MEM_P (SET_DEST (set2)))
24211 cost += 1;
24212
24213 /* Show the ability of the reorder buffer to hide the latency of a load
24214 by executing it in parallel with the previous instruction, in case the
24215 previous instruction is not needed to compute the address. */
24216 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24217 && !ix86_agi_dependent (dep_insn, insn))
24218 {
24219 /* Claim moves to take one cycle, as the core can issue one load
24220 at a time and the next load can start a cycle later. */
24221 if (dep_insn_type == TYPE_IMOV
24222 || dep_insn_type == TYPE_FMOV)
24223 cost = 1;
24224 else if (cost > 1)
24225 cost--;
24226 }
24227 break;
24228
24229 case PROCESSOR_K6:
24230 memory = get_attr_memory (insn);
24231
24232 /* The esp dependency is resolved before the instruction is really
24233 finished. */
24234 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24235 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24236 return 1;
24237
24238 /* INT->FP conversion is expensive. */
24239 if (get_attr_fp_int_src (dep_insn))
24240 cost += 5;
24241
24242 /* Show the ability of the reorder buffer to hide the latency of a load
24243 by executing it in parallel with the previous instruction, in case the
24244 previous instruction is not needed to compute the address. */
24245 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24246 && !ix86_agi_dependent (dep_insn, insn))
24247 {
24248 /* Claim moves to take one cycle, as the core can issue one load
24249 at a time and the next load can start a cycle later. */
24250 if (dep_insn_type == TYPE_IMOV
24251 || dep_insn_type == TYPE_FMOV)
24252 cost = 1;
24253 else if (cost > 2)
24254 cost -= 2;
24255 else
24256 cost = 1;
24257 }
24258 break;
24259
24260 case PROCESSOR_ATHLON:
24261 case PROCESSOR_K8:
24262 case PROCESSOR_AMDFAM10:
24263 case PROCESSOR_BDVER1:
24264 case PROCESSOR_BDVER2:
24265 case PROCESSOR_BTVER1:
24266 case PROCESSOR_BTVER2:
24267 case PROCESSOR_ATOM:
24268 case PROCESSOR_GENERIC32:
24269 case PROCESSOR_GENERIC64:
24270 memory = get_attr_memory (insn);
24271
24272 /* Show the ability of the reorder buffer to hide the latency of a load
24273 by executing it in parallel with the previous instruction, in case the
24274 previous instruction is not needed to compute the address. */
24275 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24276 && !ix86_agi_dependent (dep_insn, insn))
24277 {
24278 enum attr_unit unit = get_attr_unit (insn);
24279 int loadcost = 3;
24280
24281 /* Because of the difference between the length of integer and
24282 floating unit pipeline preparation stages, the memory operands
24283 for floating point are cheaper.
24284
24285 ??? For Athlon the difference is most probably 2. */
24286 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24287 loadcost = 3;
24288 else
24289 loadcost = TARGET_ATHLON ? 2 : 0;
24290
24291 if (cost >= loadcost)
24292 cost -= loadcost;
24293 else
24294 cost = 0;
24295 }
24296
24297 default:
24298 break;
24299 }
24300
24301 return cost;
24302 }
24303
24304 /* How many alternative schedules to try. This should be as wide as the
24305 scheduling freedom in the DFA, but no wider. Making this value too
24306 large results in extra work for the scheduler. */
24307
24308 static int
24309 ia32_multipass_dfa_lookahead (void)
24310 {
24311 switch (ix86_tune)
24312 {
24313 case PROCESSOR_PENTIUM:
24314 return 2;
24315
24316 case PROCESSOR_PENTIUMPRO:
24317 case PROCESSOR_K6:
24318 return 1;
24319
24320 case PROCESSOR_CORE2_32:
24321 case PROCESSOR_CORE2_64:
24322 case PROCESSOR_COREI7_32:
24323 case PROCESSOR_COREI7_64:
24324 case PROCESSOR_ATOM:
24325 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24326 as many instructions can be executed on a cycle, i.e.,
24327 issue_rate. I wonder why tuning for many CPUs does not do this. */
24328 if (reload_completed)
24329 return ix86_issue_rate ();
24330 /* Don't use lookahead for pre-reload schedule to save compile time. */
24331 return 0;
24332
24333 default:
24334 return 0;
24335 }
24336 }
24337
24338 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24339 execution. The reordering is applied if
24340 (1) an IMUL instruction is on the top of the list, and
24341 (2) there is exactly one producer of an independent IMUL instruction
24342 in the ready list;
24343 in that case the producer found is moved to the top of the ready list.
24344 Returns issue rate. */
24345
24346 static int
24347 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24348 int clock_var ATTRIBUTE_UNUSED)
24349 {
24350 static int issue_rate = -1;
24351 int n_ready = *pn_ready;
24352 rtx insn, insn1, insn2;
24353 int i;
24354 sd_iterator_def sd_it;
24355 dep_t dep;
24356 int index = -1;
24357
24358 /* Set up issue rate. */
24359 issue_rate = ix86_issue_rate();
24360
24361 /* Do reordering for Atom only. */
24362 if (ix86_tune != PROCESSOR_ATOM)
24363 return issue_rate;
24364 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24365 if (!reload_completed)
24366 return issue_rate;
24367 /* Nothing to do if ready list contains only 1 instruction. */
24368 if (n_ready <= 1)
24369 return issue_rate;
24370
24371 /* Check that IMUL instruction is on the top of ready list. */
24372 insn = ready[n_ready - 1];
24373 if (!NONDEBUG_INSN_P (insn))
24374 return issue_rate;
24375 insn = PATTERN (insn);
24376 if (GET_CODE (insn) == PARALLEL)
24377 insn = XVECEXP (insn, 0, 0);
24378 if (GET_CODE (insn) != SET)
24379 return issue_rate;
24380 if (!(GET_CODE (SET_SRC (insn)) == MULT
24381 && GET_MODE (SET_SRC (insn)) == SImode))
24382 return issue_rate;
24383
24384 /* Search for producer of independent IMUL instruction. */
24385 for (i = n_ready - 2; i >= 0; i--)
24386 {
24387 insn = ready[i];
24388 if (!NONDEBUG_INSN_P (insn))
24389 continue;
24390 /* Skip IMUL instruction. */
24391 insn2 = PATTERN (insn);
24392 if (GET_CODE (insn2) == PARALLEL)
24393 insn2 = XVECEXP (insn2, 0, 0);
24394 if (GET_CODE (insn2) == SET
24395 && GET_CODE (SET_SRC (insn2)) == MULT
24396 && GET_MODE (SET_SRC (insn2)) == SImode)
24397 continue;
24398
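/* Look through the forward dependencies of this candidate for an
   SImode IMUL consumer. */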
24399 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24400 {
24401 rtx con;
24402 con = DEP_CON (dep);
24403 if (!NONDEBUG_INSN_P (con))
24404 continue;
24405 insn1 = PATTERN (con);
24406 if (GET_CODE (insn1) == PARALLEL)
24407 insn1 = XVECEXP (insn1, 0, 0);
24408
24409 if (GET_CODE (insn1) == SET
24410 && GET_CODE (SET_SRC (insn1)) == MULT
24411 && GET_MODE (SET_SRC (insn1)) == SImode)
24412 {
24413 sd_iterator_def sd_it1;
24414 dep_t dep1;
24415 /* Check if there is no other dependee for IMUL. */
24416 index = i;
24417 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24418 {
24419 rtx pro;
24420 pro = DEP_PRO (dep1);
24421 if (!NONDEBUG_INSN_P (pro))
24422 continue;
24423 if (pro != insn)
24424 index = -1;
24425 }
24426 if (index >= 0)
24427 break;
24428 }
24429 }
24430 if (index >= 0)
24431 break;
24432 }
24433 if (index < 0)
24434 return issue_rate; /* Didn't find IMUL producer. */
24435
24436 if (sched_verbose > 1)
24437 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24438 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24439
24440 /* Put IMUL producer (ready[index]) at the top of ready list. */
24441 insn1 = ready[index];
24442 for (i = index; i < n_ready - 1; i++)
24443 ready[i] = ready[i + 1];
24444 ready[n_ready - 1] = insn1;
24445
24446 return issue_rate;
24447 }
24448
24449 static bool
24450 ix86_class_likely_spilled_p (reg_class_t);
24451
24452 /* Return true if the lhs of INSN is a HW function argument register, and
24453 set IS_SPILLED to true if it is a likely-spilled HW register. */
24454 static bool
24455 insn_is_function_arg (rtx insn, bool* is_spilled)
24456 {
24457 rtx dst;
24458
24459 if (!NONDEBUG_INSN_P (insn))
24460 return false;
24461 /* Call instructions are not movable; ignore them. */
24462 if (CALL_P (insn))
24463 return false;
24464 insn = PATTERN (insn);
24465 if (GET_CODE (insn) == PARALLEL)
24466 insn = XVECEXP (insn, 0, 0);
24467 if (GET_CODE (insn) != SET)
24468 return false;
24469 dst = SET_DEST (insn);
24470 if (REG_P (dst) && HARD_REGISTER_P (dst)
24471 && ix86_function_arg_regno_p (REGNO (dst)))
24472 {
24473 /* Is it likely spilled HW register? */
24474 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24475 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24476 *is_spilled = true;
24477 return true;
24478 }
24479 return false;
24480 }
24481
24482 /* Add output dependencies for a chain of adjacent function arguments, but
24483 only if there is a move to a likely-spilled HW register. Return the first
24484 argument if at least one dependence was added, or NULL otherwise. */
24485 static rtx
24486 add_parameter_dependencies (rtx call, rtx head)
24487 {
24488 rtx insn;
24489 rtx last = call;
24490 rtx first_arg = NULL;
24491 bool is_spilled = false;
24492
24493 /* Find the argument-passing instruction nearest to the call. */
24494 while (true)
24495 {
24496 last = PREV_INSN (last);
24497 if (last == head)
24498 return NULL;
24499 if (!NONDEBUG_INSN_P (last))
24500 continue;
24501 if (insn_is_function_arg (last, &is_spilled))
24502 break;
24503 return NULL;
24504 }
24505
24506 first_arg = last;
24507 while (true)
24508 {
24509 insn = PREV_INSN (last);
24510 if (!INSN_P (insn))
24511 break;
24512 if (insn == head)
24513 break;
24514 if (!NONDEBUG_INSN_P (insn))
24515 {
24516 last = insn;
24517 continue;
24518 }
24519 if (insn_is_function_arg (insn, &is_spilled))
24520 {
24521 /* Add an output dependence between two function arguments if the chain
24522 of output arguments contains likely-spilled HW registers. */
24523 if (is_spilled)
24524 add_dependence (last, insn, REG_DEP_OUTPUT);
24525 first_arg = last = insn;
24526 }
24527 else
24528 break;
24529 }
24530 if (!is_spilled)
24531 return NULL;
24532 return first_arg;
24533 }
24534
24535 /* Add output or anti dependency from insn to first_arg to restrict its code
24536 motion. */
24537 static void
24538 avoid_func_arg_motion (rtx first_arg, rtx insn)
24539 {
24540 rtx set;
24541 rtx tmp;
24542
24543 set = single_set (insn);
24544 if (!set)
24545 return;
24546 tmp = SET_DEST (set);
24547 if (REG_P (tmp))
24548 {
24549 /* Add output dependency to the first function argument. */
24550 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24551 return;
24552 }
24553 /* Add anti dependency. */
24554 add_dependence (first_arg, insn, REG_DEP_ANTI);
24555 }
24556
24557 /* Avoid cross-block motion of a function argument by adding a dependency
24558 from the first non-jump instruction in BB. */
24559 static void
24560 add_dependee_for_func_arg (rtx arg, basic_block bb)
24561 {
24562 rtx insn = BB_END (bb);
24563
24564 while (insn)
24565 {
24566 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24567 {
24568 rtx set = single_set (insn);
24569 if (set)
24570 {
24571 avoid_func_arg_motion (arg, insn);
24572 return;
24573 }
24574 }
24575 if (insn == BB_HEAD (bb))
24576 return;
24577 insn = PREV_INSN (insn);
24578 }
24579 }
24580
24581 /* Hook for pre-reload schedule - avoid motion of function arguments
24582 passed in likely spilled HW registers. */
24583 static void
24584 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24585 {
24586 rtx insn;
24587 rtx first_arg = NULL;
24588 if (reload_completed)
24589 return;
24590 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24591 if (INSN_P (insn) && CALL_P (insn))
24592 {
24593 first_arg = add_parameter_dependencies (insn, head);
24594 if (first_arg)
24595 {
24596 /* Add a dependee for the first argument to predecessors, but only
24597 if the region contains more than one block. */
24598 basic_block bb = BLOCK_FOR_INSN (insn);
24599 int rgn = CONTAINING_RGN (bb->index);
24600 int nr_blks = RGN_NR_BLOCKS (rgn);
24601 /* Skip trivial regions and region head blocks that can have
24602 predecessors outside of region. */
24603 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24604 {
24605 edge e;
24606 edge_iterator ei;
24607 /* Assume that region is SCC, i.e. all immediate predecessors
24608 of non-head block are in the same region. */
24609 FOR_EACH_EDGE (e, ei, bb->preds)
24610 {
24611 /* Avoid creating loop-carried dependencies by using the
24612 topological ordering in the region. */
24613 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24614 add_dependee_for_func_arg (first_arg, e->src);
24615 }
24616 }
24617 insn = first_arg;
24618 }
24619 }
24620 else if (first_arg)
24621 avoid_func_arg_motion (first_arg, insn);
24622 }
24623
24624 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24625 HW registers to maximum, to schedule them as soon as possible. These are
24626 moves from function argument registers at the top of the function entry
24627 and moves from function return value registers after call. */
24628 static int
24629 ix86_adjust_priority (rtx insn, int priority)
24630 {
24631 rtx set;
24632
24633 if (reload_completed)
24634 return priority;
24635
24636 if (!NONDEBUG_INSN_P (insn))
24637 return priority;
24638
24639 set = single_set (insn);
24640 if (set)
24641 {
24642 rtx tmp = SET_SRC (set);
24643 if (REG_P (tmp)
24644 && HARD_REGISTER_P (tmp)
24645 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24646 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24647 return current_sched_info->sched_max_insns_priority;
24648 }
24649
24650 return priority;
24651 }
24652
24653 /* Model decoder of Core 2/i7.
24654 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24655 track the instruction fetch block boundaries and make sure that long
24656 (9+ bytes) instructions are assigned to D0. */
24657
24658 /* Maximum length of an insn that can be handled by
24659 a secondary decoder unit. '8' for Core 2/i7. */
24660 static int core2i7_secondary_decoder_max_insn_size;
24661
24662 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24663 '16' for Core 2/i7. */
24664 static int core2i7_ifetch_block_size;
24665
24666 /* Maximum number of instructions decoder can handle per cycle.
24667 '6' for Core 2/i7. */
24668 static int core2i7_ifetch_block_max_insns;
24669
24670 typedef struct ix86_first_cycle_multipass_data_ *
24671 ix86_first_cycle_multipass_data_t;
24672 typedef const struct ix86_first_cycle_multipass_data_ *
24673 const_ix86_first_cycle_multipass_data_t;
24674
24675 /* A variable to store target state across calls to max_issue within
24676 one cycle. */
24677 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24678 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24679
24680 /* Initialize DATA. */
24681 static void
24682 core2i7_first_cycle_multipass_init (void *_data)
24683 {
24684 ix86_first_cycle_multipass_data_t data
24685 = (ix86_first_cycle_multipass_data_t) _data;
24686
24687 data->ifetch_block_len = 0;
24688 data->ifetch_block_n_insns = 0;
24689 data->ready_try_change = NULL;
24690 data->ready_try_change_size = 0;
24691 }
24692
24693 /* Advancing the cycle; reset ifetch block counts. */
24694 static void
24695 core2i7_dfa_post_advance_cycle (void)
24696 {
24697 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24698
24699 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24700
24701 data->ifetch_block_len = 0;
24702 data->ifetch_block_n_insns = 0;
24703 }
24704
24705 static int min_insn_size (rtx);
24706
24707 /* Filter out insns from ready_try that the core will not be able to issue
24708 on current cycle due to decoder. */
24709 static void
24710 core2i7_first_cycle_multipass_filter_ready_try
24711 (const_ix86_first_cycle_multipass_data_t data,
24712 char *ready_try, int n_ready, bool first_cycle_insn_p)
24713 {
24714 while (n_ready--)
24715 {
24716 rtx insn;
24717 int insn_size;
24718
24719 if (ready_try[n_ready])
24720 continue;
24721
24722 insn = get_ready_element (n_ready);
24723 insn_size = min_insn_size (insn);
24724
24725 if (/* If this is too long an insn for a secondary decoder ... */
24726 (!first_cycle_insn_p
24727 && insn_size > core2i7_secondary_decoder_max_insn_size)
24728 /* ... or it would not fit into the ifetch block ... */
24729 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24730 /* ... or the decoder is full already ... */
24731 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24732 /* ... mask the insn out. */
24733 {
24734 ready_try[n_ready] = 1;
24735
24736 if (data->ready_try_change)
24737 SET_BIT (data->ready_try_change, n_ready);
24738 }
24739 }
24740 }
24741
24742 /* Prepare for a new round of multipass lookahead scheduling. */
24743 static void
24744 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24745 bool first_cycle_insn_p)
24746 {
24747 ix86_first_cycle_multipass_data_t data
24748 = (ix86_first_cycle_multipass_data_t) _data;
24749 const_ix86_first_cycle_multipass_data_t prev_data
24750 = ix86_first_cycle_multipass_data;
24751
24752 /* Restore the state from the end of the previous round. */
24753 data->ifetch_block_len = prev_data->ifetch_block_len;
24754 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24755
24756 /* Filter instructions that cannot be issued on current cycle due to
24757 decoder restrictions. */
24758 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24759 first_cycle_insn_p);
24760 }
24761
24762 /* INSN is being issued in current solution. Account for its impact on
24763 the decoder model. */
24764 static void
24765 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24766 rtx insn, const void *_prev_data)
24767 {
24768 ix86_first_cycle_multipass_data_t data
24769 = (ix86_first_cycle_multipass_data_t) _data;
24770 const_ix86_first_cycle_multipass_data_t prev_data
24771 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24772
24773 int insn_size = min_insn_size (insn);
24774
24775 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24776 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24777 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24778 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24779
24780 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24781 if (!data->ready_try_change)
24782 {
24783 data->ready_try_change = sbitmap_alloc (n_ready);
24784 data->ready_try_change_size = n_ready;
24785 }
24786 else if (data->ready_try_change_size < n_ready)
24787 {
24788 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24789 n_ready, 0);
24790 data->ready_try_change_size = n_ready;
24791 }
24792 sbitmap_zero (data->ready_try_change);
24793
24794 /* Filter out insns from ready_try that the core will not be able to issue
24795 on current cycle due to decoder. */
24796 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24797 false);
24798 }
24799
24800 /* Revert the effect on ready_try. */
24801 static void
24802 core2i7_first_cycle_multipass_backtrack (const void *_data,
24803 char *ready_try,
24804 int n_ready ATTRIBUTE_UNUSED)
24805 {
24806 const_ix86_first_cycle_multipass_data_t data
24807 = (const_ix86_first_cycle_multipass_data_t) _data;
24808 unsigned int i = 0;
24809 sbitmap_iterator sbi;
24810
24811 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24812 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24813 {
24814 ready_try[i] = 0;
24815 }
24816 }
24817
24818 /* Save the result of multipass lookahead scheduling for the next round. */
24819 static void
24820 core2i7_first_cycle_multipass_end (const void *_data)
24821 {
24822 const_ix86_first_cycle_multipass_data_t data
24823 = (const_ix86_first_cycle_multipass_data_t) _data;
24824 ix86_first_cycle_multipass_data_t next_data
24825 = ix86_first_cycle_multipass_data;
24826
24827 if (data != NULL)
24828 {
24829 next_data->ifetch_block_len = data->ifetch_block_len;
24830 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24831 }
24832 }
24833
24834 /* Deallocate target data. */
24835 static void
24836 core2i7_first_cycle_multipass_fini (void *_data)
24837 {
24838 ix86_first_cycle_multipass_data_t data
24839 = (ix86_first_cycle_multipass_data_t) _data;
24840
24841 if (data->ready_try_change)
24842 {
24843 sbitmap_free (data->ready_try_change);
24844 data->ready_try_change = NULL;
24845 data->ready_try_change_size = 0;
24846 }
24847 }
24848
24849 /* Prepare for scheduling pass. */
24850 static void
24851 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24852 int verbose ATTRIBUTE_UNUSED,
24853 int max_uid ATTRIBUTE_UNUSED)
24854 {
24855 /* Install scheduling hooks for current CPU. Some of these hooks are used
24856 in time-critical parts of the scheduler, so we only set them up when
24857 they are actually used. */
24858 switch (ix86_tune)
24859 {
24860 case PROCESSOR_CORE2_32:
24861 case PROCESSOR_CORE2_64:
24862 case PROCESSOR_COREI7_32:
24863 case PROCESSOR_COREI7_64:
24864 /* Do not perform multipass scheduling for pre-reload schedule
24865 to save compile time. */
24866 if (reload_completed)
24867 {
24868 targetm.sched.dfa_post_advance_cycle
24869 = core2i7_dfa_post_advance_cycle;
24870 targetm.sched.first_cycle_multipass_init
24871 = core2i7_first_cycle_multipass_init;
24872 targetm.sched.first_cycle_multipass_begin
24873 = core2i7_first_cycle_multipass_begin;
24874 targetm.sched.first_cycle_multipass_issue
24875 = core2i7_first_cycle_multipass_issue;
24876 targetm.sched.first_cycle_multipass_backtrack
24877 = core2i7_first_cycle_multipass_backtrack;
24878 targetm.sched.first_cycle_multipass_end
24879 = core2i7_first_cycle_multipass_end;
24880 targetm.sched.first_cycle_multipass_fini
24881 = core2i7_first_cycle_multipass_fini;
24882
24883 /* Set decoder parameters. */
24884 core2i7_secondary_decoder_max_insn_size = 8;
24885 core2i7_ifetch_block_size = 16;
24886 core2i7_ifetch_block_max_insns = 6;
24887 break;
24888 }
24889 /* ... Fall through ... */
24890 default:
24891 targetm.sched.dfa_post_advance_cycle = NULL;
24892 targetm.sched.first_cycle_multipass_init = NULL;
24893 targetm.sched.first_cycle_multipass_begin = NULL;
24894 targetm.sched.first_cycle_multipass_issue = NULL;
24895 targetm.sched.first_cycle_multipass_backtrack = NULL;
24896 targetm.sched.first_cycle_multipass_end = NULL;
24897 targetm.sched.first_cycle_multipass_fini = NULL;
24898 break;
24899 }
24900 }
24901
24902 \f
24903 /* Compute the alignment given to a constant that is being placed in memory.
24904 EXP is the constant and ALIGN is the alignment that the object would
24905 ordinarily have.
24906 The value of this function is used instead of that alignment to align
24907 the object. */
24908
24909 int
24910 ix86_constant_alignment (tree exp, int align)
24911 {
24912 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24913 || TREE_CODE (exp) == INTEGER_CST)
24914 {
24915 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24916 return 64;
24917 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24918 return 128;
24919 }
24920 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24921 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24922 return BITS_PER_WORD;
24923
24924 return align;
24925 }
24926
24927 /* Compute the alignment for a static variable.
24928 TYPE is the data type, and ALIGN is the alignment that
24929 the object would ordinarily have. The value of this function is used
24930 instead of that alignment to align the object. */
24931
24932 int
24933 ix86_data_alignment (tree type, int align)
24934 {
24935 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24936
24937 if (AGGREGATE_TYPE_P (type)
24938 && TYPE_SIZE (type)
24939 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24940 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24941 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24942 && align < max_align)
24943 align = max_align;
24944
24945 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
24946 to 16byte boundary. */
24947 if (TARGET_64BIT)
24948 {
24949 if (AGGREGATE_TYPE_P (type)
24950 && TYPE_SIZE (type)
24951 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24952 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24953 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24954 return 128;
24955 }
24956
24957 if (TREE_CODE (type) == ARRAY_TYPE)
24958 {
24959 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24960 return 64;
24961 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24962 return 128;
24963 }
24964 else if (TREE_CODE (type) == COMPLEX_TYPE)
24965 {
24966
24967 if (TYPE_MODE (type) == DCmode && align < 64)
24968 return 64;
24969 if ((TYPE_MODE (type) == XCmode
24970 || TYPE_MODE (type) == TCmode) && align < 128)
24971 return 128;
24972 }
24973 else if ((TREE_CODE (type) == RECORD_TYPE
24974 || TREE_CODE (type) == UNION_TYPE
24975 || TREE_CODE (type) == QUAL_UNION_TYPE)
24976 && TYPE_FIELDS (type))
24977 {
24978 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24979 return 64;
24980 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24981 return 128;
24982 }
24983 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24984 || TREE_CODE (type) == INTEGER_TYPE)
24985 {
24986 if (TYPE_MODE (type) == DFmode && align < 64)
24987 return 64;
24988 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24989 return 128;
24990 }
24991
24992 return align;
24993 }
24994
24995 /* Compute the alignment for a local variable or a stack slot. EXP is
24996 the data type or decl itself, MODE is the widest mode available and
24997 ALIGN is the alignment that the object would ordinarily have. The
24998 value of this macro is used instead of that alignment to align the
24999 object. */
25000
25001 unsigned int
25002 ix86_local_alignment (tree exp, enum machine_mode mode,
25003 unsigned int align)
25004 {
25005 tree type, decl;
25006
25007 if (exp && DECL_P (exp))
25008 {
25009 type = TREE_TYPE (exp);
25010 decl = exp;
25011 }
25012 else
25013 {
25014 type = exp;
25015 decl = NULL;
25016 }
25017
25018 /* Don't do dynamic stack realignment for long long objects with
25019 -mpreferred-stack-boundary=2. */
25020 if (!TARGET_64BIT
25021 && align == 64
25022 && ix86_preferred_stack_boundary < 64
25023 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25024 && (!type || !TYPE_USER_ALIGN (type))
25025 && (!decl || !DECL_USER_ALIGN (decl)))
25026 align = 32;
25027
25028 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25029 register in MODE. We will return the largest alignment of XF
25030 and DF. */
25031 if (!type)
25032 {
25033 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25034 align = GET_MODE_ALIGNMENT (DFmode);
25035 return align;
25036 }
25037
25038 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25039 to 16byte boundary. Exact wording is:
25040
25041 An array uses the same alignment as its elements, except that a local or
25042 global array variable of length at least 16 bytes or
25043 a C99 variable-length array variable always has alignment of at least 16 bytes.
25044
25045 This was added to allow use of aligned SSE instructions on arrays. The
25046 rule is meant for static storage (where the compiler cannot do the analysis
25047 by itself). We follow it for automatic variables only when convenient:
25048 we fully control everything in the function being compiled, and functions
25049 from other units cannot rely on the alignment.
25050
25051 Exclude the va_list type. It is the common case of a local array where
25052 we cannot benefit from the alignment. */
25053 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25054 && TARGET_SSE)
25055 {
25056 if (AGGREGATE_TYPE_P (type)
25057 && (va_list_type_node == NULL_TREE
25058 || (TYPE_MAIN_VARIANT (type)
25059 != TYPE_MAIN_VARIANT (va_list_type_node)))
25060 && TYPE_SIZE (type)
25061 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25062 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25063 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25064 return 128;
25065 }
25066 if (TREE_CODE (type) == ARRAY_TYPE)
25067 {
25068 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25069 return 64;
25070 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25071 return 128;
25072 }
25073 else if (TREE_CODE (type) == COMPLEX_TYPE)
25074 {
25075 if (TYPE_MODE (type) == DCmode && align < 64)
25076 return 64;
25077 if ((TYPE_MODE (type) == XCmode
25078 || TYPE_MODE (type) == TCmode) && align < 128)
25079 return 128;
25080 }
25081 else if ((TREE_CODE (type) == RECORD_TYPE
25082 || TREE_CODE (type) == UNION_TYPE
25083 || TREE_CODE (type) == QUAL_UNION_TYPE)
25084 && TYPE_FIELDS (type))
25085 {
25086 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25087 return 64;
25088 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25089 return 128;
25090 }
25091 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25092 || TREE_CODE (type) == INTEGER_TYPE)
25093 {
25094
25095 if (TYPE_MODE (type) == DFmode && align < 64)
25096 return 64;
25097 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25098 return 128;
25099 }
25100 return align;
25101 }
25102
25103 /* Compute the minimum required alignment for dynamic stack realignment
25104 purposes for a local variable, parameter or a stack slot. EXP is
25105 the data type or decl itself, MODE is its mode and ALIGN is the
25106 alignment that the object would ordinarily have. */
25107
25108 unsigned int
25109 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25110 unsigned int align)
25111 {
25112 tree type, decl;
25113
25114 if (exp && DECL_P (exp))
25115 {
25116 type = TREE_TYPE (exp);
25117 decl = exp;
25118 }
25119 else
25120 {
25121 type = exp;
25122 decl = NULL;
25123 }
25124
25125 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25126 return align;
25127
25128 /* Don't do dynamic stack realignment for long long objects with
25129 -mpreferred-stack-boundary=2. */
25130 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25131 && (!type || !TYPE_USER_ALIGN (type))
25132 && (!decl || !DECL_USER_ALIGN (decl)))
25133 return 32;
25134
25135 return align;
25136 }
25137 \f
25138 /* Find a location for the static chain incoming to a nested function.
25139 This is a register, unless all free registers are used by arguments. */
25140
25141 static rtx
25142 ix86_static_chain (const_tree fndecl, bool incoming_p)
25143 {
25144 unsigned regno;
25145
25146 if (!DECL_STATIC_CHAIN (fndecl))
25147 return NULL;
25148
25149 if (TARGET_64BIT)
25150 {
25151 /* We always use R10 in 64-bit mode. */
25152 regno = R10_REG;
25153 }
25154 else
25155 {
25156 tree fntype;
25157 unsigned int ccvt;
25158
25159 /* By default in 32-bit mode we use ECX to pass the static chain. */
25160 regno = CX_REG;
25161
25162 fntype = TREE_TYPE (fndecl);
25163 ccvt = ix86_get_callcvt (fntype);
25164 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
25165 {
25166 /* Fastcall functions use ecx/edx for arguments, which leaves
25167 us with EAX for the static chain.
25168 Thiscall functions use ecx for arguments, which also
25169 leaves us with EAX for the static chain. */
25170 regno = AX_REG;
25171 }
25172 else if (ix86_function_regparm (fntype, fndecl) == 3)
25173 {
25174 /* For regparm 3, we have no free call-clobbered registers in
25175 which to store the static chain. In order to implement this,
25176 we have the trampoline push the static chain to the stack.
25177 However, we can't push a value below the return address when
25178 we call the nested function directly, so we have to use an
25179 alternate entry point. For this we use ESI, and have the
25180 alternate entry point push ESI, so that things appear the
25181 same once we're executing the nested function. */
25182 if (incoming_p)
25183 {
25184 if (fndecl == current_function_decl)
25185 ix86_static_chain_on_stack = true;
25186 return gen_frame_mem (SImode,
25187 plus_constant (Pmode,
25188 arg_pointer_rtx, -8));
25189 }
25190 regno = SI_REG;
25191 }
25192 }
25193
25194 return gen_rtx_REG (Pmode, regno);
25195 }
25196
25197 /* Emit RTL insns to initialize the variable parts of a trampoline.
25198 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25199 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25200 to be passed to the target function. */
25201
25202 static void
25203 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25204 {
25205 rtx mem, fnaddr;
25206 int opcode;
25207 int offset = 0;
25208
25209 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25210
25211 if (TARGET_64BIT)
25212 {
25213 int size;
25214
25215 /* Load the function address to r11. Try to load address using
25216 the shorter movl instead of movabs. We may want to support
25217 movq for kernel mode, but kernel does not use trampolines at
25218 the moment. FNADDR is a 32bit address and may not be in
25219 DImode when ptr_mode == SImode. Always use movl in this
25220 case. */
25221 if (ptr_mode == SImode
25222 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25223 {
25224 fnaddr = copy_addr_to_reg (fnaddr);
25225
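/* movl $fnaddr, %r11d: the 0xbb41 halfword stores the bytes 0x41 0xbb,
   followed by the 32-bit address. */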
25226 mem = adjust_address (m_tramp, HImode, offset);
25227 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25228
25229 mem = adjust_address (m_tramp, SImode, offset + 2);
25230 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25231 offset += 6;
25232 }
25233 else
25234 {
25235 mem = adjust_address (m_tramp, HImode, offset);
25236 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25237
25238 mem = adjust_address (m_tramp, DImode, offset + 2);
25239 emit_move_insn (mem, fnaddr);
25240 offset += 10;
25241 }
25242
25243 /* Load static chain using movabs to r10. Use the shorter movl
25244 instead of movabs when ptr_mode == SImode. */
25245 if (ptr_mode == SImode)
25246 {
25247 opcode = 0xba41;
25248 size = 6;
25249 }
25250 else
25251 {
25252 opcode = 0xba49;
25253 size = 10;
25254 }
25255
25256 mem = adjust_address (m_tramp, HImode, offset);
25257 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25258
25259 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25260 emit_move_insn (mem, chain_value);
25261 offset += size;
25262
25263 /* Jump to r11; the last (unused) byte is a nop, only there to
25264 pad the write out to a single 32-bit store. */
25265 mem = adjust_address (m_tramp, SImode, offset);
25266 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25267 offset += 4;
25268 }
25269 else
25270 {
25271 rtx disp, chain;
25272
25273 /* Depending on the static chain location, either load a register
25274 with a constant, or push the constant to the stack. All of the
25275 instructions are the same size. */
25276 chain = ix86_static_chain (fndecl, true);
25277 if (REG_P (chain))
25278 {
25279 switch (REGNO (chain))
25280 {
25281 case AX_REG:
25282 opcode = 0xb8; break;
25283 case CX_REG:
25284 opcode = 0xb9; break;
25285 default:
25286 gcc_unreachable ();
25287 }
25288 }
25289 else
25290 opcode = 0x68;
25291
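/* Write the mov/push opcode, then the 32-bit static chain value; an
   0xe9 (jmp rel32) and its displacement to the target follow. */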
25292 mem = adjust_address (m_tramp, QImode, offset);
25293 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25294
25295 mem = adjust_address (m_tramp, SImode, offset + 1);
25296 emit_move_insn (mem, chain_value);
25297 offset += 5;
25298
25299 mem = adjust_address (m_tramp, QImode, offset);
25300 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25301
25302 mem = adjust_address (m_tramp, SImode, offset + 1);
25303
25304 /* Compute offset from the end of the jmp to the target function.
25305 In the case in which the trampoline stores the static chain on
25306 the stack, we need to skip the first insn which pushes the
25307 (call-saved) register static chain; this push is 1 byte. */
25308 offset += 5;
25309 disp = expand_binop (SImode, sub_optab, fnaddr,
25310 plus_constant (Pmode, XEXP (m_tramp, 0),
25311 offset - (MEM_P (chain) ? 1 : 0)),
25312 NULL_RTX, 1, OPTAB_DIRECT);
25313 emit_move_insn (mem, disp);
25314 }
25315
25316 gcc_assert (offset <= TRAMPOLINE_SIZE);
25317
25318 #ifdef HAVE_ENABLE_EXECUTE_STACK
25319 #ifdef CHECK_EXECUTE_STACK_ENABLED
25320 if (CHECK_EXECUTE_STACK_ENABLED)
25321 #endif
25322 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25323 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25324 #endif
25325 }
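/* For illustration only (a sketch added by the editor, not part of the
   original file): in the common TARGET_64BIT, ptr_mode == DImode case the
   stores above produce this 24-byte trampoline (opcode bytes shown in the
   little-endian order in which they are written):

     49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
     49 ba <8-byte chain>     movabs $chain,  %r10
     49 ff e3 90              jmp *%r11 ; trailing nop pads the 32-bit store

   When ptr_mode == SImode the two movabs forms are replaced by the 6-byte
   "41 bb <imm32>" / "41 ba <imm32>" movl encodings (16 bytes total), and
   the !TARGET_64BIT path instead emits a 5-byte mov-to-%eax/%ecx or push
   followed by a 5-byte jmp rel32.  */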
25326 \f
25327 /* The following file contains several enumerations and data structures
25328 built from the definitions in i386-builtin-types.def. */
25329
25330 #include "i386-builtin-types.inc"
25331
25332 /* Table for the ix86 builtin non-function types. */
25333 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25334
25335 /* Retrieve an element from the above table, building some of
25336 the types lazily. */
25337
25338 static tree
25339 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25340 {
25341 unsigned int index;
25342 tree type, itype;
25343
25344 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25345
25346 type = ix86_builtin_type_tab[(int) tcode];
25347 if (type != NULL)
25348 return type;
25349
25350 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25351 if (tcode <= IX86_BT_LAST_VECT)
25352 {
25353 enum machine_mode mode;
25354
25355 index = tcode - IX86_BT_LAST_PRIM - 1;
25356 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25357 mode = ix86_builtin_type_vect_mode[index];
25358
25359 type = build_vector_type_for_mode (itype, mode);
25360 }
25361 else
25362 {
25363 int quals;
25364
25365 index = tcode - IX86_BT_LAST_VECT - 1;
25366 if (tcode <= IX86_BT_LAST_PTR)
25367 quals = TYPE_UNQUALIFIED;
25368 else
25369 quals = TYPE_QUAL_CONST;
25370
25371 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25372 if (quals != TYPE_UNQUALIFIED)
25373 itype = build_qualified_type (itype, quals);
25374
25375 type = build_pointer_type (itype);
25376 }
25377
25378 ix86_builtin_type_tab[(int) tcode] = type;
25379 return type;
25380 }
25381
25382 /* Table for the ix86 builtin function types. */
25383 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25384
25385 /* Retrieve an element from the above table, building some of
25386 the types lazily. */
25387
25388 static tree
25389 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25390 {
25391 tree type;
25392
25393 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25394
25395 type = ix86_builtin_func_type_tab[(int) tcode];
25396 if (type != NULL)
25397 return type;
25398
25399 if (tcode <= IX86_BT_LAST_FUNC)
25400 {
25401 unsigned start = ix86_builtin_func_start[(int) tcode];
25402 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25403 tree rtype, atype, args = void_list_node;
25404 unsigned i;
25405
25406 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25407 for (i = after - 1; i > start; --i)
25408 {
25409 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25410 args = tree_cons (NULL, atype, args);
25411 }
25412
25413 type = build_function_type (rtype, args);
25414 }
25415 else
25416 {
25417 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25418 enum ix86_builtin_func_type icode;
25419
25420 icode = ix86_builtin_func_alias_base[index];
25421 type = ix86_get_builtin_func_type (icode);
25422 }
25423
25424 ix86_builtin_func_type_tab[(int) tcode] = type;
25425 return type;
25426 }
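/* Illustrative note (added by the editor, not from the original sources):
   each record in the generated ix86_builtin_func_args table lists the
   return type first, followed by the argument types.  For a hypothetical
   prototype V4SF (*) (V4SF, V4SF), the loop above walks the argument slots
   from the last down to the first, consing onto void_list_node, so the
   final TREE_LIST presents the arguments in left-to-right order when
   build_function_type is called.  */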
25427
25428
25429 /* Codes for all the SSE/MMX builtins. */
25430 enum ix86_builtins
25431 {
25432 IX86_BUILTIN_ADDPS,
25433 IX86_BUILTIN_ADDSS,
25434 IX86_BUILTIN_DIVPS,
25435 IX86_BUILTIN_DIVSS,
25436 IX86_BUILTIN_MULPS,
25437 IX86_BUILTIN_MULSS,
25438 IX86_BUILTIN_SUBPS,
25439 IX86_BUILTIN_SUBSS,
25440
25441 IX86_BUILTIN_CMPEQPS,
25442 IX86_BUILTIN_CMPLTPS,
25443 IX86_BUILTIN_CMPLEPS,
25444 IX86_BUILTIN_CMPGTPS,
25445 IX86_BUILTIN_CMPGEPS,
25446 IX86_BUILTIN_CMPNEQPS,
25447 IX86_BUILTIN_CMPNLTPS,
25448 IX86_BUILTIN_CMPNLEPS,
25449 IX86_BUILTIN_CMPNGTPS,
25450 IX86_BUILTIN_CMPNGEPS,
25451 IX86_BUILTIN_CMPORDPS,
25452 IX86_BUILTIN_CMPUNORDPS,
25453 IX86_BUILTIN_CMPEQSS,
25454 IX86_BUILTIN_CMPLTSS,
25455 IX86_BUILTIN_CMPLESS,
25456 IX86_BUILTIN_CMPNEQSS,
25457 IX86_BUILTIN_CMPNLTSS,
25458 IX86_BUILTIN_CMPNLESS,
25459 IX86_BUILTIN_CMPNGTSS,
25460 IX86_BUILTIN_CMPNGESS,
25461 IX86_BUILTIN_CMPORDSS,
25462 IX86_BUILTIN_CMPUNORDSS,
25463
25464 IX86_BUILTIN_COMIEQSS,
25465 IX86_BUILTIN_COMILTSS,
25466 IX86_BUILTIN_COMILESS,
25467 IX86_BUILTIN_COMIGTSS,
25468 IX86_BUILTIN_COMIGESS,
25469 IX86_BUILTIN_COMINEQSS,
25470 IX86_BUILTIN_UCOMIEQSS,
25471 IX86_BUILTIN_UCOMILTSS,
25472 IX86_BUILTIN_UCOMILESS,
25473 IX86_BUILTIN_UCOMIGTSS,
25474 IX86_BUILTIN_UCOMIGESS,
25475 IX86_BUILTIN_UCOMINEQSS,
25476
25477 IX86_BUILTIN_CVTPI2PS,
25478 IX86_BUILTIN_CVTPS2PI,
25479 IX86_BUILTIN_CVTSI2SS,
25480 IX86_BUILTIN_CVTSI642SS,
25481 IX86_BUILTIN_CVTSS2SI,
25482 IX86_BUILTIN_CVTSS2SI64,
25483 IX86_BUILTIN_CVTTPS2PI,
25484 IX86_BUILTIN_CVTTSS2SI,
25485 IX86_BUILTIN_CVTTSS2SI64,
25486
25487 IX86_BUILTIN_MAXPS,
25488 IX86_BUILTIN_MAXSS,
25489 IX86_BUILTIN_MINPS,
25490 IX86_BUILTIN_MINSS,
25491
25492 IX86_BUILTIN_LOADUPS,
25493 IX86_BUILTIN_STOREUPS,
25494 IX86_BUILTIN_MOVSS,
25495
25496 IX86_BUILTIN_MOVHLPS,
25497 IX86_BUILTIN_MOVLHPS,
25498 IX86_BUILTIN_LOADHPS,
25499 IX86_BUILTIN_LOADLPS,
25500 IX86_BUILTIN_STOREHPS,
25501 IX86_BUILTIN_STORELPS,
25502
25503 IX86_BUILTIN_MASKMOVQ,
25504 IX86_BUILTIN_MOVMSKPS,
25505 IX86_BUILTIN_PMOVMSKB,
25506
25507 IX86_BUILTIN_MOVNTPS,
25508 IX86_BUILTIN_MOVNTQ,
25509
25510 IX86_BUILTIN_LOADDQU,
25511 IX86_BUILTIN_STOREDQU,
25512
25513 IX86_BUILTIN_PACKSSWB,
25514 IX86_BUILTIN_PACKSSDW,
25515 IX86_BUILTIN_PACKUSWB,
25516
25517 IX86_BUILTIN_PADDB,
25518 IX86_BUILTIN_PADDW,
25519 IX86_BUILTIN_PADDD,
25520 IX86_BUILTIN_PADDQ,
25521 IX86_BUILTIN_PADDSB,
25522 IX86_BUILTIN_PADDSW,
25523 IX86_BUILTIN_PADDUSB,
25524 IX86_BUILTIN_PADDUSW,
25525 IX86_BUILTIN_PSUBB,
25526 IX86_BUILTIN_PSUBW,
25527 IX86_BUILTIN_PSUBD,
25528 IX86_BUILTIN_PSUBQ,
25529 IX86_BUILTIN_PSUBSB,
25530 IX86_BUILTIN_PSUBSW,
25531 IX86_BUILTIN_PSUBUSB,
25532 IX86_BUILTIN_PSUBUSW,
25533
25534 IX86_BUILTIN_PAND,
25535 IX86_BUILTIN_PANDN,
25536 IX86_BUILTIN_POR,
25537 IX86_BUILTIN_PXOR,
25538
25539 IX86_BUILTIN_PAVGB,
25540 IX86_BUILTIN_PAVGW,
25541
25542 IX86_BUILTIN_PCMPEQB,
25543 IX86_BUILTIN_PCMPEQW,
25544 IX86_BUILTIN_PCMPEQD,
25545 IX86_BUILTIN_PCMPGTB,
25546 IX86_BUILTIN_PCMPGTW,
25547 IX86_BUILTIN_PCMPGTD,
25548
25549 IX86_BUILTIN_PMADDWD,
25550
25551 IX86_BUILTIN_PMAXSW,
25552 IX86_BUILTIN_PMAXUB,
25553 IX86_BUILTIN_PMINSW,
25554 IX86_BUILTIN_PMINUB,
25555
25556 IX86_BUILTIN_PMULHUW,
25557 IX86_BUILTIN_PMULHW,
25558 IX86_BUILTIN_PMULLW,
25559
25560 IX86_BUILTIN_PSADBW,
25561 IX86_BUILTIN_PSHUFW,
25562
25563 IX86_BUILTIN_PSLLW,
25564 IX86_BUILTIN_PSLLD,
25565 IX86_BUILTIN_PSLLQ,
25566 IX86_BUILTIN_PSRAW,
25567 IX86_BUILTIN_PSRAD,
25568 IX86_BUILTIN_PSRLW,
25569 IX86_BUILTIN_PSRLD,
25570 IX86_BUILTIN_PSRLQ,
25571 IX86_BUILTIN_PSLLWI,
25572 IX86_BUILTIN_PSLLDI,
25573 IX86_BUILTIN_PSLLQI,
25574 IX86_BUILTIN_PSRAWI,
25575 IX86_BUILTIN_PSRADI,
25576 IX86_BUILTIN_PSRLWI,
25577 IX86_BUILTIN_PSRLDI,
25578 IX86_BUILTIN_PSRLQI,
25579
25580 IX86_BUILTIN_PUNPCKHBW,
25581 IX86_BUILTIN_PUNPCKHWD,
25582 IX86_BUILTIN_PUNPCKHDQ,
25583 IX86_BUILTIN_PUNPCKLBW,
25584 IX86_BUILTIN_PUNPCKLWD,
25585 IX86_BUILTIN_PUNPCKLDQ,
25586
25587 IX86_BUILTIN_SHUFPS,
25588
25589 IX86_BUILTIN_RCPPS,
25590 IX86_BUILTIN_RCPSS,
25591 IX86_BUILTIN_RSQRTPS,
25592 IX86_BUILTIN_RSQRTPS_NR,
25593 IX86_BUILTIN_RSQRTSS,
25594 IX86_BUILTIN_RSQRTF,
25595 IX86_BUILTIN_SQRTPS,
25596 IX86_BUILTIN_SQRTPS_NR,
25597 IX86_BUILTIN_SQRTSS,
25598
25599 IX86_BUILTIN_UNPCKHPS,
25600 IX86_BUILTIN_UNPCKLPS,
25601
25602 IX86_BUILTIN_ANDPS,
25603 IX86_BUILTIN_ANDNPS,
25604 IX86_BUILTIN_ORPS,
25605 IX86_BUILTIN_XORPS,
25606
25607 IX86_BUILTIN_EMMS,
25608 IX86_BUILTIN_LDMXCSR,
25609 IX86_BUILTIN_STMXCSR,
25610 IX86_BUILTIN_SFENCE,
25611
25612 IX86_BUILTIN_FXSAVE,
25613 IX86_BUILTIN_FXRSTOR,
25614 IX86_BUILTIN_FXSAVE64,
25615 IX86_BUILTIN_FXRSTOR64,
25616
25617 IX86_BUILTIN_XSAVE,
25618 IX86_BUILTIN_XRSTOR,
25619 IX86_BUILTIN_XSAVE64,
25620 IX86_BUILTIN_XRSTOR64,
25621
25622 IX86_BUILTIN_XSAVEOPT,
25623 IX86_BUILTIN_XSAVEOPT64,
25624
25625 /* 3DNow! Original */
25626 IX86_BUILTIN_FEMMS,
25627 IX86_BUILTIN_PAVGUSB,
25628 IX86_BUILTIN_PF2ID,
25629 IX86_BUILTIN_PFACC,
25630 IX86_BUILTIN_PFADD,
25631 IX86_BUILTIN_PFCMPEQ,
25632 IX86_BUILTIN_PFCMPGE,
25633 IX86_BUILTIN_PFCMPGT,
25634 IX86_BUILTIN_PFMAX,
25635 IX86_BUILTIN_PFMIN,
25636 IX86_BUILTIN_PFMUL,
25637 IX86_BUILTIN_PFRCP,
25638 IX86_BUILTIN_PFRCPIT1,
25639 IX86_BUILTIN_PFRCPIT2,
25640 IX86_BUILTIN_PFRSQIT1,
25641 IX86_BUILTIN_PFRSQRT,
25642 IX86_BUILTIN_PFSUB,
25643 IX86_BUILTIN_PFSUBR,
25644 IX86_BUILTIN_PI2FD,
25645 IX86_BUILTIN_PMULHRW,
25646
25647 /* 3DNow! Athlon Extensions */
25648 IX86_BUILTIN_PF2IW,
25649 IX86_BUILTIN_PFNACC,
25650 IX86_BUILTIN_PFPNACC,
25651 IX86_BUILTIN_PI2FW,
25652 IX86_BUILTIN_PSWAPDSI,
25653 IX86_BUILTIN_PSWAPDSF,
25654
25655 /* SSE2 */
25656 IX86_BUILTIN_ADDPD,
25657 IX86_BUILTIN_ADDSD,
25658 IX86_BUILTIN_DIVPD,
25659 IX86_BUILTIN_DIVSD,
25660 IX86_BUILTIN_MULPD,
25661 IX86_BUILTIN_MULSD,
25662 IX86_BUILTIN_SUBPD,
25663 IX86_BUILTIN_SUBSD,
25664
25665 IX86_BUILTIN_CMPEQPD,
25666 IX86_BUILTIN_CMPLTPD,
25667 IX86_BUILTIN_CMPLEPD,
25668 IX86_BUILTIN_CMPGTPD,
25669 IX86_BUILTIN_CMPGEPD,
25670 IX86_BUILTIN_CMPNEQPD,
25671 IX86_BUILTIN_CMPNLTPD,
25672 IX86_BUILTIN_CMPNLEPD,
25673 IX86_BUILTIN_CMPNGTPD,
25674 IX86_BUILTIN_CMPNGEPD,
25675 IX86_BUILTIN_CMPORDPD,
25676 IX86_BUILTIN_CMPUNORDPD,
25677 IX86_BUILTIN_CMPEQSD,
25678 IX86_BUILTIN_CMPLTSD,
25679 IX86_BUILTIN_CMPLESD,
25680 IX86_BUILTIN_CMPNEQSD,
25681 IX86_BUILTIN_CMPNLTSD,
25682 IX86_BUILTIN_CMPNLESD,
25683 IX86_BUILTIN_CMPORDSD,
25684 IX86_BUILTIN_CMPUNORDSD,
25685
25686 IX86_BUILTIN_COMIEQSD,
25687 IX86_BUILTIN_COMILTSD,
25688 IX86_BUILTIN_COMILESD,
25689 IX86_BUILTIN_COMIGTSD,
25690 IX86_BUILTIN_COMIGESD,
25691 IX86_BUILTIN_COMINEQSD,
25692 IX86_BUILTIN_UCOMIEQSD,
25693 IX86_BUILTIN_UCOMILTSD,
25694 IX86_BUILTIN_UCOMILESD,
25695 IX86_BUILTIN_UCOMIGTSD,
25696 IX86_BUILTIN_UCOMIGESD,
25697 IX86_BUILTIN_UCOMINEQSD,
25698
25699 IX86_BUILTIN_MAXPD,
25700 IX86_BUILTIN_MAXSD,
25701 IX86_BUILTIN_MINPD,
25702 IX86_BUILTIN_MINSD,
25703
25704 IX86_BUILTIN_ANDPD,
25705 IX86_BUILTIN_ANDNPD,
25706 IX86_BUILTIN_ORPD,
25707 IX86_BUILTIN_XORPD,
25708
25709 IX86_BUILTIN_SQRTPD,
25710 IX86_BUILTIN_SQRTSD,
25711
25712 IX86_BUILTIN_UNPCKHPD,
25713 IX86_BUILTIN_UNPCKLPD,
25714
25715 IX86_BUILTIN_SHUFPD,
25716
25717 IX86_BUILTIN_LOADUPD,
25718 IX86_BUILTIN_STOREUPD,
25719 IX86_BUILTIN_MOVSD,
25720
25721 IX86_BUILTIN_LOADHPD,
25722 IX86_BUILTIN_LOADLPD,
25723
25724 IX86_BUILTIN_CVTDQ2PD,
25725 IX86_BUILTIN_CVTDQ2PS,
25726
25727 IX86_BUILTIN_CVTPD2DQ,
25728 IX86_BUILTIN_CVTPD2PI,
25729 IX86_BUILTIN_CVTPD2PS,
25730 IX86_BUILTIN_CVTTPD2DQ,
25731 IX86_BUILTIN_CVTTPD2PI,
25732
25733 IX86_BUILTIN_CVTPI2PD,
25734 IX86_BUILTIN_CVTSI2SD,
25735 IX86_BUILTIN_CVTSI642SD,
25736
25737 IX86_BUILTIN_CVTSD2SI,
25738 IX86_BUILTIN_CVTSD2SI64,
25739 IX86_BUILTIN_CVTSD2SS,
25740 IX86_BUILTIN_CVTSS2SD,
25741 IX86_BUILTIN_CVTTSD2SI,
25742 IX86_BUILTIN_CVTTSD2SI64,
25743
25744 IX86_BUILTIN_CVTPS2DQ,
25745 IX86_BUILTIN_CVTPS2PD,
25746 IX86_BUILTIN_CVTTPS2DQ,
25747
25748 IX86_BUILTIN_MOVNTI,
25749 IX86_BUILTIN_MOVNTI64,
25750 IX86_BUILTIN_MOVNTPD,
25751 IX86_BUILTIN_MOVNTDQ,
25752
25753 IX86_BUILTIN_MOVQ128,
25754
25755 /* SSE2 MMX */
25756 IX86_BUILTIN_MASKMOVDQU,
25757 IX86_BUILTIN_MOVMSKPD,
25758 IX86_BUILTIN_PMOVMSKB128,
25759
25760 IX86_BUILTIN_PACKSSWB128,
25761 IX86_BUILTIN_PACKSSDW128,
25762 IX86_BUILTIN_PACKUSWB128,
25763
25764 IX86_BUILTIN_PADDB128,
25765 IX86_BUILTIN_PADDW128,
25766 IX86_BUILTIN_PADDD128,
25767 IX86_BUILTIN_PADDQ128,
25768 IX86_BUILTIN_PADDSB128,
25769 IX86_BUILTIN_PADDSW128,
25770 IX86_BUILTIN_PADDUSB128,
25771 IX86_BUILTIN_PADDUSW128,
25772 IX86_BUILTIN_PSUBB128,
25773 IX86_BUILTIN_PSUBW128,
25774 IX86_BUILTIN_PSUBD128,
25775 IX86_BUILTIN_PSUBQ128,
25776 IX86_BUILTIN_PSUBSB128,
25777 IX86_BUILTIN_PSUBSW128,
25778 IX86_BUILTIN_PSUBUSB128,
25779 IX86_BUILTIN_PSUBUSW128,
25780
25781 IX86_BUILTIN_PAND128,
25782 IX86_BUILTIN_PANDN128,
25783 IX86_BUILTIN_POR128,
25784 IX86_BUILTIN_PXOR128,
25785
25786 IX86_BUILTIN_PAVGB128,
25787 IX86_BUILTIN_PAVGW128,
25788
25789 IX86_BUILTIN_PCMPEQB128,
25790 IX86_BUILTIN_PCMPEQW128,
25791 IX86_BUILTIN_PCMPEQD128,
25792 IX86_BUILTIN_PCMPGTB128,
25793 IX86_BUILTIN_PCMPGTW128,
25794 IX86_BUILTIN_PCMPGTD128,
25795
25796 IX86_BUILTIN_PMADDWD128,
25797
25798 IX86_BUILTIN_PMAXSW128,
25799 IX86_BUILTIN_PMAXUB128,
25800 IX86_BUILTIN_PMINSW128,
25801 IX86_BUILTIN_PMINUB128,
25802
25803 IX86_BUILTIN_PMULUDQ,
25804 IX86_BUILTIN_PMULUDQ128,
25805 IX86_BUILTIN_PMULHUW128,
25806 IX86_BUILTIN_PMULHW128,
25807 IX86_BUILTIN_PMULLW128,
25808
25809 IX86_BUILTIN_PSADBW128,
25810 IX86_BUILTIN_PSHUFHW,
25811 IX86_BUILTIN_PSHUFLW,
25812 IX86_BUILTIN_PSHUFD,
25813
25814 IX86_BUILTIN_PSLLDQI128,
25815 IX86_BUILTIN_PSLLWI128,
25816 IX86_BUILTIN_PSLLDI128,
25817 IX86_BUILTIN_PSLLQI128,
25818 IX86_BUILTIN_PSRAWI128,
25819 IX86_BUILTIN_PSRADI128,
25820 IX86_BUILTIN_PSRLDQI128,
25821 IX86_BUILTIN_PSRLWI128,
25822 IX86_BUILTIN_PSRLDI128,
25823 IX86_BUILTIN_PSRLQI128,
25824
25825 IX86_BUILTIN_PSLLDQ128,
25826 IX86_BUILTIN_PSLLW128,
25827 IX86_BUILTIN_PSLLD128,
25828 IX86_BUILTIN_PSLLQ128,
25829 IX86_BUILTIN_PSRAW128,
25830 IX86_BUILTIN_PSRAD128,
25831 IX86_BUILTIN_PSRLW128,
25832 IX86_BUILTIN_PSRLD128,
25833 IX86_BUILTIN_PSRLQ128,
25834
25835 IX86_BUILTIN_PUNPCKHBW128,
25836 IX86_BUILTIN_PUNPCKHWD128,
25837 IX86_BUILTIN_PUNPCKHDQ128,
25838 IX86_BUILTIN_PUNPCKHQDQ128,
25839 IX86_BUILTIN_PUNPCKLBW128,
25840 IX86_BUILTIN_PUNPCKLWD128,
25841 IX86_BUILTIN_PUNPCKLDQ128,
25842 IX86_BUILTIN_PUNPCKLQDQ128,
25843
25844 IX86_BUILTIN_CLFLUSH,
25845 IX86_BUILTIN_MFENCE,
25846 IX86_BUILTIN_LFENCE,
25847 IX86_BUILTIN_PAUSE,
25848
25849 IX86_BUILTIN_BSRSI,
25850 IX86_BUILTIN_BSRDI,
25851 IX86_BUILTIN_RDPMC,
25852 IX86_BUILTIN_RDTSC,
25853 IX86_BUILTIN_RDTSCP,
25854 IX86_BUILTIN_ROLQI,
25855 IX86_BUILTIN_ROLHI,
25856 IX86_BUILTIN_RORQI,
25857 IX86_BUILTIN_RORHI,
25858
25859 /* SSE3. */
25860 IX86_BUILTIN_ADDSUBPS,
25861 IX86_BUILTIN_HADDPS,
25862 IX86_BUILTIN_HSUBPS,
25863 IX86_BUILTIN_MOVSHDUP,
25864 IX86_BUILTIN_MOVSLDUP,
25865 IX86_BUILTIN_ADDSUBPD,
25866 IX86_BUILTIN_HADDPD,
25867 IX86_BUILTIN_HSUBPD,
25868 IX86_BUILTIN_LDDQU,
25869
25870 IX86_BUILTIN_MONITOR,
25871 IX86_BUILTIN_MWAIT,
25872
25873 /* SSSE3. */
25874 IX86_BUILTIN_PHADDW,
25875 IX86_BUILTIN_PHADDD,
25876 IX86_BUILTIN_PHADDSW,
25877 IX86_BUILTIN_PHSUBW,
25878 IX86_BUILTIN_PHSUBD,
25879 IX86_BUILTIN_PHSUBSW,
25880 IX86_BUILTIN_PMADDUBSW,
25881 IX86_BUILTIN_PMULHRSW,
25882 IX86_BUILTIN_PSHUFB,
25883 IX86_BUILTIN_PSIGNB,
25884 IX86_BUILTIN_PSIGNW,
25885 IX86_BUILTIN_PSIGND,
25886 IX86_BUILTIN_PALIGNR,
25887 IX86_BUILTIN_PABSB,
25888 IX86_BUILTIN_PABSW,
25889 IX86_BUILTIN_PABSD,
25890
25891 IX86_BUILTIN_PHADDW128,
25892 IX86_BUILTIN_PHADDD128,
25893 IX86_BUILTIN_PHADDSW128,
25894 IX86_BUILTIN_PHSUBW128,
25895 IX86_BUILTIN_PHSUBD128,
25896 IX86_BUILTIN_PHSUBSW128,
25897 IX86_BUILTIN_PMADDUBSW128,
25898 IX86_BUILTIN_PMULHRSW128,
25899 IX86_BUILTIN_PSHUFB128,
25900 IX86_BUILTIN_PSIGNB128,
25901 IX86_BUILTIN_PSIGNW128,
25902 IX86_BUILTIN_PSIGND128,
25903 IX86_BUILTIN_PALIGNR128,
25904 IX86_BUILTIN_PABSB128,
25905 IX86_BUILTIN_PABSW128,
25906 IX86_BUILTIN_PABSD128,
25907
25908 /* AMDFAM10 - SSE4A New Instructions. */
25909 IX86_BUILTIN_MOVNTSD,
25910 IX86_BUILTIN_MOVNTSS,
25911 IX86_BUILTIN_EXTRQI,
25912 IX86_BUILTIN_EXTRQ,
25913 IX86_BUILTIN_INSERTQI,
25914 IX86_BUILTIN_INSERTQ,
25915
25916 /* SSE4.1. */
25917 IX86_BUILTIN_BLENDPD,
25918 IX86_BUILTIN_BLENDPS,
25919 IX86_BUILTIN_BLENDVPD,
25920 IX86_BUILTIN_BLENDVPS,
25921 IX86_BUILTIN_PBLENDVB128,
25922 IX86_BUILTIN_PBLENDW128,
25923
25924 IX86_BUILTIN_DPPD,
25925 IX86_BUILTIN_DPPS,
25926
25927 IX86_BUILTIN_INSERTPS128,
25928
25929 IX86_BUILTIN_MOVNTDQA,
25930 IX86_BUILTIN_MPSADBW128,
25931 IX86_BUILTIN_PACKUSDW128,
25932 IX86_BUILTIN_PCMPEQQ,
25933 IX86_BUILTIN_PHMINPOSUW128,
25934
25935 IX86_BUILTIN_PMAXSB128,
25936 IX86_BUILTIN_PMAXSD128,
25937 IX86_BUILTIN_PMAXUD128,
25938 IX86_BUILTIN_PMAXUW128,
25939
25940 IX86_BUILTIN_PMINSB128,
25941 IX86_BUILTIN_PMINSD128,
25942 IX86_BUILTIN_PMINUD128,
25943 IX86_BUILTIN_PMINUW128,
25944
25945 IX86_BUILTIN_PMOVSXBW128,
25946 IX86_BUILTIN_PMOVSXBD128,
25947 IX86_BUILTIN_PMOVSXBQ128,
25948 IX86_BUILTIN_PMOVSXWD128,
25949 IX86_BUILTIN_PMOVSXWQ128,
25950 IX86_BUILTIN_PMOVSXDQ128,
25951
25952 IX86_BUILTIN_PMOVZXBW128,
25953 IX86_BUILTIN_PMOVZXBD128,
25954 IX86_BUILTIN_PMOVZXBQ128,
25955 IX86_BUILTIN_PMOVZXWD128,
25956 IX86_BUILTIN_PMOVZXWQ128,
25957 IX86_BUILTIN_PMOVZXDQ128,
25958
25959 IX86_BUILTIN_PMULDQ128,
25960 IX86_BUILTIN_PMULLD128,
25961
25962 IX86_BUILTIN_ROUNDSD,
25963 IX86_BUILTIN_ROUNDSS,
25964
25965 IX86_BUILTIN_ROUNDPD,
25966 IX86_BUILTIN_ROUNDPS,
25967
25968 IX86_BUILTIN_FLOORPD,
25969 IX86_BUILTIN_CEILPD,
25970 IX86_BUILTIN_TRUNCPD,
25971 IX86_BUILTIN_RINTPD,
25972 IX86_BUILTIN_ROUNDPD_AZ,
25973
25974 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25975 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25976 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25977
25978 IX86_BUILTIN_FLOORPS,
25979 IX86_BUILTIN_CEILPS,
25980 IX86_BUILTIN_TRUNCPS,
25981 IX86_BUILTIN_RINTPS,
25982 IX86_BUILTIN_ROUNDPS_AZ,
25983
25984 IX86_BUILTIN_FLOORPS_SFIX,
25985 IX86_BUILTIN_CEILPS_SFIX,
25986 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25987
25988 IX86_BUILTIN_PTESTZ,
25989 IX86_BUILTIN_PTESTC,
25990 IX86_BUILTIN_PTESTNZC,
25991
25992 IX86_BUILTIN_VEC_INIT_V2SI,
25993 IX86_BUILTIN_VEC_INIT_V4HI,
25994 IX86_BUILTIN_VEC_INIT_V8QI,
25995 IX86_BUILTIN_VEC_EXT_V2DF,
25996 IX86_BUILTIN_VEC_EXT_V2DI,
25997 IX86_BUILTIN_VEC_EXT_V4SF,
25998 IX86_BUILTIN_VEC_EXT_V4SI,
25999 IX86_BUILTIN_VEC_EXT_V8HI,
26000 IX86_BUILTIN_VEC_EXT_V2SI,
26001 IX86_BUILTIN_VEC_EXT_V4HI,
26002 IX86_BUILTIN_VEC_EXT_V16QI,
26003 IX86_BUILTIN_VEC_SET_V2DI,
26004 IX86_BUILTIN_VEC_SET_V4SF,
26005 IX86_BUILTIN_VEC_SET_V4SI,
26006 IX86_BUILTIN_VEC_SET_V8HI,
26007 IX86_BUILTIN_VEC_SET_V4HI,
26008 IX86_BUILTIN_VEC_SET_V16QI,
26009
26010 IX86_BUILTIN_VEC_PACK_SFIX,
26011 IX86_BUILTIN_VEC_PACK_SFIX256,
26012
26013 /* SSE4.2. */
26014 IX86_BUILTIN_CRC32QI,
26015 IX86_BUILTIN_CRC32HI,
26016 IX86_BUILTIN_CRC32SI,
26017 IX86_BUILTIN_CRC32DI,
26018
26019 IX86_BUILTIN_PCMPESTRI128,
26020 IX86_BUILTIN_PCMPESTRM128,
26021 IX86_BUILTIN_PCMPESTRA128,
26022 IX86_BUILTIN_PCMPESTRC128,
26023 IX86_BUILTIN_PCMPESTRO128,
26024 IX86_BUILTIN_PCMPESTRS128,
26025 IX86_BUILTIN_PCMPESTRZ128,
26026 IX86_BUILTIN_PCMPISTRI128,
26027 IX86_BUILTIN_PCMPISTRM128,
26028 IX86_BUILTIN_PCMPISTRA128,
26029 IX86_BUILTIN_PCMPISTRC128,
26030 IX86_BUILTIN_PCMPISTRO128,
26031 IX86_BUILTIN_PCMPISTRS128,
26032 IX86_BUILTIN_PCMPISTRZ128,
26033
26034 IX86_BUILTIN_PCMPGTQ,
26035
26036 /* AES instructions */
26037 IX86_BUILTIN_AESENC128,
26038 IX86_BUILTIN_AESENCLAST128,
26039 IX86_BUILTIN_AESDEC128,
26040 IX86_BUILTIN_AESDECLAST128,
26041 IX86_BUILTIN_AESIMC128,
26042 IX86_BUILTIN_AESKEYGENASSIST128,
26043
26044 /* PCLMUL instruction */
26045 IX86_BUILTIN_PCLMULQDQ128,
26046
26047 /* AVX */
26048 IX86_BUILTIN_ADDPD256,
26049 IX86_BUILTIN_ADDPS256,
26050 IX86_BUILTIN_ADDSUBPD256,
26051 IX86_BUILTIN_ADDSUBPS256,
26052 IX86_BUILTIN_ANDPD256,
26053 IX86_BUILTIN_ANDPS256,
26054 IX86_BUILTIN_ANDNPD256,
26055 IX86_BUILTIN_ANDNPS256,
26056 IX86_BUILTIN_BLENDPD256,
26057 IX86_BUILTIN_BLENDPS256,
26058 IX86_BUILTIN_BLENDVPD256,
26059 IX86_BUILTIN_BLENDVPS256,
26060 IX86_BUILTIN_DIVPD256,
26061 IX86_BUILTIN_DIVPS256,
26062 IX86_BUILTIN_DPPS256,
26063 IX86_BUILTIN_HADDPD256,
26064 IX86_BUILTIN_HADDPS256,
26065 IX86_BUILTIN_HSUBPD256,
26066 IX86_BUILTIN_HSUBPS256,
26067 IX86_BUILTIN_MAXPD256,
26068 IX86_BUILTIN_MAXPS256,
26069 IX86_BUILTIN_MINPD256,
26070 IX86_BUILTIN_MINPS256,
26071 IX86_BUILTIN_MULPD256,
26072 IX86_BUILTIN_MULPS256,
26073 IX86_BUILTIN_ORPD256,
26074 IX86_BUILTIN_ORPS256,
26075 IX86_BUILTIN_SHUFPD256,
26076 IX86_BUILTIN_SHUFPS256,
26077 IX86_BUILTIN_SUBPD256,
26078 IX86_BUILTIN_SUBPS256,
26079 IX86_BUILTIN_XORPD256,
26080 IX86_BUILTIN_XORPS256,
26081 IX86_BUILTIN_CMPSD,
26082 IX86_BUILTIN_CMPSS,
26083 IX86_BUILTIN_CMPPD,
26084 IX86_BUILTIN_CMPPS,
26085 IX86_BUILTIN_CMPPD256,
26086 IX86_BUILTIN_CMPPS256,
26087 IX86_BUILTIN_CVTDQ2PD256,
26088 IX86_BUILTIN_CVTDQ2PS256,
26089 IX86_BUILTIN_CVTPD2PS256,
26090 IX86_BUILTIN_CVTPS2DQ256,
26091 IX86_BUILTIN_CVTPS2PD256,
26092 IX86_BUILTIN_CVTTPD2DQ256,
26093 IX86_BUILTIN_CVTPD2DQ256,
26094 IX86_BUILTIN_CVTTPS2DQ256,
26095 IX86_BUILTIN_EXTRACTF128PD256,
26096 IX86_BUILTIN_EXTRACTF128PS256,
26097 IX86_BUILTIN_EXTRACTF128SI256,
26098 IX86_BUILTIN_VZEROALL,
26099 IX86_BUILTIN_VZEROUPPER,
26100 IX86_BUILTIN_VPERMILVARPD,
26101 IX86_BUILTIN_VPERMILVARPS,
26102 IX86_BUILTIN_VPERMILVARPD256,
26103 IX86_BUILTIN_VPERMILVARPS256,
26104 IX86_BUILTIN_VPERMILPD,
26105 IX86_BUILTIN_VPERMILPS,
26106 IX86_BUILTIN_VPERMILPD256,
26107 IX86_BUILTIN_VPERMILPS256,
26108 IX86_BUILTIN_VPERMIL2PD,
26109 IX86_BUILTIN_VPERMIL2PS,
26110 IX86_BUILTIN_VPERMIL2PD256,
26111 IX86_BUILTIN_VPERMIL2PS256,
26112 IX86_BUILTIN_VPERM2F128PD256,
26113 IX86_BUILTIN_VPERM2F128PS256,
26114 IX86_BUILTIN_VPERM2F128SI256,
26115 IX86_BUILTIN_VBROADCASTSS,
26116 IX86_BUILTIN_VBROADCASTSD256,
26117 IX86_BUILTIN_VBROADCASTSS256,
26118 IX86_BUILTIN_VBROADCASTPD256,
26119 IX86_BUILTIN_VBROADCASTPS256,
26120 IX86_BUILTIN_VINSERTF128PD256,
26121 IX86_BUILTIN_VINSERTF128PS256,
26122 IX86_BUILTIN_VINSERTF128SI256,
26123 IX86_BUILTIN_LOADUPD256,
26124 IX86_BUILTIN_LOADUPS256,
26125 IX86_BUILTIN_STOREUPD256,
26126 IX86_BUILTIN_STOREUPS256,
26127 IX86_BUILTIN_LDDQU256,
26128 IX86_BUILTIN_MOVNTDQ256,
26129 IX86_BUILTIN_MOVNTPD256,
26130 IX86_BUILTIN_MOVNTPS256,
26131 IX86_BUILTIN_LOADDQU256,
26132 IX86_BUILTIN_STOREDQU256,
26133 IX86_BUILTIN_MASKLOADPD,
26134 IX86_BUILTIN_MASKLOADPS,
26135 IX86_BUILTIN_MASKSTOREPD,
26136 IX86_BUILTIN_MASKSTOREPS,
26137 IX86_BUILTIN_MASKLOADPD256,
26138 IX86_BUILTIN_MASKLOADPS256,
26139 IX86_BUILTIN_MASKSTOREPD256,
26140 IX86_BUILTIN_MASKSTOREPS256,
26141 IX86_BUILTIN_MOVSHDUP256,
26142 IX86_BUILTIN_MOVSLDUP256,
26143 IX86_BUILTIN_MOVDDUP256,
26144
26145 IX86_BUILTIN_SQRTPD256,
26146 IX86_BUILTIN_SQRTPS256,
26147 IX86_BUILTIN_SQRTPS_NR256,
26148 IX86_BUILTIN_RSQRTPS256,
26149 IX86_BUILTIN_RSQRTPS_NR256,
26150
26151 IX86_BUILTIN_RCPPS256,
26152
26153 IX86_BUILTIN_ROUNDPD256,
26154 IX86_BUILTIN_ROUNDPS256,
26155
26156 IX86_BUILTIN_FLOORPD256,
26157 IX86_BUILTIN_CEILPD256,
26158 IX86_BUILTIN_TRUNCPD256,
26159 IX86_BUILTIN_RINTPD256,
26160 IX86_BUILTIN_ROUNDPD_AZ256,
26161
26162 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26163 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26164 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26165
26166 IX86_BUILTIN_FLOORPS256,
26167 IX86_BUILTIN_CEILPS256,
26168 IX86_BUILTIN_TRUNCPS256,
26169 IX86_BUILTIN_RINTPS256,
26170 IX86_BUILTIN_ROUNDPS_AZ256,
26171
26172 IX86_BUILTIN_FLOORPS_SFIX256,
26173 IX86_BUILTIN_CEILPS_SFIX256,
26174 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26175
26176 IX86_BUILTIN_UNPCKHPD256,
26177 IX86_BUILTIN_UNPCKLPD256,
26178 IX86_BUILTIN_UNPCKHPS256,
26179 IX86_BUILTIN_UNPCKLPS256,
26180
26181 IX86_BUILTIN_SI256_SI,
26182 IX86_BUILTIN_PS256_PS,
26183 IX86_BUILTIN_PD256_PD,
26184 IX86_BUILTIN_SI_SI256,
26185 IX86_BUILTIN_PS_PS256,
26186 IX86_BUILTIN_PD_PD256,
26187
26188 IX86_BUILTIN_VTESTZPD,
26189 IX86_BUILTIN_VTESTCPD,
26190 IX86_BUILTIN_VTESTNZCPD,
26191 IX86_BUILTIN_VTESTZPS,
26192 IX86_BUILTIN_VTESTCPS,
26193 IX86_BUILTIN_VTESTNZCPS,
26194 IX86_BUILTIN_VTESTZPD256,
26195 IX86_BUILTIN_VTESTCPD256,
26196 IX86_BUILTIN_VTESTNZCPD256,
26197 IX86_BUILTIN_VTESTZPS256,
26198 IX86_BUILTIN_VTESTCPS256,
26199 IX86_BUILTIN_VTESTNZCPS256,
26200 IX86_BUILTIN_PTESTZ256,
26201 IX86_BUILTIN_PTESTC256,
26202 IX86_BUILTIN_PTESTNZC256,
26203
26204 IX86_BUILTIN_MOVMSKPD256,
26205 IX86_BUILTIN_MOVMSKPS256,
26206
26207 /* AVX2 */
26208 IX86_BUILTIN_MPSADBW256,
26209 IX86_BUILTIN_PABSB256,
26210 IX86_BUILTIN_PABSW256,
26211 IX86_BUILTIN_PABSD256,
26212 IX86_BUILTIN_PACKSSDW256,
26213 IX86_BUILTIN_PACKSSWB256,
26214 IX86_BUILTIN_PACKUSDW256,
26215 IX86_BUILTIN_PACKUSWB256,
26216 IX86_BUILTIN_PADDB256,
26217 IX86_BUILTIN_PADDW256,
26218 IX86_BUILTIN_PADDD256,
26219 IX86_BUILTIN_PADDQ256,
26220 IX86_BUILTIN_PADDSB256,
26221 IX86_BUILTIN_PADDSW256,
26222 IX86_BUILTIN_PADDUSB256,
26223 IX86_BUILTIN_PADDUSW256,
26224 IX86_BUILTIN_PALIGNR256,
26225 IX86_BUILTIN_AND256I,
26226 IX86_BUILTIN_ANDNOT256I,
26227 IX86_BUILTIN_PAVGB256,
26228 IX86_BUILTIN_PAVGW256,
26229 IX86_BUILTIN_PBLENDVB256,
26230 IX86_BUILTIN_PBLENDVW256,
26231 IX86_BUILTIN_PCMPEQB256,
26232 IX86_BUILTIN_PCMPEQW256,
26233 IX86_BUILTIN_PCMPEQD256,
26234 IX86_BUILTIN_PCMPEQQ256,
26235 IX86_BUILTIN_PCMPGTB256,
26236 IX86_BUILTIN_PCMPGTW256,
26237 IX86_BUILTIN_PCMPGTD256,
26238 IX86_BUILTIN_PCMPGTQ256,
26239 IX86_BUILTIN_PHADDW256,
26240 IX86_BUILTIN_PHADDD256,
26241 IX86_BUILTIN_PHADDSW256,
26242 IX86_BUILTIN_PHSUBW256,
26243 IX86_BUILTIN_PHSUBD256,
26244 IX86_BUILTIN_PHSUBSW256,
26245 IX86_BUILTIN_PMADDUBSW256,
26246 IX86_BUILTIN_PMADDWD256,
26247 IX86_BUILTIN_PMAXSB256,
26248 IX86_BUILTIN_PMAXSW256,
26249 IX86_BUILTIN_PMAXSD256,
26250 IX86_BUILTIN_PMAXUB256,
26251 IX86_BUILTIN_PMAXUW256,
26252 IX86_BUILTIN_PMAXUD256,
26253 IX86_BUILTIN_PMINSB256,
26254 IX86_BUILTIN_PMINSW256,
26255 IX86_BUILTIN_PMINSD256,
26256 IX86_BUILTIN_PMINUB256,
26257 IX86_BUILTIN_PMINUW256,
26258 IX86_BUILTIN_PMINUD256,
26259 IX86_BUILTIN_PMOVMSKB256,
26260 IX86_BUILTIN_PMOVSXBW256,
26261 IX86_BUILTIN_PMOVSXBD256,
26262 IX86_BUILTIN_PMOVSXBQ256,
26263 IX86_BUILTIN_PMOVSXWD256,
26264 IX86_BUILTIN_PMOVSXWQ256,
26265 IX86_BUILTIN_PMOVSXDQ256,
26266 IX86_BUILTIN_PMOVZXBW256,
26267 IX86_BUILTIN_PMOVZXBD256,
26268 IX86_BUILTIN_PMOVZXBQ256,
26269 IX86_BUILTIN_PMOVZXWD256,
26270 IX86_BUILTIN_PMOVZXWQ256,
26271 IX86_BUILTIN_PMOVZXDQ256,
26272 IX86_BUILTIN_PMULDQ256,
26273 IX86_BUILTIN_PMULHRSW256,
26274 IX86_BUILTIN_PMULHUW256,
26275 IX86_BUILTIN_PMULHW256,
26276 IX86_BUILTIN_PMULLW256,
26277 IX86_BUILTIN_PMULLD256,
26278 IX86_BUILTIN_PMULUDQ256,
26279 IX86_BUILTIN_POR256,
26280 IX86_BUILTIN_PSADBW256,
26281 IX86_BUILTIN_PSHUFB256,
26282 IX86_BUILTIN_PSHUFD256,
26283 IX86_BUILTIN_PSHUFHW256,
26284 IX86_BUILTIN_PSHUFLW256,
26285 IX86_BUILTIN_PSIGNB256,
26286 IX86_BUILTIN_PSIGNW256,
26287 IX86_BUILTIN_PSIGND256,
26288 IX86_BUILTIN_PSLLDQI256,
26289 IX86_BUILTIN_PSLLWI256,
26290 IX86_BUILTIN_PSLLW256,
26291 IX86_BUILTIN_PSLLDI256,
26292 IX86_BUILTIN_PSLLD256,
26293 IX86_BUILTIN_PSLLQI256,
26294 IX86_BUILTIN_PSLLQ256,
26295 IX86_BUILTIN_PSRAWI256,
26296 IX86_BUILTIN_PSRAW256,
26297 IX86_BUILTIN_PSRADI256,
26298 IX86_BUILTIN_PSRAD256,
26299 IX86_BUILTIN_PSRLDQI256,
26300 IX86_BUILTIN_PSRLWI256,
26301 IX86_BUILTIN_PSRLW256,
26302 IX86_BUILTIN_PSRLDI256,
26303 IX86_BUILTIN_PSRLD256,
26304 IX86_BUILTIN_PSRLQI256,
26305 IX86_BUILTIN_PSRLQ256,
26306 IX86_BUILTIN_PSUBB256,
26307 IX86_BUILTIN_PSUBW256,
26308 IX86_BUILTIN_PSUBD256,
26309 IX86_BUILTIN_PSUBQ256,
26310 IX86_BUILTIN_PSUBSB256,
26311 IX86_BUILTIN_PSUBSW256,
26312 IX86_BUILTIN_PSUBUSB256,
26313 IX86_BUILTIN_PSUBUSW256,
26314 IX86_BUILTIN_PUNPCKHBW256,
26315 IX86_BUILTIN_PUNPCKHWD256,
26316 IX86_BUILTIN_PUNPCKHDQ256,
26317 IX86_BUILTIN_PUNPCKHQDQ256,
26318 IX86_BUILTIN_PUNPCKLBW256,
26319 IX86_BUILTIN_PUNPCKLWD256,
26320 IX86_BUILTIN_PUNPCKLDQ256,
26321 IX86_BUILTIN_PUNPCKLQDQ256,
26322 IX86_BUILTIN_PXOR256,
26323 IX86_BUILTIN_MOVNTDQA256,
26324 IX86_BUILTIN_VBROADCASTSS_PS,
26325 IX86_BUILTIN_VBROADCASTSS_PS256,
26326 IX86_BUILTIN_VBROADCASTSD_PD256,
26327 IX86_BUILTIN_VBROADCASTSI256,
26328 IX86_BUILTIN_PBLENDD256,
26329 IX86_BUILTIN_PBLENDD128,
26330 IX86_BUILTIN_PBROADCASTB256,
26331 IX86_BUILTIN_PBROADCASTW256,
26332 IX86_BUILTIN_PBROADCASTD256,
26333 IX86_BUILTIN_PBROADCASTQ256,
26334 IX86_BUILTIN_PBROADCASTB128,
26335 IX86_BUILTIN_PBROADCASTW128,
26336 IX86_BUILTIN_PBROADCASTD128,
26337 IX86_BUILTIN_PBROADCASTQ128,
26338 IX86_BUILTIN_VPERMVARSI256,
26339 IX86_BUILTIN_VPERMDF256,
26340 IX86_BUILTIN_VPERMVARSF256,
26341 IX86_BUILTIN_VPERMDI256,
26342 IX86_BUILTIN_VPERMTI256,
26343 IX86_BUILTIN_VEXTRACT128I256,
26344 IX86_BUILTIN_VINSERT128I256,
26345 IX86_BUILTIN_MASKLOADD,
26346 IX86_BUILTIN_MASKLOADQ,
26347 IX86_BUILTIN_MASKLOADD256,
26348 IX86_BUILTIN_MASKLOADQ256,
26349 IX86_BUILTIN_MASKSTORED,
26350 IX86_BUILTIN_MASKSTOREQ,
26351 IX86_BUILTIN_MASKSTORED256,
26352 IX86_BUILTIN_MASKSTOREQ256,
26353 IX86_BUILTIN_PSLLVV4DI,
26354 IX86_BUILTIN_PSLLVV2DI,
26355 IX86_BUILTIN_PSLLVV8SI,
26356 IX86_BUILTIN_PSLLVV4SI,
26357 IX86_BUILTIN_PSRAVV8SI,
26358 IX86_BUILTIN_PSRAVV4SI,
26359 IX86_BUILTIN_PSRLVV4DI,
26360 IX86_BUILTIN_PSRLVV2DI,
26361 IX86_BUILTIN_PSRLVV8SI,
26362 IX86_BUILTIN_PSRLVV4SI,
26363
26364 IX86_BUILTIN_GATHERSIV2DF,
26365 IX86_BUILTIN_GATHERSIV4DF,
26366 IX86_BUILTIN_GATHERDIV2DF,
26367 IX86_BUILTIN_GATHERDIV4DF,
26368 IX86_BUILTIN_GATHERSIV4SF,
26369 IX86_BUILTIN_GATHERSIV8SF,
26370 IX86_BUILTIN_GATHERDIV4SF,
26371 IX86_BUILTIN_GATHERDIV8SF,
26372 IX86_BUILTIN_GATHERSIV2DI,
26373 IX86_BUILTIN_GATHERSIV4DI,
26374 IX86_BUILTIN_GATHERDIV2DI,
26375 IX86_BUILTIN_GATHERDIV4DI,
26376 IX86_BUILTIN_GATHERSIV4SI,
26377 IX86_BUILTIN_GATHERSIV8SI,
26378 IX86_BUILTIN_GATHERDIV4SI,
26379 IX86_BUILTIN_GATHERDIV8SI,
26380
26381 /* Alternate 4 element gather for the vectorizer where
26382 all operands are 32-byte wide. */
26383 IX86_BUILTIN_GATHERALTSIV4DF,
26384 IX86_BUILTIN_GATHERALTDIV8SF,
26385 IX86_BUILTIN_GATHERALTSIV4DI,
26386 IX86_BUILTIN_GATHERALTDIV8SI,
26387
26388 /* TFmode support builtins. */
26389 IX86_BUILTIN_INFQ,
26390 IX86_BUILTIN_HUGE_VALQ,
26391 IX86_BUILTIN_FABSQ,
26392 IX86_BUILTIN_COPYSIGNQ,
26393
26394 /* Vectorizer support builtins. */
26395 IX86_BUILTIN_CPYSGNPS,
26396 IX86_BUILTIN_CPYSGNPD,
26397 IX86_BUILTIN_CPYSGNPS256,
26398 IX86_BUILTIN_CPYSGNPD256,
26399
26400 /* FMA4 instructions. */
26401 IX86_BUILTIN_VFMADDSS,
26402 IX86_BUILTIN_VFMADDSD,
26403 IX86_BUILTIN_VFMADDPS,
26404 IX86_BUILTIN_VFMADDPD,
26405 IX86_BUILTIN_VFMADDPS256,
26406 IX86_BUILTIN_VFMADDPD256,
26407 IX86_BUILTIN_VFMADDSUBPS,
26408 IX86_BUILTIN_VFMADDSUBPD,
26409 IX86_BUILTIN_VFMADDSUBPS256,
26410 IX86_BUILTIN_VFMADDSUBPD256,
26411
26412 /* FMA3 instructions. */
26413 IX86_BUILTIN_VFMADDSS3,
26414 IX86_BUILTIN_VFMADDSD3,
26415
26416 /* XOP instructions. */
26417 IX86_BUILTIN_VPCMOV,
26418 IX86_BUILTIN_VPCMOV_V2DI,
26419 IX86_BUILTIN_VPCMOV_V4SI,
26420 IX86_BUILTIN_VPCMOV_V8HI,
26421 IX86_BUILTIN_VPCMOV_V16QI,
26422 IX86_BUILTIN_VPCMOV_V4SF,
26423 IX86_BUILTIN_VPCMOV_V2DF,
26424 IX86_BUILTIN_VPCMOV256,
26425 IX86_BUILTIN_VPCMOV_V4DI256,
26426 IX86_BUILTIN_VPCMOV_V8SI256,
26427 IX86_BUILTIN_VPCMOV_V16HI256,
26428 IX86_BUILTIN_VPCMOV_V32QI256,
26429 IX86_BUILTIN_VPCMOV_V8SF256,
26430 IX86_BUILTIN_VPCMOV_V4DF256,
26431
26432 IX86_BUILTIN_VPPERM,
26433
26434 IX86_BUILTIN_VPMACSSWW,
26435 IX86_BUILTIN_VPMACSWW,
26436 IX86_BUILTIN_VPMACSSWD,
26437 IX86_BUILTIN_VPMACSWD,
26438 IX86_BUILTIN_VPMACSSDD,
26439 IX86_BUILTIN_VPMACSDD,
26440 IX86_BUILTIN_VPMACSSDQL,
26441 IX86_BUILTIN_VPMACSSDQH,
26442 IX86_BUILTIN_VPMACSDQL,
26443 IX86_BUILTIN_VPMACSDQH,
26444 IX86_BUILTIN_VPMADCSSWD,
26445 IX86_BUILTIN_VPMADCSWD,
26446
26447 IX86_BUILTIN_VPHADDBW,
26448 IX86_BUILTIN_VPHADDBD,
26449 IX86_BUILTIN_VPHADDBQ,
26450 IX86_BUILTIN_VPHADDWD,
26451 IX86_BUILTIN_VPHADDWQ,
26452 IX86_BUILTIN_VPHADDDQ,
26453 IX86_BUILTIN_VPHADDUBW,
26454 IX86_BUILTIN_VPHADDUBD,
26455 IX86_BUILTIN_VPHADDUBQ,
26456 IX86_BUILTIN_VPHADDUWD,
26457 IX86_BUILTIN_VPHADDUWQ,
26458 IX86_BUILTIN_VPHADDUDQ,
26459 IX86_BUILTIN_VPHSUBBW,
26460 IX86_BUILTIN_VPHSUBWD,
26461 IX86_BUILTIN_VPHSUBDQ,
26462
26463 IX86_BUILTIN_VPROTB,
26464 IX86_BUILTIN_VPROTW,
26465 IX86_BUILTIN_VPROTD,
26466 IX86_BUILTIN_VPROTQ,
26467 IX86_BUILTIN_VPROTB_IMM,
26468 IX86_BUILTIN_VPROTW_IMM,
26469 IX86_BUILTIN_VPROTD_IMM,
26470 IX86_BUILTIN_VPROTQ_IMM,
26471
26472 IX86_BUILTIN_VPSHLB,
26473 IX86_BUILTIN_VPSHLW,
26474 IX86_BUILTIN_VPSHLD,
26475 IX86_BUILTIN_VPSHLQ,
26476 IX86_BUILTIN_VPSHAB,
26477 IX86_BUILTIN_VPSHAW,
26478 IX86_BUILTIN_VPSHAD,
26479 IX86_BUILTIN_VPSHAQ,
26480
26481 IX86_BUILTIN_VFRCZSS,
26482 IX86_BUILTIN_VFRCZSD,
26483 IX86_BUILTIN_VFRCZPS,
26484 IX86_BUILTIN_VFRCZPD,
26485 IX86_BUILTIN_VFRCZPS256,
26486 IX86_BUILTIN_VFRCZPD256,
26487
26488 IX86_BUILTIN_VPCOMEQUB,
26489 IX86_BUILTIN_VPCOMNEUB,
26490 IX86_BUILTIN_VPCOMLTUB,
26491 IX86_BUILTIN_VPCOMLEUB,
26492 IX86_BUILTIN_VPCOMGTUB,
26493 IX86_BUILTIN_VPCOMGEUB,
26494 IX86_BUILTIN_VPCOMFALSEUB,
26495 IX86_BUILTIN_VPCOMTRUEUB,
26496
26497 IX86_BUILTIN_VPCOMEQUW,
26498 IX86_BUILTIN_VPCOMNEUW,
26499 IX86_BUILTIN_VPCOMLTUW,
26500 IX86_BUILTIN_VPCOMLEUW,
26501 IX86_BUILTIN_VPCOMGTUW,
26502 IX86_BUILTIN_VPCOMGEUW,
26503 IX86_BUILTIN_VPCOMFALSEUW,
26504 IX86_BUILTIN_VPCOMTRUEUW,
26505
26506 IX86_BUILTIN_VPCOMEQUD,
26507 IX86_BUILTIN_VPCOMNEUD,
26508 IX86_BUILTIN_VPCOMLTUD,
26509 IX86_BUILTIN_VPCOMLEUD,
26510 IX86_BUILTIN_VPCOMGTUD,
26511 IX86_BUILTIN_VPCOMGEUD,
26512 IX86_BUILTIN_VPCOMFALSEUD,
26513 IX86_BUILTIN_VPCOMTRUEUD,
26514
26515 IX86_BUILTIN_VPCOMEQUQ,
26516 IX86_BUILTIN_VPCOMNEUQ,
26517 IX86_BUILTIN_VPCOMLTUQ,
26518 IX86_BUILTIN_VPCOMLEUQ,
26519 IX86_BUILTIN_VPCOMGTUQ,
26520 IX86_BUILTIN_VPCOMGEUQ,
26521 IX86_BUILTIN_VPCOMFALSEUQ,
26522 IX86_BUILTIN_VPCOMTRUEUQ,
26523
26524 IX86_BUILTIN_VPCOMEQB,
26525 IX86_BUILTIN_VPCOMNEB,
26526 IX86_BUILTIN_VPCOMLTB,
26527 IX86_BUILTIN_VPCOMLEB,
26528 IX86_BUILTIN_VPCOMGTB,
26529 IX86_BUILTIN_VPCOMGEB,
26530 IX86_BUILTIN_VPCOMFALSEB,
26531 IX86_BUILTIN_VPCOMTRUEB,
26532
26533 IX86_BUILTIN_VPCOMEQW,
26534 IX86_BUILTIN_VPCOMNEW,
26535 IX86_BUILTIN_VPCOMLTW,
26536 IX86_BUILTIN_VPCOMLEW,
26537 IX86_BUILTIN_VPCOMGTW,
26538 IX86_BUILTIN_VPCOMGEW,
26539 IX86_BUILTIN_VPCOMFALSEW,
26540 IX86_BUILTIN_VPCOMTRUEW,
26541
26542 IX86_BUILTIN_VPCOMEQD,
26543 IX86_BUILTIN_VPCOMNED,
26544 IX86_BUILTIN_VPCOMLTD,
26545 IX86_BUILTIN_VPCOMLED,
26546 IX86_BUILTIN_VPCOMGTD,
26547 IX86_BUILTIN_VPCOMGED,
26548 IX86_BUILTIN_VPCOMFALSED,
26549 IX86_BUILTIN_VPCOMTRUED,
26550
26551 IX86_BUILTIN_VPCOMEQQ,
26552 IX86_BUILTIN_VPCOMNEQ,
26553 IX86_BUILTIN_VPCOMLTQ,
26554 IX86_BUILTIN_VPCOMLEQ,
26555 IX86_BUILTIN_VPCOMGTQ,
26556 IX86_BUILTIN_VPCOMGEQ,
26557 IX86_BUILTIN_VPCOMFALSEQ,
26558 IX86_BUILTIN_VPCOMTRUEQ,
26559
26560 /* LWP instructions. */
26561 IX86_BUILTIN_LLWPCB,
26562 IX86_BUILTIN_SLWPCB,
26563 IX86_BUILTIN_LWPVAL32,
26564 IX86_BUILTIN_LWPVAL64,
26565 IX86_BUILTIN_LWPINS32,
26566 IX86_BUILTIN_LWPINS64,
26567
26568 IX86_BUILTIN_CLZS,
26569
26570 /* RTM */
26571 IX86_BUILTIN_XBEGIN,
26572 IX86_BUILTIN_XEND,
26573 IX86_BUILTIN_XABORT,
26574 IX86_BUILTIN_XTEST,
26575
26576 /* BMI instructions. */
26577 IX86_BUILTIN_BEXTR32,
26578 IX86_BUILTIN_BEXTR64,
26579 IX86_BUILTIN_CTZS,
26580
26581 /* TBM instructions. */
26582 IX86_BUILTIN_BEXTRI32,
26583 IX86_BUILTIN_BEXTRI64,
26584
26585 /* BMI2 instructions. */
26586 IX86_BUILTIN_BZHI32,
26587 IX86_BUILTIN_BZHI64,
26588 IX86_BUILTIN_PDEP32,
26589 IX86_BUILTIN_PDEP64,
26590 IX86_BUILTIN_PEXT32,
26591 IX86_BUILTIN_PEXT64,
26592
26593 /* ADX instructions. */
26594 IX86_BUILTIN_ADDCARRYX32,
26595 IX86_BUILTIN_ADDCARRYX64,
26596
26597 /* FSGSBASE instructions. */
26598 IX86_BUILTIN_RDFSBASE32,
26599 IX86_BUILTIN_RDFSBASE64,
26600 IX86_BUILTIN_RDGSBASE32,
26601 IX86_BUILTIN_RDGSBASE64,
26602 IX86_BUILTIN_WRFSBASE32,
26603 IX86_BUILTIN_WRFSBASE64,
26604 IX86_BUILTIN_WRGSBASE32,
26605 IX86_BUILTIN_WRGSBASE64,
26606
26607 /* RDRND instructions. */
26608 IX86_BUILTIN_RDRAND16_STEP,
26609 IX86_BUILTIN_RDRAND32_STEP,
26610 IX86_BUILTIN_RDRAND64_STEP,
26611
26612 /* RDSEED instructions. */
26613 IX86_BUILTIN_RDSEED16_STEP,
26614 IX86_BUILTIN_RDSEED32_STEP,
26615 IX86_BUILTIN_RDSEED64_STEP,
26616
26617 /* F16C instructions. */
26618 IX86_BUILTIN_CVTPH2PS,
26619 IX86_BUILTIN_CVTPH2PS256,
26620 IX86_BUILTIN_CVTPS2PH,
26621 IX86_BUILTIN_CVTPS2PH256,
26622
26623 /* CFString built-in for darwin */
26624 IX86_BUILTIN_CFSTRING,
26625
26626 /* Builtins to get CPU type and supported features. */
26627 IX86_BUILTIN_CPU_INIT,
26628 IX86_BUILTIN_CPU_IS,
26629 IX86_BUILTIN_CPU_SUPPORTS,
26630
26631 IX86_BUILTIN_MAX
26632 };
26633
26634 /* Table for the ix86 builtin decls. */
26635 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26636
26637 /* Table of all of the builtin functions that are possible with different ISAs
26638 but are waiting to be built until a function is declared to use that
26639 ISA. */
26640 struct builtin_isa {
26641 const char *name; /* function name */
26642 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26643 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26644 bool const_p; /* true if the declaration is constant */
26645 bool set_and_not_built_p;
26646 };
26647
26648 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26649
26650
26651 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save MASK,
26652 the set of isa_flags this builtin requires, in the ix86_builtins_isa array.
26653 Store the function decl in the ix86_builtins array. Return the function decl
26654 or NULL_TREE if the builtin was not added.
26655
26656 If the front end has a special hook for builtin functions, delay adding
26657 builtin functions that aren't in the current ISA until the ISA is changed
26658 with function specific optimization. Doing so can save about 300K for the
26659 default compiler. When the builtin is expanded, check at that time whether
26660 it is valid.
26661
26662 If the front end doesn't have a special hook, record all builtins, even those
26663 whose instruction set isn't in the current ISA, in case the user uses
26664 function specific options for a different ISA, so that we don't get scope
26665 errors if a builtin is added in the middle of a function scope. */
26666
26667 static inline tree
26668 def_builtin (HOST_WIDE_INT mask, const char *name,
26669 enum ix86_builtin_func_type tcode,
26670 enum ix86_builtins code)
26671 {
26672 tree decl = NULL_TREE;
26673
26674 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26675 {
26676 ix86_builtins_isa[(int) code].isa = mask;
26677
26678 mask &= ~OPTION_MASK_ISA_64BIT;
26679 if (mask == 0
26680 || (mask & ix86_isa_flags) != 0
26681 || (lang_hooks.builtin_function
26682 == lang_hooks.builtin_function_ext_scope))
26683
26684 {
26685 tree type = ix86_get_builtin_func_type (tcode);
26686 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26687 NULL, NULL_TREE);
26688 ix86_builtins[(int) code] = decl;
26689 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26690 }
26691 else
26692 {
26693 ix86_builtins[(int) code] = NULL_TREE;
26694 ix86_builtins_isa[(int) code].tcode = tcode;
26695 ix86_builtins_isa[(int) code].name = name;
26696 ix86_builtins_isa[(int) code].const_p = false;
26697 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26698 }
26699 }
26700
26701 return decl;
26702 }
26703
26704 /* Like def_builtin, but also marks the function decl "const". */
26705
26706 static inline tree
26707 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26708 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26709 {
26710 tree decl = def_builtin (mask, name, tcode, code);
26711 if (decl)
26712 TREE_READONLY (decl) = 1;
26713 else
26714 ix86_builtins_isa[(int) code].const_p = true;
26715
26716 return decl;
26717 }
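/* Illustrative usage (a hypothetical sketch by the editor; the actual
   registrations appear later in this file):

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf",
			FLOAT_FTYPE_FLOAT, IX86_BUILTIN_RSQRTF);

   If SSE is not enabled in ix86_isa_flags at this point and the front end
   has no ext_scope hook, no decl is built yet; the request is only recorded
   in ix86_builtins_isa (with const_p set), and ix86_add_new_builtins above
   creates the decl once the ISA becomes available.  */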
26718
26719 /* Add any new builtin functions for a given ISA that may not have been
26720 declared. This saves a bit of space compared to adding all of the
26721 declarations to the tree even when they would never be used.
26722
26723 static void
26724 ix86_add_new_builtins (HOST_WIDE_INT isa)
26725 {
26726 int i;
26727
26728 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26729 {
26730 if ((ix86_builtins_isa[i].isa & isa) != 0
26731 && ix86_builtins_isa[i].set_and_not_built_p)
26732 {
26733 tree decl, type;
26734
26735 /* Don't define the builtin again. */
26736 ix86_builtins_isa[i].set_and_not_built_p = false;
26737
26738 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26739 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26740 type, i, BUILT_IN_MD, NULL,
26741 NULL_TREE);
26742
26743 ix86_builtins[i] = decl;
26744 if (ix86_builtins_isa[i].const_p)
26745 TREE_READONLY (decl) = 1;
26746 }
26747 }
26748 }
26749
26750 /* Bits for builtin_description.flag. */
26751
26752 /* Set when we don't support the comparison natively, and should
26753 swap_comparison in order to support it. */
26754 #define BUILTIN_DESC_SWAP_OPERANDS 1
26755
26756 struct builtin_description
26757 {
26758 const HOST_WIDE_INT mask;
26759 const enum insn_code icode;
26760 const char *const name;
26761 const enum ix86_builtins code;
26762 const enum rtx_code comparison;
26763 const int flag;
26764 };
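/* Note on the tables that follow (summary added by the editor): the FLAG
   field is interpreted per table.  In bdesc_comi it can carry the
   BUILTIN_DESC_SWAP_OPERANDS bit defined above; in bdesc_pcmpestr and
   bdesc_pcmpistr it holds the CC mode produced by the flag-reading variants
   (CCAmode, CCCmode, ...); in bdesc_special_args and bdesc_args it holds
   the ix86_builtin_func_type describing the builtin's prototype, e.g.
   VOID_FTYPE_VOID.  */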
26765
26766 static const struct builtin_description bdesc_comi[] =
26767 {
26768 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26769 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26770 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26773 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26774 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26775 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26776 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26777 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26778 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26779 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26792 };
26793
26794 static const struct builtin_description bdesc_pcmpestr[] =
26795 {
26796 /* SSE4.2 */
26797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26798 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26799 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26800 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26801 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26802 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26803 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26804 };
26805
26806 static const struct builtin_description bdesc_pcmpistr[] =
26807 {
26808 /* SSE4.2 */
26809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26810 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26811 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26812 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26813 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26814 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26815 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26816 };
26817
26818 /* Special builtins with variable number of arguments. */
26819 static const struct builtin_description bdesc_special_args[] =
26820 {
26821 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26822 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26823 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26824
26825 /* MMX */
26826 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26827
26828 /* 3DNow! */
26829 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26830
26831 /* FXSR, XSAVE and XSAVEOPT */
26832 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26833 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26834 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26835 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26836 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26837
26838 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26839 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26840 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26841 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26842 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26843
26844 /* SSE */
26845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26847 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26848
26849 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26850 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26851 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26852 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26853
26854 /* SSE or 3DNow!A */
26855 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26856 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26857
26858 /* SSE2 */
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26860 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26862 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26866 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26868 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26869
26870 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26871 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26872
26873 /* SSE3 */
26874 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26875
26876 /* SSE4.1 */
26877 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26878
26879 /* SSE4A */
26880 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26881 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26882
26883 /* AVX */
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26886
26887 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26888 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26889 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26892
26893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26896 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26898 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26900
26901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26904
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26907 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26908 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26909 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26910 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26912 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26913
26914 /* AVX2 */
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26924
26925 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26926 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26927 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26928 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26929 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26930 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26931
26932 /* FSGSBASE */
26933 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26934 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26935 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26936 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26937 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26938 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26939 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26940 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26941
26942 /* RTM */
26943 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26944 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26945 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26946 };
26947
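/* Illustrative note (not part of the original GCC sources): each entry in
   these tables pairs an ISA option mask and an insn pattern with a builtin
   name and a prototype index, which the expander code uses to type-check
   and expand calls to that builtin.  As a rough, hedged sketch of what such
   entries mean to user code, the IX86_BUILTIN_MASKSTOREPD row above
   (VOID_FTYPE_PV2DF_V2DI_V2DF) and the IX86_BUILTIN_ADDPS row below
   (V4SF_FTYPE_V4SF_V4SF) correspond to calls like the following, assuming
   a GCC target with -mavx and -msse enabled; the function names
   "store_low_lane" and "add4" are made up for the example:

     typedef double v2df __attribute__ ((vector_size (16)));
     typedef long long v2di __attribute__ ((vector_size (16)));
     typedef float v4sf __attribute__ ((vector_size (16)));

     void
     store_low_lane (double *p, v2df val)
     {
       // Only lanes whose mask element has the sign bit set are written.
       v2di mask = { -1, 0 };
       __builtin_ia32_maskstorepd ((v2df *) p, mask, val);
     }

     v4sf
     add4 (v4sf a, v4sf b)
     {
       // Expands through the bdesc_args entry for IX86_BUILTIN_ADDPS.
       return __builtin_ia32_addps (a, b);
     }
*/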
26948 /* Builtins with variable number of arguments. */
26949 static const struct builtin_description bdesc_args[] =
26950 {
26951 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26952 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26953 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26954 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26955 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26956 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26957 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26958
26959 /* MMX */
26960 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26962 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26966
26967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26969 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26972 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26975
26976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26978
26979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26983
26984 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26990
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26997
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27001
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27003
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27010
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27017
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27022
27023 /* 3DNow! */
27024 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27028
27029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27031 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27032 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27035 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27036 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27037 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27038 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27039 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27040 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27041 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27042 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27043 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27044
27045 /* 3DNow!A */
27046 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27047 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27048 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27049 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27050 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27051 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27052
27053 /* SSE */
27054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27056 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27058 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27059 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27060 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27062 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27065 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27066
27067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27068
27069 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27070 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27071 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27077
27078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27100
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27105
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27110
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27118
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27121 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27122
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27124
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27128
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27131
27132 /* SSE MMX or 3DNow!A */
27133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27134 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27135 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27136
27137 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27138 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27139 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27140 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27141
27142 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27143 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27144
27145 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27146
27147 /* SSE2 */
27148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27149
27150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27154 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27155
27156 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27157 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27160 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27161
27162 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27163
27164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27166 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27167 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27168
27169 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27170 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27172
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27175 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27181
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27202
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27207
27208 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27212
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27214
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27218
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27220
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27229
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27238
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27241
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27246
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27249
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27256
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27261
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27270
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27274
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27277
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27280
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27282
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27284 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27287
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27295
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27303
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27308
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27310 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27312
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27314
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27316
27317 /* SSE2 MMX */
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27320
27321 /* SSE3 */
27322 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27323 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27324
27325 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27326 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27327 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27328 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27329 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27330 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27331
27332 /* SSSE3 */
27333 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27339
27340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27355 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27356 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27358 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27359 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27362 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27364
27365 /* SSSE3. */
27366 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27367 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27368
27369 /* SSE4.1 */
27370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27374 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27376 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27380
27381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27388 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27394
27395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27403 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27404 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27405 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27406 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27407
27408 /* SSE4.1 */
27409 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27412 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27413
27414 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27415 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27416 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27417 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27418
27419 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27420 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27421
27422 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27423 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27424
27425 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27426 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27427 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27428 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27429
27430 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27431 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27432
27433 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27434 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27435
27436 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27437 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27438 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27439
27440 /* SSE4.2 */
27441 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27442 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27443 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27444 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27445 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27446
27447 /* SSE4A */
27448 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27449 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27450 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27451 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27452
27453 /* AES.  The name field is 0 in these entries; the builtins themselves are registered separately elsewhere in this file with their full ISA requirements. */
27454 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27455 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27456
27457 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27458 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27459 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27460 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27461
27462 /* PCLMUL */
27463 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27464
27465 /* AVX */
27466 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27467 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27470 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27471 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27474 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27479 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27482 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27485 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27486 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27487 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27488 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27489 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27490 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27491 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27492
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27497
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27532
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27536
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27542
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27544
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27547
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27552
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27555
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27558
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27563
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27566
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27569
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27574
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27581
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27597
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27600
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27603
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27605
27606 /* AVX2 */
27607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27608 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27609 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27610 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27611 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27612 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27615 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3, "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3, "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3, "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3, "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2, "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2, "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2, "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2, "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2, "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2, "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2, "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2, "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2, "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2, "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3, "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256", IX86_BUILTIN_PMULHUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256", IX86_BUILTIN_PMULHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256", IX86_BUILTIN_PMULLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256", IX86_BUILTIN_PMULLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3, "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27753
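/* LZCNT */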
27754 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27755
27756 /* BMI */
27757 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27758 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27759 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27760
27761 /* TBM */
27762 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27763 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27764
27765 /* F16C */
27766 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27767 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27768 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27769 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27770
27771 /* BMI2 */
27772 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27773 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27774 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27775 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27776 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27777 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27778 };
27779
27780 /* FMA4 and XOP. */
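/* Shorthand names for the ix86_builtin_func_type codes used in
   the bdesc_multi_arg table below.  */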
27781 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27782 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27783 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27784 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27785 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27786 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27787 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27788 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27789 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27790 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27791 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27792 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27793 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27794 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27795 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27796 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27797 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27798 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27799 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27800 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27801 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27802 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27803 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27804 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27805 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27806 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27807 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27808 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27809 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27810 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27811 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27812 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27813 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27814 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27815 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27816 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27817 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27818 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27819 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27820 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27821 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27822 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27823 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27824 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27825 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27826 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27827 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27828 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27829 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27830 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27831 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27832 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27833
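/* Builtins taking two, three or four vector operands (FMA4, FMA and XOP);
   these entries are expanded via ix86_expand_multi_arg_builtin.  */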
27834 static const struct builtin_description bdesc_multi_arg[] =
27835 {
27836 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27837 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27838 UNKNOWN, (int)MULTI_ARG_3_SF },
27839 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27840 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27841 UNKNOWN, (int)MULTI_ARG_3_DF },
27842
27843 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27844 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27845 UNKNOWN, (int)MULTI_ARG_3_SF },
27846 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27847 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27848 UNKNOWN, (int)MULTI_ARG_3_DF },
27849
27850 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27851 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27852 UNKNOWN, (int)MULTI_ARG_3_SF },
27853 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27854 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27855 UNKNOWN, (int)MULTI_ARG_3_DF },
27856 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27857 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27858 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27859 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27860 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27861 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27862
27863 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27864 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27865 UNKNOWN, (int)MULTI_ARG_3_SF },
27866 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27867 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27868 UNKNOWN, (int)MULTI_ARG_3_DF },
27869 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27870 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27871 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27872 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27873 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27874 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27875
27876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27883
27884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27891
27892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27893
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27906
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27923
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27930
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27946
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27954
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27962
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27970
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27978
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27986
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27994
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28002
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28010
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28019
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28028
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28033
28034 };
28035 \f
28036 /* TM vector builtins. */
28037
28038 /* Reuse the existing x86-specific `struct builtin_description' because
28039    we're lazy.  Add casts to make them fit.  */
28040 static const struct builtin_description bdesc_tm[] =
28041 {
28042 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28043 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28044 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28045 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28046 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28047 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28048 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28049
28050 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28051 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28052 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28053 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28054 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28055 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28056 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28057
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28059 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28060 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28061 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28062 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28063 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28064 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28065
28066 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28067 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28068 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28069 };
28070
28071 /* TM callbacks. */
28072
28073 /* Return the builtin decl needed to load a vector of TYPE. */
28074
28075 static tree
28076 ix86_builtin_tm_load (tree type)
28077 {
28078 if (TREE_CODE (type) == VECTOR_TYPE)
28079 {
28080 switch (tree_low_cst (TYPE_SIZE (type), 1))
28081 {
28082 case 64:
28083 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28084 case 128:
28085 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28086 case 256:
28087 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28088 }
28089 }
28090 return NULL_TREE;
28091 }
28092
28093 /* Return the builtin decl needed to store a vector of TYPE. */
28094
28095 static tree
28096 ix86_builtin_tm_store (tree type)
28097 {
28098 if (TREE_CODE (type) == VECTOR_TYPE)
28099 {
28100 switch (tree_low_cst (TYPE_SIZE (type), 1))
28101 {
28102 case 64:
28103 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28104 case 128:
28105 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28106 case 256:
28107 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28108 }
28109 }
28110 return NULL_TREE;
28111 }
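
/* For example (an illustrative note, not from the original sources): for a
   128-bit vector type such as V4SF, ix86_builtin_tm_load returns the decl
   registered for BUILT_IN_TM_LOAD_M128 (the "__builtin__ITM_RM128" entry in
   bdesc_tm above), and ix86_builtin_tm_store returns the corresponding
   BUILT_IN_TM_STORE_M128 decl ("__builtin__ITM_WM128").  */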
28112 \f
28113 /* Initialize the transactional memory vector load/store builtins. */
28114
28115 static void
28116 ix86_init_tm_builtins (void)
28117 {
28118 enum ix86_builtin_func_type ftype;
28119 const struct builtin_description *d;
28120 size_t i;
28121 tree decl;
28122 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28123 tree attrs_log, attrs_type_log;
28124
28125 if (!flag_tm)
28126 return;
28127
28128 /* If there are no builtins defined, we must be compiling in a
28129 language without trans-mem support. */
28130 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28131 return;
28132
28133 /* Use whatever attributes a normal TM load has. */
28134 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28135 attrs_load = DECL_ATTRIBUTES (decl);
28136 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28137 /* Use whatever attributes a normal TM store has. */
28138 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28139 attrs_store = DECL_ATTRIBUTES (decl);
28140 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28141 /* Use whatever attributes a normal TM log has. */
28142 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28143 attrs_log = DECL_ATTRIBUTES (decl);
28144 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28145
28146 for (i = 0, d = bdesc_tm;
28147 i < ARRAY_SIZE (bdesc_tm);
28148 i++, d++)
28149 {
28150 if ((d->mask & ix86_isa_flags) != 0
28151 || (lang_hooks.builtin_function
28152 == lang_hooks.builtin_function_ext_scope))
28153 {
28154 tree type, attrs, attrs_type;
28155 enum built_in_function code = (enum built_in_function) d->code;
28156
28157 ftype = (enum ix86_builtin_func_type) d->flag;
28158 type = ix86_get_builtin_func_type (ftype);
28159
28160 if (BUILTIN_TM_LOAD_P (code))
28161 {
28162 attrs = attrs_load;
28163 attrs_type = attrs_type_load;
28164 }
28165 else if (BUILTIN_TM_STORE_P (code))
28166 {
28167 attrs = attrs_store;
28168 attrs_type = attrs_type_store;
28169 }
28170 else
28171 {
28172 attrs = attrs_log;
28173 attrs_type = attrs_type_log;
28174 }
28175 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28176 /* The builtin without the prefix for
28177 calling it directly. */
28178 d->name + strlen ("__builtin_"),
28179 attrs);
28180 	  /* add_builtin_function () will set the DECL_ATTRIBUTES; now
28181 	     set the TYPE_ATTRIBUTES.  */
28182 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28183
28184 set_builtin_decl (code, decl, false);
28185 }
28186 }
28187 }
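
/* For example (illustrative): the "__builtin__ITM_WM256" entry in bdesc_tm
   is registered with the library name "_ITM_WM256", because the loop above
   passes d->name + strlen ("__builtin_") as the name used for calling the
   builtin directly.  */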
28188
28189 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
28190    in the current target ISA, so that the user can compile particular modules
28191    with target-specific options that differ from the command-line
28192    options.  */
28193 static void
28194 ix86_init_mmx_sse_builtins (void)
28195 {
28196 const struct builtin_description * d;
28197 enum ix86_builtin_func_type ftype;
28198 size_t i;
28199
28200 /* Add all special builtins with variable number of operands. */
28201 for (i = 0, d = bdesc_special_args;
28202 i < ARRAY_SIZE (bdesc_special_args);
28203 i++, d++)
28204 {
28205 if (d->name == 0)
28206 continue;
28207
28208 ftype = (enum ix86_builtin_func_type) d->flag;
28209 def_builtin (d->mask, d->name, ftype, d->code);
28210 }
28211
28212 /* Add all builtins with variable number of operands. */
28213 for (i = 0, d = bdesc_args;
28214 i < ARRAY_SIZE (bdesc_args);
28215 i++, d++)
28216 {
28217 if (d->name == 0)
28218 continue;
28219
28220 ftype = (enum ix86_builtin_func_type) d->flag;
28221 def_builtin_const (d->mask, d->name, ftype, d->code);
28222 }
28223
28224 /* pcmpestr[im] insns. */
28225 for (i = 0, d = bdesc_pcmpestr;
28226 i < ARRAY_SIZE (bdesc_pcmpestr);
28227 i++, d++)
28228 {
28229 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28230 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28231 else
28232 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28233 def_builtin_const (d->mask, d->name, ftype, d->code);
28234 }
28235
28236 /* pcmpistr[im] insns. */
28237 for (i = 0, d = bdesc_pcmpistr;
28238 i < ARRAY_SIZE (bdesc_pcmpistr);
28239 i++, d++)
28240 {
28241 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28242 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28243 else
28244 ftype = INT_FTYPE_V16QI_V16QI_INT;
28245 def_builtin_const (d->mask, d->name, ftype, d->code);
28246 }
28247
28248 /* comi/ucomi insns. */
28249 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28250 {
28251 if (d->mask == OPTION_MASK_ISA_SSE2)
28252 ftype = INT_FTYPE_V2DF_V2DF;
28253 else
28254 ftype = INT_FTYPE_V4SF_V4SF;
28255 def_builtin_const (d->mask, d->name, ftype, d->code);
28256 }
28257
28258 /* SSE */
28259 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28260 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28261 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28262 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28263
28264 /* SSE or 3DNow!A */
28265 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28266 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28267 IX86_BUILTIN_MASKMOVQ);
28268
28269 /* SSE2 */
28270 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28271 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28272
28273 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28274 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28275 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28276 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28277
28278 /* SSE3. */
28279 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28280 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28281 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28282 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28283
28284 /* AES */
28285 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28286 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28287 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28288 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28289 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28290 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28291 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28292 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28293 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28294 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28295 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28296 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28297
28298 /* PCLMUL */
28299 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28300 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28301
28302 /* RDRND */
28303 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28304 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28305 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28306 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28307 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28308 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28309 IX86_BUILTIN_RDRAND64_STEP);
28310
28311 /* AVX2 */
28312 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28313 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28314 IX86_BUILTIN_GATHERSIV2DF);
28315
28316 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28317 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28318 IX86_BUILTIN_GATHERSIV4DF);
28319
28320 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28321 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28322 IX86_BUILTIN_GATHERDIV2DF);
28323
28324 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28325 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28326 IX86_BUILTIN_GATHERDIV4DF);
28327
28328 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28329 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28330 IX86_BUILTIN_GATHERSIV4SF);
28331
28332 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28333 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28334 IX86_BUILTIN_GATHERSIV8SF);
28335
28336 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28337 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28338 IX86_BUILTIN_GATHERDIV4SF);
28339
28340 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28341 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28342 IX86_BUILTIN_GATHERDIV8SF);
28343
28344 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28345 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28346 IX86_BUILTIN_GATHERSIV2DI);
28347
28348 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28349 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28350 IX86_BUILTIN_GATHERSIV4DI);
28351
28352 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28353 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28354 IX86_BUILTIN_GATHERDIV2DI);
28355
28356 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28357 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28358 IX86_BUILTIN_GATHERDIV4DI);
28359
28360 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28361 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28362 IX86_BUILTIN_GATHERSIV4SI);
28363
28364 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28365 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28366 IX86_BUILTIN_GATHERSIV8SI);
28367
28368 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28369 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28370 IX86_BUILTIN_GATHERDIV4SI);
28371
28372 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28373 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28374 IX86_BUILTIN_GATHERDIV8SI);
28375
28376   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28377 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28378 IX86_BUILTIN_GATHERALTSIV4DF);
28379
28380   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28381 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28382 IX86_BUILTIN_GATHERALTDIV8SF);
28383
28384   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28385 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28386 IX86_BUILTIN_GATHERALTSIV4DI);
28387
28388   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28389 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28390 IX86_BUILTIN_GATHERALTDIV8SI);
28391
28392 /* RTM. */
28393 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28394 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28395
28396 /* MMX access to the vec_init patterns. */
28397 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28398 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28399
28400 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28401 V4HI_FTYPE_HI_HI_HI_HI,
28402 IX86_BUILTIN_VEC_INIT_V4HI);
28403
28404 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28405 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28406 IX86_BUILTIN_VEC_INIT_V8QI);
28407
28408 /* Access to the vec_extract patterns. */
28409 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28410 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28411 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28412 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28413 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28414 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28415 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28416 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28417 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28418 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28419
28420 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28421 "__builtin_ia32_vec_ext_v4hi",
28422 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28423
28424 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28425 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28426
28427 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28428 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28429
28430 /* Access to the vec_set patterns. */
28431 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28432 "__builtin_ia32_vec_set_v2di",
28433 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28434
28435 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28436 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28437
28438 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28439 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28440
28441 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28442 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28443
28444 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28445 "__builtin_ia32_vec_set_v4hi",
28446 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28447
28448 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28449 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28450
28451 /* RDSEED */
28452 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28453 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28454 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28455 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28456 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28457 "__builtin_ia32_rdseed_di_step",
28458 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28459
28460 /* ADCX */
28461 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28462 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28463 def_builtin (OPTION_MASK_ISA_64BIT,
28464 "__builtin_ia32_addcarryx_u64",
28465 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28466 IX86_BUILTIN_ADDCARRYX64);
28467
28468 /* Add FMA4 multi-arg argument instructions */
28469 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28470 {
28471 if (d->name == 0)
28472 continue;
28473
28474 ftype = (enum ix86_builtin_func_type) d->flag;
28475 def_builtin_const (d->mask, d->name, ftype, d->code);
28476 }
28477 }
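
/* A minimal usage sketch for one of the builtins defined above (user code,
   not part of this file; assumes a target with RDRND enabled, e.g. -mrdrnd).
   The builtin returns nonzero on success and stores the random value through
   the pointer:

     unsigned int val;
     while (!__builtin_ia32_rdrand32_step (&val))
       ;
*/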
28478
28479 /* This builds the processor_model struct type defined in
28480    libgcc/config/i386/cpuinfo.c.  */
28481
28482 static tree
28483 build_processor_model_struct (void)
28484 {
28485 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
28486 "__cpu_features"};
28487 tree field = NULL_TREE, field_chain = NULL_TREE;
28488 int i;
28489 tree type = make_node (RECORD_TYPE);
28490
28491 /* The first 3 fields are unsigned int. */
28492 for (i = 0; i < 3; ++i)
28493 {
28494 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28495 get_identifier (field_name[i]), unsigned_type_node);
28496 if (field_chain != NULL_TREE)
28497 DECL_CHAIN (field) = field_chain;
28498 field_chain = field;
28499 }
28500
28501 /* The last field is an array of unsigned integers of size one. */
28502 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28503 get_identifier (field_name[3]),
28504 build_array_type (unsigned_type_node,
28505 build_index_type (size_one_node)));
28506 if (field_chain != NULL_TREE)
28507 DECL_CHAIN (field) = field_chain;
28508 field_chain = field;
28509
28510 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
28511 return type;
28512 }
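
/* For reference, a sketch of the structure this mirrors, as declared in
   libgcc/config/i386/cpuinfo.c (shown here for illustration only):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     } __cpu_model;
*/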
28513
28514 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
28515
28516 static tree
28517 make_var_decl (tree type, const char *name)
28518 {
28519 tree new_decl;
28520
28521 new_decl = build_decl (UNKNOWN_LOCATION,
28522 VAR_DECL,
28523 get_identifier(name),
28524 type);
28525
28526 DECL_EXTERNAL (new_decl) = 1;
28527 TREE_STATIC (new_decl) = 1;
28528 TREE_PUBLIC (new_decl) = 1;
28529 DECL_INITIAL (new_decl) = 0;
28530 DECL_ARTIFICIAL (new_decl) = 0;
28531 DECL_PRESERVE_P (new_decl) = 1;
28532
28533 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
28534 assemble_variable (new_decl, 0, 0, 0);
28535
28536 return new_decl;
28537 }
28538
28539 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
28540    into an integer defined in libgcc/config/i386/cpuinfo.c.  */
28541
28542 static tree
28543 fold_builtin_cpu (tree fndecl, tree *args)
28544 {
28545 unsigned int i;
28546 enum ix86_builtins fn_code = (enum ix86_builtins)
28547 DECL_FUNCTION_CODE (fndecl);
28548 tree param_string_cst = NULL;
28549
28550 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
28551 enum processor_features
28552 {
28553 F_CMOV = 0,
28554 F_MMX,
28555 F_POPCNT,
28556 F_SSE,
28557 F_SSE2,
28558 F_SSE3,
28559 F_SSSE3,
28560 F_SSE4_1,
28561 F_SSE4_2,
28562 F_AVX,
28563 F_AVX2,
28564 F_MAX
28565 };
28566
28567 /* These are the values for vendor types and cpu types and subtypes
28568      in cpuinfo.c.  The corresponding start value must be subtracted
28569      from cpu types and subtypes.  */
28570 enum processor_model
28571 {
28572 M_INTEL = 1,
28573 M_AMD,
28574 M_CPU_TYPE_START,
28575 M_INTEL_ATOM,
28576 M_INTEL_CORE2,
28577 M_INTEL_COREI7,
28578 M_AMDFAM10H,
28579 M_AMDFAM15H,
28580 M_CPU_SUBTYPE_START,
28581 M_INTEL_COREI7_NEHALEM,
28582 M_INTEL_COREI7_WESTMERE,
28583 M_INTEL_COREI7_SANDYBRIDGE,
28584 M_AMDFAM10H_BARCELONA,
28585 M_AMDFAM10H_SHANGHAI,
28586 M_AMDFAM10H_ISTANBUL,
28587 M_AMDFAM15H_BDVER1,
28588 M_AMDFAM15H_BDVER2
28589 };
28590
28591 static struct _arch_names_table
28592 {
28593 const char *const name;
28594 const enum processor_model model;
28595 }
28596 const arch_names_table[] =
28597 {
28598 {"amd", M_AMD},
28599 {"intel", M_INTEL},
28600 {"atom", M_INTEL_ATOM},
28601 {"core2", M_INTEL_CORE2},
28602 {"corei7", M_INTEL_COREI7},
28603 {"nehalem", M_INTEL_COREI7_NEHALEM},
28604 {"westmere", M_INTEL_COREI7_WESTMERE},
28605 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
28606 {"amdfam10h", M_AMDFAM10H},
28607 {"barcelona", M_AMDFAM10H_BARCELONA},
28608 {"shanghai", M_AMDFAM10H_SHANGHAI},
28609 {"istanbul", M_AMDFAM10H_ISTANBUL},
28610 {"amdfam15h", M_AMDFAM15H},
28611 {"bdver1", M_AMDFAM15H_BDVER1},
28612 {"bdver2", M_AMDFAM15H_BDVER2},
28613 };
28614
28615 static struct _isa_names_table
28616 {
28617 const char *const name;
28618 const enum processor_features feature;
28619 }
28620 const isa_names_table[] =
28621 {
28622 {"cmov", F_CMOV},
28623 {"mmx", F_MMX},
28624 {"popcnt", F_POPCNT},
28625 {"sse", F_SSE},
28626 {"sse2", F_SSE2},
28627 {"sse3", F_SSE3},
28628 {"ssse3", F_SSSE3},
28629 {"sse4.1", F_SSE4_1},
28630 {"sse4.2", F_SSE4_2},
28631 {"avx", F_AVX},
28632 {"avx2", F_AVX2}
28633 };
28634
28635 static tree __processor_model_type = NULL_TREE;
28636 static tree __cpu_model_var = NULL_TREE;
28637
28638 if (__processor_model_type == NULL_TREE)
28639 __processor_model_type = build_processor_model_struct ();
28640
28641 if (__cpu_model_var == NULL_TREE)
28642 __cpu_model_var = make_var_decl (__processor_model_type,
28643 "__cpu_model");
28644
28645 gcc_assert ((args != NULL) && (*args != NULL));
28646
28647 param_string_cst = *args;
28648 while (param_string_cst
28649 && TREE_CODE (param_string_cst) != STRING_CST)
28650 {
28651       /* *args must be an expr that can contain other EXPRS leading to a
28652 STRING_CST. */
28653 if (!EXPR_P (param_string_cst))
28654 {
28655 error ("Parameter to builtin must be a string constant or literal");
28656 return integer_zero_node;
28657 }
28658 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
28659 }
28660
28661 gcc_assert (param_string_cst);
28662
28663 if (fn_code == IX86_BUILTIN_CPU_IS)
28664 {
28665 tree ref;
28666 tree field;
28667 unsigned int field_val = 0;
28668 unsigned int NUM_ARCH_NAMES
28669 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
28670
28671 for (i = 0; i < NUM_ARCH_NAMES; i++)
28672 if (strcmp (arch_names_table[i].name,
28673 TREE_STRING_POINTER (param_string_cst)) == 0)
28674 break;
28675
28676 if (i == NUM_ARCH_NAMES)
28677 {
28678 error ("Parameter to builtin not valid: %s",
28679 TREE_STRING_POINTER (param_string_cst));
28680 return integer_zero_node;
28681 }
28682
28683 field = TYPE_FIELDS (__processor_model_type);
28684 field_val = arch_names_table[i].model;
28685
28686 /* CPU types are stored in the next field. */
28687 if (field_val > M_CPU_TYPE_START
28688 && field_val < M_CPU_SUBTYPE_START)
28689 {
28690 field = DECL_CHAIN (field);
28691 field_val -= M_CPU_TYPE_START;
28692 }
28693
28694 /* CPU subtypes are stored in the next field. */
28695 if (field_val > M_CPU_SUBTYPE_START)
28696 {
28697 	  field = DECL_CHAIN (DECL_CHAIN (field));
28698 field_val -= M_CPU_SUBTYPE_START;
28699 }
28700
28701 /* Get the appropriate field in __cpu_model. */
28702 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28703 field, NULL_TREE);
28704
28705 /* Check the value. */
28706 return build2 (EQ_EXPR, unsigned_type_node, ref,
28707 build_int_cstu (unsigned_type_node, field_val));
28708 }
28709 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28710 {
28711 tree ref;
28712 tree array_elt;
28713 tree field;
28714 unsigned int field_val = 0;
28715 unsigned int NUM_ISA_NAMES
28716 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28717
28718 for (i = 0; i < NUM_ISA_NAMES; i++)
28719 if (strcmp (isa_names_table[i].name,
28720 TREE_STRING_POINTER (param_string_cst)) == 0)
28721 break;
28722
28723 if (i == NUM_ISA_NAMES)
28724 {
28725 error ("Parameter to builtin not valid: %s",
28726 TREE_STRING_POINTER (param_string_cst));
28727 return integer_zero_node;
28728 }
28729
28730 field = TYPE_FIELDS (__processor_model_type);
28731 /* Get the last field, which is __cpu_features. */
28732 while (DECL_CHAIN (field))
28733 field = DECL_CHAIN (field);
28734
28735 /* Get the appropriate field: __cpu_model.__cpu_features */
28736 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28737 field, NULL_TREE);
28738
28739 /* Access the 0th element of __cpu_features array. */
28740 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28741 integer_zero_node, NULL_TREE, NULL_TREE);
28742
28743 field_val = (1 << isa_names_table[i].feature);
28744 /* Return __cpu_model.__cpu_features[0] & field_val */
28745 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28746 build_int_cstu (unsigned_type_node, field_val));
28747 }
28748 gcc_unreachable ();
28749 }
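
/* Illustrative sketches of the folding above (not verbatim compiler output):

     __builtin_cpu_is ("corei7")
       becomes   __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("avx2")
       becomes   __cpu_model.__cpu_features[0] & (1 << F_AVX2)
*/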
28750
28751 static tree
28752 ix86_fold_builtin (tree fndecl, int n_args,
28753 tree *args, bool ignore ATTRIBUTE_UNUSED)
28754 {
28755 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28756 {
28757 enum ix86_builtins fn_code = (enum ix86_builtins)
28758 DECL_FUNCTION_CODE (fndecl);
28759 if (fn_code == IX86_BUILTIN_CPU_IS
28760 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28761 {
28762 gcc_assert (n_args == 1);
28763 return fold_builtin_cpu (fndecl, args);
28764 }
28765 }
28766
28767 #ifdef SUBTARGET_FOLD_BUILTIN
28768 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
28769 #endif
28770
28771 return NULL_TREE;
28772 }
28773
28774 /* Make builtins to detect cpu type and features supported. NAME is
28775 the builtin name, CODE is the builtin code, and FTYPE is the function
28776 type of the builtin. */
28777
28778 static void
28779 make_cpu_type_builtin (const char* name, int code,
28780 enum ix86_builtin_func_type ftype, bool is_const)
28781 {
28782 tree decl;
28783 tree type;
28784
28785 type = ix86_get_builtin_func_type (ftype);
28786 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28787 NULL, NULL_TREE);
28788 gcc_assert (decl != NULL_TREE);
28789 ix86_builtins[(int) code] = decl;
28790 TREE_READONLY (decl) = is_const;
28791 }
28792
28793 /* Make builtins to get CPU type and features supported. The created
28794    builtins are:
28795
28796 __builtin_cpu_init (), to detect cpu type and features,
28797 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28798 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28799 */
28800
28801 static void
28802 ix86_init_platform_type_builtins (void)
28803 {
28804 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28805 INT_FTYPE_VOID, false);
28806 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28807 INT_FTYPE_PCCHAR, true);
28808 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28809 INT_FTYPE_PCCHAR, true);
28810 }
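
/* A minimal user-level sketch of the three builtins created above
   (illustrative only; the function is hypothetical):

     int
     use_fast_path (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))
         return 1;
       return __builtin_cpu_supports ("sse4.2") != 0;
     }
*/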
28811
28812 /* Internal method for ix86_init_builtins. */
28813
28814 static void
28815 ix86_init_builtins_va_builtins_abi (void)
28816 {
28817 tree ms_va_ref, sysv_va_ref;
28818 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28819 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28820 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28821 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28822
28823 if (!TARGET_64BIT)
28824 return;
28825 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28826 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28827 ms_va_ref = build_reference_type (ms_va_list_type_node);
28828 sysv_va_ref =
28829 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28830
28831 fnvoid_va_end_ms =
28832 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28833 fnvoid_va_start_ms =
28834 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28835 fnvoid_va_end_sysv =
28836 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28837 fnvoid_va_start_sysv =
28838 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28839 NULL_TREE);
28840 fnvoid_va_copy_ms =
28841 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28842 NULL_TREE);
28843 fnvoid_va_copy_sysv =
28844 build_function_type_list (void_type_node, sysv_va_ref,
28845 sysv_va_ref, NULL_TREE);
28846
28847 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28848 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28849 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28850 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28851 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28852 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28853 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28854 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28855 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28856 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28857 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28858 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28859 }
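
/* Usage sketch for the ABI-specific varargs builtins registered above
   (64-bit only, illustrative; the function name and body are hypothetical):

     int __attribute__ ((ms_abi))
     sum_ints (int count, ...)
     {
       __builtin_ms_va_list ap;
       int i, total = 0;

       __builtin_ms_va_start (ap, count);
       for (i = 0; i < count; i++)
         total += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return total;
     }
*/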
28860
28861 static void
28862 ix86_init_builtin_types (void)
28863 {
28864 tree float128_type_node, float80_type_node;
28865
28866 /* The __float80 type. */
28867 float80_type_node = long_double_type_node;
28868 if (TYPE_MODE (float80_type_node) != XFmode)
28869 {
28870 /* The __float80 type. */
28871 float80_type_node = make_node (REAL_TYPE);
28872
28873 TYPE_PRECISION (float80_type_node) = 80;
28874 layout_type (float80_type_node);
28875 }
28876 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28877
28878 /* The __float128 type. */
28879 float128_type_node = make_node (REAL_TYPE);
28880 TYPE_PRECISION (float128_type_node) = 128;
28881 layout_type (float128_type_node);
28882 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28883
28884 /* This macro is built by i386-builtin-types.awk. */
28885 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28886 }
28887
28888 static void
28889 ix86_init_builtins (void)
28890 {
28891 tree t;
28892
28893 ix86_init_builtin_types ();
28894
28895 /* Builtins to get CPU type and features. */
28896 ix86_init_platform_type_builtins ();
28897
28898 /* TFmode support builtins. */
28899 def_builtin_const (0, "__builtin_infq",
28900 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28901 def_builtin_const (0, "__builtin_huge_valq",
28902 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28903
28904   /* We will expand them to a normal call if SSE isn't available, since
28905      they are used by libgcc.  */
28906 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28907 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28908 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28909 TREE_READONLY (t) = 1;
28910 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28911
28912 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28913 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28914 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28915 TREE_READONLY (t) = 1;
28916 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28917
28918 ix86_init_tm_builtins ();
28919 ix86_init_mmx_sse_builtins ();
28920
28921 if (TARGET_LP64)
28922 ix86_init_builtins_va_builtins_abi ();
28923
28924 #ifdef SUBTARGET_INIT_BUILTINS
28925 SUBTARGET_INIT_BUILTINS;
28926 #endif
28927 }
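
/* Illustrative use of the TFmode builtins defined above (a sketch, assuming
   a target with __float128 support):

     __float128 x = -2.0;
     __float128 mag = __builtin_fabsq (x);
     __float128 y = __builtin_copysignq (mag, x);

   As noted above, __builtin_fabsq and __builtin_copysignq fall back to the
   libgcc routines __fabstf2 and __copysigntf3 when SSE is unavailable.  */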
28928
28929 /* Return the ix86 builtin for CODE. */
28930
28931 static tree
28932 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28933 {
28934 if (code >= IX86_BUILTIN_MAX)
28935 return error_mark_node;
28936
28937 return ix86_builtins[code];
28938 }
28939
28940 /* Errors in the source file can cause expand_expr to return const0_rtx
28941 where we expect a vector. To avoid crashing, use one of the vector
28942 clear instructions. */
28943 static rtx
28944 safe_vector_operand (rtx x, enum machine_mode mode)
28945 {
28946 if (x == const0_rtx)
28947 x = CONST0_RTX (mode);
28948 return x;
28949 }
28950
28951 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28952
28953 static rtx
28954 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28955 {
28956 rtx pat;
28957 tree arg0 = CALL_EXPR_ARG (exp, 0);
28958 tree arg1 = CALL_EXPR_ARG (exp, 1);
28959 rtx op0 = expand_normal (arg0);
28960 rtx op1 = expand_normal (arg1);
28961 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28962 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28963 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28964
28965 if (VECTOR_MODE_P (mode0))
28966 op0 = safe_vector_operand (op0, mode0);
28967 if (VECTOR_MODE_P (mode1))
28968 op1 = safe_vector_operand (op1, mode1);
28969
28970 if (optimize || !target
28971 || GET_MODE (target) != tmode
28972 || !insn_data[icode].operand[0].predicate (target, tmode))
28973 target = gen_reg_rtx (tmode);
28974
28975 if (GET_MODE (op1) == SImode && mode1 == TImode)
28976 {
28977 rtx x = gen_reg_rtx (V4SImode);
28978 emit_insn (gen_sse2_loadd (x, op1));
28979 op1 = gen_lowpart (TImode, x);
28980 }
28981
28982 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28983 op0 = copy_to_mode_reg (mode0, op0);
28984 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28985 op1 = copy_to_mode_reg (mode1, op1);
28986
28987 pat = GEN_FCN (icode) (target, op0, op1);
28988 if (! pat)
28989 return 0;
28990
28991 emit_insn (pat);
28992
28993 return target;
28994 }
28995
28996 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28997
28998 static rtx
28999 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
29000 enum ix86_builtin_func_type m_type,
29001 enum rtx_code sub_code)
29002 {
29003 rtx pat;
29004 int i;
29005 int nargs;
29006 bool comparison_p = false;
29007 bool tf_p = false;
29008 bool last_arg_constant = false;
29009 int num_memory = 0;
29010 struct {
29011 rtx op;
29012 enum machine_mode mode;
29013 } args[4];
29014
29015 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29016
29017 switch (m_type)
29018 {
29019 case MULTI_ARG_4_DF2_DI_I:
29020 case MULTI_ARG_4_DF2_DI_I1:
29021 case MULTI_ARG_4_SF2_SI_I:
29022 case MULTI_ARG_4_SF2_SI_I1:
29023 nargs = 4;
29024 last_arg_constant = true;
29025 break;
29026
29027 case MULTI_ARG_3_SF:
29028 case MULTI_ARG_3_DF:
29029 case MULTI_ARG_3_SF2:
29030 case MULTI_ARG_3_DF2:
29031 case MULTI_ARG_3_DI:
29032 case MULTI_ARG_3_SI:
29033 case MULTI_ARG_3_SI_DI:
29034 case MULTI_ARG_3_HI:
29035 case MULTI_ARG_3_HI_SI:
29036 case MULTI_ARG_3_QI:
29037 case MULTI_ARG_3_DI2:
29038 case MULTI_ARG_3_SI2:
29039 case MULTI_ARG_3_HI2:
29040 case MULTI_ARG_3_QI2:
29041 nargs = 3;
29042 break;
29043
29044 case MULTI_ARG_2_SF:
29045 case MULTI_ARG_2_DF:
29046 case MULTI_ARG_2_DI:
29047 case MULTI_ARG_2_SI:
29048 case MULTI_ARG_2_HI:
29049 case MULTI_ARG_2_QI:
29050 nargs = 2;
29051 break;
29052
29053 case MULTI_ARG_2_DI_IMM:
29054 case MULTI_ARG_2_SI_IMM:
29055 case MULTI_ARG_2_HI_IMM:
29056 case MULTI_ARG_2_QI_IMM:
29057 nargs = 2;
29058 last_arg_constant = true;
29059 break;
29060
29061 case MULTI_ARG_1_SF:
29062 case MULTI_ARG_1_DF:
29063 case MULTI_ARG_1_SF2:
29064 case MULTI_ARG_1_DF2:
29065 case MULTI_ARG_1_DI:
29066 case MULTI_ARG_1_SI:
29067 case MULTI_ARG_1_HI:
29068 case MULTI_ARG_1_QI:
29069 case MULTI_ARG_1_SI_DI:
29070 case MULTI_ARG_1_HI_DI:
29071 case MULTI_ARG_1_HI_SI:
29072 case MULTI_ARG_1_QI_DI:
29073 case MULTI_ARG_1_QI_SI:
29074 case MULTI_ARG_1_QI_HI:
29075 nargs = 1;
29076 break;
29077
29078 case MULTI_ARG_2_DI_CMP:
29079 case MULTI_ARG_2_SI_CMP:
29080 case MULTI_ARG_2_HI_CMP:
29081 case MULTI_ARG_2_QI_CMP:
29082 nargs = 2;
29083 comparison_p = true;
29084 break;
29085
29086 case MULTI_ARG_2_SF_TF:
29087 case MULTI_ARG_2_DF_TF:
29088 case MULTI_ARG_2_DI_TF:
29089 case MULTI_ARG_2_SI_TF:
29090 case MULTI_ARG_2_HI_TF:
29091 case MULTI_ARG_2_QI_TF:
29092 nargs = 2;
29093 tf_p = true;
29094 break;
29095
29096 default:
29097 gcc_unreachable ();
29098 }
29099
29100 if (optimize || !target
29101 || GET_MODE (target) != tmode
29102 || !insn_data[icode].operand[0].predicate (target, tmode))
29103 target = gen_reg_rtx (tmode);
29104
29105 gcc_assert (nargs <= 4);
29106
29107 for (i = 0; i < nargs; i++)
29108 {
29109 tree arg = CALL_EXPR_ARG (exp, i);
29110 rtx op = expand_normal (arg);
29111 int adjust = (comparison_p) ? 1 : 0;
29112 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
29113
29114 if (last_arg_constant && i == nargs - 1)
29115 {
29116 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
29117 {
29118 enum insn_code new_icode = icode;
29119 switch (icode)
29120 {
29121 case CODE_FOR_xop_vpermil2v2df3:
29122 case CODE_FOR_xop_vpermil2v4sf3:
29123 case CODE_FOR_xop_vpermil2v4df3:
29124 case CODE_FOR_xop_vpermil2v8sf3:
29125 error ("the last argument must be a 2-bit immediate");
29126 return gen_reg_rtx (tmode);
29127 case CODE_FOR_xop_rotlv2di3:
29128 new_icode = CODE_FOR_rotlv2di3;
29129 goto xop_rotl;
29130 case CODE_FOR_xop_rotlv4si3:
29131 new_icode = CODE_FOR_rotlv4si3;
29132 goto xop_rotl;
29133 case CODE_FOR_xop_rotlv8hi3:
29134 new_icode = CODE_FOR_rotlv8hi3;
29135 goto xop_rotl;
29136 case CODE_FOR_xop_rotlv16qi3:
29137 new_icode = CODE_FOR_rotlv16qi3;
29138 xop_rotl:
29139 if (CONST_INT_P (op))
29140 {
29141 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
29142 op = GEN_INT (INTVAL (op) & mask);
29143 gcc_checking_assert
29144 (insn_data[icode].operand[i + 1].predicate (op, mode));
29145 }
29146 else
29147 {
29148 gcc_checking_assert
29149 (nargs == 2
29150 && insn_data[new_icode].operand[0].mode == tmode
29151 && insn_data[new_icode].operand[1].mode == tmode
29152 && insn_data[new_icode].operand[2].mode == mode
29153 && insn_data[new_icode].operand[0].predicate
29154 == insn_data[icode].operand[0].predicate
29155 && insn_data[new_icode].operand[1].predicate
29156 == insn_data[icode].operand[1].predicate);
29157 icode = new_icode;
29158 goto non_constant;
29159 }
29160 break;
29161 default:
29162 gcc_unreachable ();
29163 }
29164 }
29165 }
29166 else
29167 {
29168 non_constant:
29169 if (VECTOR_MODE_P (mode))
29170 op = safe_vector_operand (op, mode);
29171
29172 /* If we aren't optimizing, only allow one memory operand to be
29173 generated. */
29174 if (memory_operand (op, mode))
29175 num_memory++;
29176
29177 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
29178
29179 if (optimize
29180 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
29181 || num_memory > 1)
29182 op = force_reg (mode, op);
29183 }
29184
29185 args[i].op = op;
29186 args[i].mode = mode;
29187 }
29188
29189 switch (nargs)
29190 {
29191 case 1:
29192 pat = GEN_FCN (icode) (target, args[0].op);
29193 break;
29194
29195 case 2:
29196 if (tf_p)
29197 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
29198 GEN_INT ((int)sub_code));
29199 else if (! comparison_p)
29200 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29201 else
29202 {
29203 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
29204 args[0].op,
29205 args[1].op);
29206
29207 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
29208 }
29209 break;
29210
29211 case 3:
29212 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29213 break;
29214
29215 case 4:
29216 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
29217 break;
29218
29219 default:
29220 gcc_unreachable ();
29221 }
29222
29223 if (! pat)
29224 return 0;
29225
29226 emit_insn (pat);
29227 return target;
29228 }
29229
29230 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
29231 insns with vec_merge. */
29232
29233 static rtx
29234 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
29235 rtx target)
29236 {
29237 rtx pat;
29238 tree arg0 = CALL_EXPR_ARG (exp, 0);
29239 rtx op1, op0 = expand_normal (arg0);
29240 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29241 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29242
29243 if (optimize || !target
29244 || GET_MODE (target) != tmode
29245 || !insn_data[icode].operand[0].predicate (target, tmode))
29246 target = gen_reg_rtx (tmode);
29247
29248 if (VECTOR_MODE_P (mode0))
29249 op0 = safe_vector_operand (op0, mode0);
29250
29251 if ((optimize && !register_operand (op0, mode0))
29252 || !insn_data[icode].operand[1].predicate (op0, mode0))
29253 op0 = copy_to_mode_reg (mode0, op0);
29254
29255 op1 = op0;
29256 if (!insn_data[icode].operand[2].predicate (op1, mode0))
29257 op1 = copy_to_mode_reg (mode0, op1);
29258
29259 pat = GEN_FCN (icode) (target, op0, op1);
29260 if (! pat)
29261 return 0;
29262 emit_insn (pat);
29263 return target;
29264 }
29265
29266 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
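/* SWAP handles predicates the hardware cannot encode directly: such a
builtin is listed with the reversed rtx code in D->COMPARISON and its
operands are exchanged here, so e.g. a greater-than builtin like
__builtin_ia32_cmpgtps ends up, roughly, as a cmpltps pattern applied
to (b, a).  The exact pairing comes from the builtin_description
tables.  */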
29267
29268 static rtx
29269 ix86_expand_sse_compare (const struct builtin_description *d,
29270 tree exp, rtx target, bool swap)
29271 {
29272 rtx pat;
29273 tree arg0 = CALL_EXPR_ARG (exp, 0);
29274 tree arg1 = CALL_EXPR_ARG (exp, 1);
29275 rtx op0 = expand_normal (arg0);
29276 rtx op1 = expand_normal (arg1);
29277 rtx op2;
29278 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29279 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29280 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29281 enum rtx_code comparison = d->comparison;
29282
29283 if (VECTOR_MODE_P (mode0))
29284 op0 = safe_vector_operand (op0, mode0);
29285 if (VECTOR_MODE_P (mode1))
29286 op1 = safe_vector_operand (op1, mode1);
29287
29288 /* Swap operands if we have a comparison that isn't available in
29289 hardware. */
29290 if (swap)
29291 {
29292 rtx tmp = gen_reg_rtx (mode1);
29293 emit_move_insn (tmp, op1);
29294 op1 = op0;
29295 op0 = tmp;
29296 }
29297
29298 if (optimize || !target
29299 || GET_MODE (target) != tmode
29300 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29301 target = gen_reg_rtx (tmode);
29302
29303 if ((optimize && !register_operand (op0, mode0))
29304 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
29305 op0 = copy_to_mode_reg (mode0, op0);
29306 if ((optimize && !register_operand (op1, mode1))
29307 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
29308 op1 = copy_to_mode_reg (mode1, op1);
29309
29310 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
29311 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29312 if (! pat)
29313 return 0;
29314 emit_insn (pat);
29315 return target;
29316 }
29317
29318 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
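/* The comi patterns compare two scalars and only set the flags.  The
code below then turns the flag condition in D->COMPARISON into an
integer result by writing it into the low byte of a zeroed SImode
pseudo, so a wrapper such as _mm_comieq_ss ultimately yields 0 or 1 in
an integer register.  */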
29319
29320 static rtx
29321 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
29322 rtx target)
29323 {
29324 rtx pat;
29325 tree arg0 = CALL_EXPR_ARG (exp, 0);
29326 tree arg1 = CALL_EXPR_ARG (exp, 1);
29327 rtx op0 = expand_normal (arg0);
29328 rtx op1 = expand_normal (arg1);
29329 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29330 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29331 enum rtx_code comparison = d->comparison;
29332
29333 if (VECTOR_MODE_P (mode0))
29334 op0 = safe_vector_operand (op0, mode0);
29335 if (VECTOR_MODE_P (mode1))
29336 op1 = safe_vector_operand (op1, mode1);
29337
29338 /* Swap operands if we have a comparison that isn't available in
29339 hardware. */
29340 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
29341 {
29342 rtx tmp = op1;
29343 op1 = op0;
29344 op0 = tmp;
29345 }
29346
29347 target = gen_reg_rtx (SImode);
29348 emit_move_insn (target, const0_rtx);
29349 target = gen_rtx_SUBREG (QImode, target, 0);
29350
29351 if ((optimize && !register_operand (op0, mode0))
29352 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29353 op0 = copy_to_mode_reg (mode0, op0);
29354 if ((optimize && !register_operand (op1, mode1))
29355 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29356 op1 = copy_to_mode_reg (mode1, op1);
29357
29358 pat = GEN_FCN (d->icode) (op0, op1);
29359 if (! pat)
29360 return 0;
29361 emit_insn (pat);
29362 emit_insn (gen_rtx_SET (VOIDmode,
29363 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29364 gen_rtx_fmt_ee (comparison, QImode,
29365 SET_DEST (pat),
29366 const0_rtx)));
29367
29368 return SUBREG_REG (target);
29369 }
29370
29371 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
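/* For the round builtins D->COMPARISON does not hold an rtx code at
all: it carries the rounding-control immediate that becomes the last
operand of the pattern (see the GEN_INT (d->comparison) below), so a
floor- or ceil-style builtin simply passes its ROUND_* constant
through this field.  */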
29372
29373 static rtx
29374 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
29375 rtx target)
29376 {
29377 rtx pat;
29378 tree arg0 = CALL_EXPR_ARG (exp, 0);
29379 rtx op1, op0 = expand_normal (arg0);
29380 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29381 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29382
29383 if (optimize || target == 0
29384 || GET_MODE (target) != tmode
29385 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29386 target = gen_reg_rtx (tmode);
29387
29388 if (VECTOR_MODE_P (mode0))
29389 op0 = safe_vector_operand (op0, mode0);
29390
29391 if ((optimize && !register_operand (op0, mode0))
29392 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29393 op0 = copy_to_mode_reg (mode0, op0);
29394
29395 op1 = GEN_INT (d->comparison);
29396
29397 pat = GEN_FCN (d->icode) (target, op0, op1);
29398 if (! pat)
29399 return 0;
29400 emit_insn (pat);
29401 return target;
29402 }
29403
29404 static rtx
29405 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
29406 tree exp, rtx target)
29407 {
29408 rtx pat;
29409 tree arg0 = CALL_EXPR_ARG (exp, 0);
29410 tree arg1 = CALL_EXPR_ARG (exp, 1);
29411 rtx op0 = expand_normal (arg0);
29412 rtx op1 = expand_normal (arg1);
29413 rtx op2;
29414 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29415 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29416 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29417
29418 if (optimize || target == 0
29419 || GET_MODE (target) != tmode
29420 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29421 target = gen_reg_rtx (tmode);
29422
29423 op0 = safe_vector_operand (op0, mode0);
29424 op1 = safe_vector_operand (op1, mode1);
29425
29426 if ((optimize && !register_operand (op0, mode0))
29427 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29428 op0 = copy_to_mode_reg (mode0, op0);
29429 if ((optimize && !register_operand (op1, mode1))
29430 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29431 op1 = copy_to_mode_reg (mode1, op1);
29432
29433 op2 = GEN_INT (d->comparison);
29434
29435 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29436 if (! pat)
29437 return 0;
29438 emit_insn (pat);
29439 return target;
29440 }
29441
29442 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
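/* PTEST-style patterns only set the flags: ZF when OP0 & OP1 is all
zeroes, CF when OP0 & ~OP1 is.  The expansion emits the test and then
copies the flag selected by D->COMPARISON into the low byte of a
zeroed SImode pseudo, so e.g. _mm_testz_si128 (a, b) returns 1 exactly
when (a & b) == 0.  */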
29443
29444 static rtx
29445 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
29446 rtx target)
29447 {
29448 rtx pat;
29449 tree arg0 = CALL_EXPR_ARG (exp, 0);
29450 tree arg1 = CALL_EXPR_ARG (exp, 1);
29451 rtx op0 = expand_normal (arg0);
29452 rtx op1 = expand_normal (arg1);
29453 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29454 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29455 enum rtx_code comparison = d->comparison;
29456
29457 if (VECTOR_MODE_P (mode0))
29458 op0 = safe_vector_operand (op0, mode0);
29459 if (VECTOR_MODE_P (mode1))
29460 op1 = safe_vector_operand (op1, mode1);
29461
29462 target = gen_reg_rtx (SImode);
29463 emit_move_insn (target, const0_rtx);
29464 target = gen_rtx_SUBREG (QImode, target, 0);
29465
29466 if ((optimize && !register_operand (op0, mode0))
29467 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29468 op0 = copy_to_mode_reg (mode0, op0);
29469 if ((optimize && !register_operand (op1, mode1))
29470 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29471 op1 = copy_to_mode_reg (mode1, op1);
29472
29473 pat = GEN_FCN (d->icode) (op0, op1);
29474 if (! pat)
29475 return 0;
29476 emit_insn (pat);
29477 emit_insn (gen_rtx_SET (VOIDmode,
29478 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29479 gen_rtx_fmt_ee (comparison, QImode,
29480 SET_DEST (pat),
29481 const0_rtx)));
29482
29483 return SUBREG_REG (target);
29484 }
29485
29486 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
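/* The explicit-length string-compare builtins take five arguments:
two 128-bit strings, their two lengths, and the imm8 control byte.
PCMPESTRI produces an index (pattern operand 0), PCMPESTRM a mask
(pattern operand 1), and the remaining variants only test the EFLAGS
bit recorded in D->FLAG.  E.g. the wrapper
_mm_cmpestri (a, la, b, lb, imm8) is, roughly, a direct call of the
IX86_BUILTIN_PCMPESTRI128 case handled here.  */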
29487
29488 static rtx
29489 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
29490 tree exp, rtx target)
29491 {
29492 rtx pat;
29493 tree arg0 = CALL_EXPR_ARG (exp, 0);
29494 tree arg1 = CALL_EXPR_ARG (exp, 1);
29495 tree arg2 = CALL_EXPR_ARG (exp, 2);
29496 tree arg3 = CALL_EXPR_ARG (exp, 3);
29497 tree arg4 = CALL_EXPR_ARG (exp, 4);
29498 rtx scratch0, scratch1;
29499 rtx op0 = expand_normal (arg0);
29500 rtx op1 = expand_normal (arg1);
29501 rtx op2 = expand_normal (arg2);
29502 rtx op3 = expand_normal (arg3);
29503 rtx op4 = expand_normal (arg4);
29504 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
29505
29506 tmode0 = insn_data[d->icode].operand[0].mode;
29507 tmode1 = insn_data[d->icode].operand[1].mode;
29508 modev2 = insn_data[d->icode].operand[2].mode;
29509 modei3 = insn_data[d->icode].operand[3].mode;
29510 modev4 = insn_data[d->icode].operand[4].mode;
29511 modei5 = insn_data[d->icode].operand[5].mode;
29512 modeimm = insn_data[d->icode].operand[6].mode;
29513
29514 if (VECTOR_MODE_P (modev2))
29515 op0 = safe_vector_operand (op0, modev2);
29516 if (VECTOR_MODE_P (modev4))
29517 op2 = safe_vector_operand (op2, modev4);
29518
29519 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29520 op0 = copy_to_mode_reg (modev2, op0);
29521 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
29522 op1 = copy_to_mode_reg (modei3, op1);
29523 if ((optimize && !register_operand (op2, modev4))
29524 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
29525 op2 = copy_to_mode_reg (modev4, op2);
29526 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
29527 op3 = copy_to_mode_reg (modei5, op3);
29528
29529 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
29530 {
29531 error ("the fifth argument must be an 8-bit immediate");
29532 return const0_rtx;
29533 }
29534
29535 if (d->code == IX86_BUILTIN_PCMPESTRI128)
29536 {
29537 if (optimize || !target
29538 || GET_MODE (target) != tmode0
29539 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29540 target = gen_reg_rtx (tmode0);
29541
29542 scratch1 = gen_reg_rtx (tmode1);
29543
29544 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
29545 }
29546 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
29547 {
29548 if (optimize || !target
29549 || GET_MODE (target) != tmode1
29550 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29551 target = gen_reg_rtx (tmode1);
29552
29553 scratch0 = gen_reg_rtx (tmode0);
29554
29555 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
29556 }
29557 else
29558 {
29559 gcc_assert (d->flag);
29560
29561 scratch0 = gen_reg_rtx (tmode0);
29562 scratch1 = gen_reg_rtx (tmode1);
29563
29564 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
29565 }
29566
29567 if (! pat)
29568 return 0;
29569
29570 emit_insn (pat);
29571
29572 if (d->flag)
29573 {
29574 target = gen_reg_rtx (SImode);
29575 emit_move_insn (target, const0_rtx);
29576 target = gen_rtx_SUBREG (QImode, target, 0);
29577
29578 emit_insn
29579 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29580 gen_rtx_fmt_ee (EQ, QImode,
29581 gen_rtx_REG ((enum machine_mode) d->flag,
29582 FLAGS_REG),
29583 const0_rtx)));
29584 return SUBREG_REG (target);
29585 }
29586 else
29587 return target;
29588 }
29589
29590
29591 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
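/* Same scheme as the pcmpestr expander above, but for the
implicit-length forms, which take only the two strings and the imm8
control byte; the string lengths are determined by a terminating zero
element within the operands themselves.  */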
29592
29593 static rtx
29594 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
29595 tree exp, rtx target)
29596 {
29597 rtx pat;
29598 tree arg0 = CALL_EXPR_ARG (exp, 0);
29599 tree arg1 = CALL_EXPR_ARG (exp, 1);
29600 tree arg2 = CALL_EXPR_ARG (exp, 2);
29601 rtx scratch0, scratch1;
29602 rtx op0 = expand_normal (arg0);
29603 rtx op1 = expand_normal (arg1);
29604 rtx op2 = expand_normal (arg2);
29605 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
29606
29607 tmode0 = insn_data[d->icode].operand[0].mode;
29608 tmode1 = insn_data[d->icode].operand[1].mode;
29609 modev2 = insn_data[d->icode].operand[2].mode;
29610 modev3 = insn_data[d->icode].operand[3].mode;
29611 modeimm = insn_data[d->icode].operand[4].mode;
29612
29613 if (VECTOR_MODE_P (modev2))
29614 op0 = safe_vector_operand (op0, modev2);
29615 if (VECTOR_MODE_P (modev3))
29616 op1 = safe_vector_operand (op1, modev3);
29617
29618 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29619 op0 = copy_to_mode_reg (modev2, op0);
29620 if ((optimize && !register_operand (op1, modev3))
29621 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
29622 op1 = copy_to_mode_reg (modev3, op1);
29623
29624 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
29625 {
29626 error ("the third argument must be an 8-bit immediate");
29627 return const0_rtx;
29628 }
29629
29630 if (d->code == IX86_BUILTIN_PCMPISTRI128)
29631 {
29632 if (optimize || !target
29633 || GET_MODE (target) != tmode0
29634 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29635 target = gen_reg_rtx (tmode0);
29636
29637 scratch1 = gen_reg_rtx (tmode1);
29638
29639 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
29640 }
29641 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
29642 {
29643 if (optimize || !target
29644 || GET_MODE (target) != tmode1
29645 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29646 target = gen_reg_rtx (tmode1);
29647
29648 scratch0 = gen_reg_rtx (tmode0);
29649
29650 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
29651 }
29652 else
29653 {
29654 gcc_assert (d->flag);
29655
29656 scratch0 = gen_reg_rtx (tmode0);
29657 scratch1 = gen_reg_rtx (tmode1);
29658
29659 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
29660 }
29661
29662 if (! pat)
29663 return 0;
29664
29665 emit_insn (pat);
29666
29667 if (d->flag)
29668 {
29669 target = gen_reg_rtx (SImode);
29670 emit_move_insn (target, const0_rtx);
29671 target = gen_rtx_SUBREG (QImode, target, 0);
29672
29673 emit_insn
29674 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29675 gen_rtx_fmt_ee (EQ, QImode,
29676 gen_rtx_REG ((enum machine_mode) d->flag,
29677 FLAGS_REG),
29678 const0_rtx)));
29679 return SUBREG_REG (target);
29680 }
29681 else
29682 return target;
29683 }
29684
29685 /* Subroutine of ix86_expand_builtin to take care of insns with
29686 variable number of operands. */
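/* The ix86_builtin_func_type stored in D->FLAG drives this routine:
it fixes NARGS, how many trailing arguments must be immediates
(NARGS_CONSTANT), whether the last argument is a shift count
(LAST_ARG_COUNT), and, for the *_CONVERT types, an RMODE differing
from the pattern's destination mode, in which case the insn writes
into a subreg of an RMODE pseudo.  The switch below only classifies;
the loop that follows legitimizes the operands.  */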
29687
29688 static rtx
29689 ix86_expand_args_builtin (const struct builtin_description *d,
29690 tree exp, rtx target)
29691 {
29692 rtx pat, real_target;
29693 unsigned int i, nargs;
29694 unsigned int nargs_constant = 0;
29695 int num_memory = 0;
29696 struct
29697 {
29698 rtx op;
29699 enum machine_mode mode;
29700 } args[4];
29701 bool last_arg_count = false;
29702 enum insn_code icode = d->icode;
29703 const struct insn_data_d *insn_p = &insn_data[icode];
29704 enum machine_mode tmode = insn_p->operand[0].mode;
29705 enum machine_mode rmode = VOIDmode;
29706 bool swap = false;
29707 enum rtx_code comparison = d->comparison;
29708
29709 switch ((enum ix86_builtin_func_type) d->flag)
29710 {
29711 case V2DF_FTYPE_V2DF_ROUND:
29712 case V4DF_FTYPE_V4DF_ROUND:
29713 case V4SF_FTYPE_V4SF_ROUND:
29714 case V8SF_FTYPE_V8SF_ROUND:
29715 case V4SI_FTYPE_V4SF_ROUND:
29716 case V8SI_FTYPE_V8SF_ROUND:
29717 return ix86_expand_sse_round (d, exp, target);
29718 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29719 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29720 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29721 case INT_FTYPE_V8SF_V8SF_PTEST:
29722 case INT_FTYPE_V4DI_V4DI_PTEST:
29723 case INT_FTYPE_V4DF_V4DF_PTEST:
29724 case INT_FTYPE_V4SF_V4SF_PTEST:
29725 case INT_FTYPE_V2DI_V2DI_PTEST:
29726 case INT_FTYPE_V2DF_V2DF_PTEST:
29727 return ix86_expand_sse_ptest (d, exp, target);
29728 case FLOAT128_FTYPE_FLOAT128:
29729 case FLOAT_FTYPE_FLOAT:
29730 case INT_FTYPE_INT:
29731 case UINT64_FTYPE_INT:
29732 case UINT16_FTYPE_UINT16:
29733 case INT64_FTYPE_INT64:
29734 case INT64_FTYPE_V4SF:
29735 case INT64_FTYPE_V2DF:
29736 case INT_FTYPE_V16QI:
29737 case INT_FTYPE_V8QI:
29738 case INT_FTYPE_V8SF:
29739 case INT_FTYPE_V4DF:
29740 case INT_FTYPE_V4SF:
29741 case INT_FTYPE_V2DF:
29742 case INT_FTYPE_V32QI:
29743 case V16QI_FTYPE_V16QI:
29744 case V8SI_FTYPE_V8SF:
29745 case V8SI_FTYPE_V4SI:
29746 case V8HI_FTYPE_V8HI:
29747 case V8HI_FTYPE_V16QI:
29748 case V8QI_FTYPE_V8QI:
29749 case V8SF_FTYPE_V8SF:
29750 case V8SF_FTYPE_V8SI:
29751 case V8SF_FTYPE_V4SF:
29752 case V8SF_FTYPE_V8HI:
29753 case V4SI_FTYPE_V4SI:
29754 case V4SI_FTYPE_V16QI:
29755 case V4SI_FTYPE_V4SF:
29756 case V4SI_FTYPE_V8SI:
29757 case V4SI_FTYPE_V8HI:
29758 case V4SI_FTYPE_V4DF:
29759 case V4SI_FTYPE_V2DF:
29760 case V4HI_FTYPE_V4HI:
29761 case V4DF_FTYPE_V4DF:
29762 case V4DF_FTYPE_V4SI:
29763 case V4DF_FTYPE_V4SF:
29764 case V4DF_FTYPE_V2DF:
29765 case V4SF_FTYPE_V4SF:
29766 case V4SF_FTYPE_V4SI:
29767 case V4SF_FTYPE_V8SF:
29768 case V4SF_FTYPE_V4DF:
29769 case V4SF_FTYPE_V8HI:
29770 case V4SF_FTYPE_V2DF:
29771 case V2DI_FTYPE_V2DI:
29772 case V2DI_FTYPE_V16QI:
29773 case V2DI_FTYPE_V8HI:
29774 case V2DI_FTYPE_V4SI:
29775 case V2DF_FTYPE_V2DF:
29776 case V2DF_FTYPE_V4SI:
29777 case V2DF_FTYPE_V4DF:
29778 case V2DF_FTYPE_V4SF:
29779 case V2DF_FTYPE_V2SI:
29780 case V2SI_FTYPE_V2SI:
29781 case V2SI_FTYPE_V4SF:
29782 case V2SI_FTYPE_V2SF:
29783 case V2SI_FTYPE_V2DF:
29784 case V2SF_FTYPE_V2SF:
29785 case V2SF_FTYPE_V2SI:
29786 case V32QI_FTYPE_V32QI:
29787 case V32QI_FTYPE_V16QI:
29788 case V16HI_FTYPE_V16HI:
29789 case V16HI_FTYPE_V8HI:
29790 case V8SI_FTYPE_V8SI:
29791 case V16HI_FTYPE_V16QI:
29792 case V8SI_FTYPE_V16QI:
29793 case V4DI_FTYPE_V16QI:
29794 case V8SI_FTYPE_V8HI:
29795 case V4DI_FTYPE_V8HI:
29796 case V4DI_FTYPE_V4SI:
29797 case V4DI_FTYPE_V2DI:
29798 nargs = 1;
29799 break;
29800 case V4SF_FTYPE_V4SF_VEC_MERGE:
29801 case V2DF_FTYPE_V2DF_VEC_MERGE:
29802 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29803 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29804 case V16QI_FTYPE_V16QI_V16QI:
29805 case V16QI_FTYPE_V8HI_V8HI:
29806 case V8QI_FTYPE_V8QI_V8QI:
29807 case V8QI_FTYPE_V4HI_V4HI:
29808 case V8HI_FTYPE_V8HI_V8HI:
29809 case V8HI_FTYPE_V16QI_V16QI:
29810 case V8HI_FTYPE_V4SI_V4SI:
29811 case V8SF_FTYPE_V8SF_V8SF:
29812 case V8SF_FTYPE_V8SF_V8SI:
29813 case V4SI_FTYPE_V4SI_V4SI:
29814 case V4SI_FTYPE_V8HI_V8HI:
29815 case V4SI_FTYPE_V4SF_V4SF:
29816 case V4SI_FTYPE_V2DF_V2DF:
29817 case V4HI_FTYPE_V4HI_V4HI:
29818 case V4HI_FTYPE_V8QI_V8QI:
29819 case V4HI_FTYPE_V2SI_V2SI:
29820 case V4DF_FTYPE_V4DF_V4DF:
29821 case V4DF_FTYPE_V4DF_V4DI:
29822 case V4SF_FTYPE_V4SF_V4SF:
29823 case V4SF_FTYPE_V4SF_V4SI:
29824 case V4SF_FTYPE_V4SF_V2SI:
29825 case V4SF_FTYPE_V4SF_V2DF:
29826 case V4SF_FTYPE_V4SF_DI:
29827 case V4SF_FTYPE_V4SF_SI:
29828 case V2DI_FTYPE_V2DI_V2DI:
29829 case V2DI_FTYPE_V16QI_V16QI:
29830 case V2DI_FTYPE_V4SI_V4SI:
29831 case V2UDI_FTYPE_V4USI_V4USI:
29832 case V2DI_FTYPE_V2DI_V16QI:
29833 case V2DI_FTYPE_V2DF_V2DF:
29834 case V2SI_FTYPE_V2SI_V2SI:
29835 case V2SI_FTYPE_V4HI_V4HI:
29836 case V2SI_FTYPE_V2SF_V2SF:
29837 case V2DF_FTYPE_V2DF_V2DF:
29838 case V2DF_FTYPE_V2DF_V4SF:
29839 case V2DF_FTYPE_V2DF_V2DI:
29840 case V2DF_FTYPE_V2DF_DI:
29841 case V2DF_FTYPE_V2DF_SI:
29842 case V2SF_FTYPE_V2SF_V2SF:
29843 case V1DI_FTYPE_V1DI_V1DI:
29844 case V1DI_FTYPE_V8QI_V8QI:
29845 case V1DI_FTYPE_V2SI_V2SI:
29846 case V32QI_FTYPE_V16HI_V16HI:
29847 case V16HI_FTYPE_V8SI_V8SI:
29848 case V32QI_FTYPE_V32QI_V32QI:
29849 case V16HI_FTYPE_V32QI_V32QI:
29850 case V16HI_FTYPE_V16HI_V16HI:
29851 case V8SI_FTYPE_V4DF_V4DF:
29852 case V8SI_FTYPE_V8SI_V8SI:
29853 case V8SI_FTYPE_V16HI_V16HI:
29854 case V4DI_FTYPE_V4DI_V4DI:
29855 case V4DI_FTYPE_V8SI_V8SI:
29856 case V4UDI_FTYPE_V8USI_V8USI:
29857 if (comparison == UNKNOWN)
29858 return ix86_expand_binop_builtin (icode, exp, target);
29859 nargs = 2;
29860 break;
29861 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29862 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29863 gcc_assert (comparison != UNKNOWN);
29864 nargs = 2;
29865 swap = true;
29866 break;
29867 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29868 case V16HI_FTYPE_V16HI_SI_COUNT:
29869 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29870 case V8SI_FTYPE_V8SI_SI_COUNT:
29871 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29872 case V4DI_FTYPE_V4DI_INT_COUNT:
29873 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29874 case V8HI_FTYPE_V8HI_SI_COUNT:
29875 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29876 case V4SI_FTYPE_V4SI_SI_COUNT:
29877 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29878 case V4HI_FTYPE_V4HI_SI_COUNT:
29879 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29880 case V2DI_FTYPE_V2DI_SI_COUNT:
29881 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29882 case V2SI_FTYPE_V2SI_SI_COUNT:
29883 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29884 case V1DI_FTYPE_V1DI_SI_COUNT:
29885 nargs = 2;
29886 last_arg_count = true;
29887 break;
29888 case UINT64_FTYPE_UINT64_UINT64:
29889 case UINT_FTYPE_UINT_UINT:
29890 case UINT_FTYPE_UINT_USHORT:
29891 case UINT_FTYPE_UINT_UCHAR:
29892 case UINT16_FTYPE_UINT16_INT:
29893 case UINT8_FTYPE_UINT8_INT:
29894 nargs = 2;
29895 break;
29896 case V2DI_FTYPE_V2DI_INT_CONVERT:
29897 nargs = 2;
29898 rmode = V1TImode;
29899 nargs_constant = 1;
29900 break;
29901 case V4DI_FTYPE_V4DI_INT_CONVERT:
29902 nargs = 2;
29903 rmode = V2TImode;
29904 nargs_constant = 1;
29905 break;
29906 case V8HI_FTYPE_V8HI_INT:
29907 case V8HI_FTYPE_V8SF_INT:
29908 case V8HI_FTYPE_V4SF_INT:
29909 case V8SF_FTYPE_V8SF_INT:
29910 case V4SI_FTYPE_V4SI_INT:
29911 case V4SI_FTYPE_V8SI_INT:
29912 case V4HI_FTYPE_V4HI_INT:
29913 case V4DF_FTYPE_V4DF_INT:
29914 case V4SF_FTYPE_V4SF_INT:
29915 case V4SF_FTYPE_V8SF_INT:
29916 case V2DI_FTYPE_V2DI_INT:
29917 case V2DF_FTYPE_V2DF_INT:
29918 case V2DF_FTYPE_V4DF_INT:
29919 case V16HI_FTYPE_V16HI_INT:
29920 case V8SI_FTYPE_V8SI_INT:
29921 case V4DI_FTYPE_V4DI_INT:
29922 case V2DI_FTYPE_V4DI_INT:
29923 nargs = 2;
29924 nargs_constant = 1;
29925 break;
29926 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29927 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29928 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29929 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29930 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29931 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29932 nargs = 3;
29933 break;
29934 case V32QI_FTYPE_V32QI_V32QI_INT:
29935 case V16HI_FTYPE_V16HI_V16HI_INT:
29936 case V16QI_FTYPE_V16QI_V16QI_INT:
29937 case V4DI_FTYPE_V4DI_V4DI_INT:
29938 case V8HI_FTYPE_V8HI_V8HI_INT:
29939 case V8SI_FTYPE_V8SI_V8SI_INT:
29940 case V8SI_FTYPE_V8SI_V4SI_INT:
29941 case V8SF_FTYPE_V8SF_V8SF_INT:
29942 case V8SF_FTYPE_V8SF_V4SF_INT:
29943 case V4SI_FTYPE_V4SI_V4SI_INT:
29944 case V4DF_FTYPE_V4DF_V4DF_INT:
29945 case V4DF_FTYPE_V4DF_V2DF_INT:
29946 case V4SF_FTYPE_V4SF_V4SF_INT:
29947 case V2DI_FTYPE_V2DI_V2DI_INT:
29948 case V4DI_FTYPE_V4DI_V2DI_INT:
29949 case V2DF_FTYPE_V2DF_V2DF_INT:
29950 nargs = 3;
29951 nargs_constant = 1;
29952 break;
29953 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29954 nargs = 3;
29955 rmode = V4DImode;
29956 nargs_constant = 1;
29957 break;
29958 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29959 nargs = 3;
29960 rmode = V2DImode;
29961 nargs_constant = 1;
29962 break;
29963 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29964 nargs = 3;
29965 rmode = DImode;
29966 nargs_constant = 1;
29967 break;
29968 case V2DI_FTYPE_V2DI_UINT_UINT:
29969 nargs = 3;
29970 nargs_constant = 2;
29971 break;
29972 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29973 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29974 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29975 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29976 nargs = 4;
29977 nargs_constant = 1;
29978 break;
29979 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29980 nargs = 4;
29981 nargs_constant = 2;
29982 break;
29983 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
29984 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
29985 nargs = 4;
29986 break;
29987 default:
29988 gcc_unreachable ();
29989 }
29990
29991 gcc_assert (nargs <= ARRAY_SIZE (args));
29992
29993 if (comparison != UNKNOWN)
29994 {
29995 gcc_assert (nargs == 2);
29996 return ix86_expand_sse_compare (d, exp, target, swap);
29997 }
29998
29999 if (rmode == VOIDmode || rmode == tmode)
30000 {
30001 if (optimize
30002 || target == 0
30003 || GET_MODE (target) != tmode
30004 || !insn_p->operand[0].predicate (target, tmode))
30005 target = gen_reg_rtx (tmode);
30006 real_target = target;
30007 }
30008 else
30009 {
30010 target = gen_reg_rtx (rmode);
30011 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30012 }
30013
30014 for (i = 0; i < nargs; i++)
30015 {
30016 tree arg = CALL_EXPR_ARG (exp, i);
30017 rtx op = expand_normal (arg);
30018 enum machine_mode mode = insn_p->operand[i + 1].mode;
30019 bool match = insn_p->operand[i + 1].predicate (op, mode);
30020
30021 if (last_arg_count && (i + 1) == nargs)
30022 {
30023 /* SIMD shift insns take either an 8-bit immediate or a register
30024 as the count, but the builtin functions take an int.  If the
30025 count doesn't match the predicate, we put it in a register. */
30026 if (!match)
30027 {
30028 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
30029 if (!insn_p->operand[i + 1].predicate (op, mode))
30030 op = copy_to_reg (op);
30031 }
30032 }
30033 else if ((nargs - i) <= nargs_constant)
30034 {
30035 if (!match)
30036 switch (icode)
30037 {
30038 case CODE_FOR_avx2_inserti128:
30039 case CODE_FOR_avx2_extracti128:
30040 error ("the last argument must be a 1-bit immediate");
30041 return const0_rtx;
30042
30043 case CODE_FOR_sse4_1_roundsd:
30044 case CODE_FOR_sse4_1_roundss:
30045
30046 case CODE_FOR_sse4_1_roundpd:
30047 case CODE_FOR_sse4_1_roundps:
30048 case CODE_FOR_avx_roundpd256:
30049 case CODE_FOR_avx_roundps256:
30050
30051 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
30052 case CODE_FOR_sse4_1_roundps_sfix:
30053 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
30054 case CODE_FOR_avx_roundps_sfix256:
30055
30056 case CODE_FOR_sse4_1_blendps:
30057 case CODE_FOR_avx_blendpd256:
30058 case CODE_FOR_avx_vpermilv4df:
30059 error ("the last argument must be a 4-bit immediate");
30060 return const0_rtx;
30061
30062 case CODE_FOR_sse4_1_blendpd:
30063 case CODE_FOR_avx_vpermilv2df:
30064 case CODE_FOR_xop_vpermil2v2df3:
30065 case CODE_FOR_xop_vpermil2v4sf3:
30066 case CODE_FOR_xop_vpermil2v4df3:
30067 case CODE_FOR_xop_vpermil2v8sf3:
30068 error ("the last argument must be a 2-bit immediate");
30069 return const0_rtx;
30070
30071 case CODE_FOR_avx_vextractf128v4df:
30072 case CODE_FOR_avx_vextractf128v8sf:
30073 case CODE_FOR_avx_vextractf128v8si:
30074 case CODE_FOR_avx_vinsertf128v4df:
30075 case CODE_FOR_avx_vinsertf128v8sf:
30076 case CODE_FOR_avx_vinsertf128v8si:
30077 error ("the last argument must be a 1-bit immediate");
30078 return const0_rtx;
30079
30080 case CODE_FOR_avx_vmcmpv2df3:
30081 case CODE_FOR_avx_vmcmpv4sf3:
30082 case CODE_FOR_avx_cmpv2df3:
30083 case CODE_FOR_avx_cmpv4sf3:
30084 case CODE_FOR_avx_cmpv4df3:
30085 case CODE_FOR_avx_cmpv8sf3:
30086 error ("the last argument must be a 5-bit immediate");
30087 return const0_rtx;
30088
30089 default:
30090 switch (nargs_constant)
30091 {
30092 case 2:
30093 if ((nargs - i) == nargs_constant)
30094 {
30095 error ("the next to last argument must be an 8-bit immediate");
30096 break;
30097 }
/* FALLTHRU */
30098 case 1:
30099 error ("the last argument must be an 8-bit immediate");
30100 break;
30101 default:
30102 gcc_unreachable ();
30103 }
30104 return const0_rtx;
30105 }
30106 }
30107 else
30108 {
30109 if (VECTOR_MODE_P (mode))
30110 op = safe_vector_operand (op, mode);
30111
30112 /* If we aren't optimizing, only allow one memory operand to
30113 be generated. */
30114 if (memory_operand (op, mode))
30115 num_memory++;
30116
30117 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
30118 {
30119 if (optimize || !match || num_memory > 1)
30120 op = copy_to_mode_reg (mode, op);
30121 }
30122 else
30123 {
30124 op = copy_to_reg (op);
30125 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
30126 }
30127 }
30128
30129 args[i].op = op;
30130 args[i].mode = mode;
30131 }
30132
30133 switch (nargs)
30134 {
30135 case 1:
30136 pat = GEN_FCN (icode) (real_target, args[0].op);
30137 break;
30138 case 2:
30139 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
30140 break;
30141 case 3:
30142 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30143 args[2].op);
30144 break;
30145 case 4:
30146 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30147 args[2].op, args[3].op);
30148 break;
30149 default:
30150 gcc_unreachable ();
30151 }
30152
30153 if (! pat)
30154 return 0;
30155
30156 emit_insn (pat);
30157 return target;
30158 }
30159
30160 /* Subroutine of ix86_expand_builtin to take care of special insns
30161 with variable number of operands. */
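/* The "special" builtins are the ones with a memory side: KLASS says
whether the pattern is a load (operand 0 is the value read) or a store
(the first tree argument is the destination), and MEMORY is the index
of the argument that must be turned into a MEM built from a pointer
value.  E.g. a V2DF_FTYPE_PCDOUBLE builtin loads a vector through a
double * argument, while a VOID_FTYPE_PDOUBLE_V2DF builtin stores
one.  */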
30162
30163 static rtx
30164 ix86_expand_special_args_builtin (const struct builtin_description *d,
30165 tree exp, rtx target)
30166 {
30167 tree arg;
30168 rtx pat, op;
30169 unsigned int i, nargs, arg_adjust, memory;
30170 struct
30171 {
30172 rtx op;
30173 enum machine_mode mode;
30174 } args[3];
30175 enum insn_code icode = d->icode;
30176 bool last_arg_constant = false;
30177 const struct insn_data_d *insn_p = &insn_data[icode];
30178 enum machine_mode tmode = insn_p->operand[0].mode;
30179 enum { load, store } klass;
30180
30181 switch ((enum ix86_builtin_func_type) d->flag)
30182 {
30183 case VOID_FTYPE_VOID:
30184 if (icode == CODE_FOR_avx_vzeroupper)
30185 target = GEN_INT (vzeroupper_intrinsic);
30186 emit_insn (GEN_FCN (icode) (target));
30187 return 0;
30188 case VOID_FTYPE_UINT64:
30189 case VOID_FTYPE_UNSIGNED:
30190 nargs = 0;
30191 klass = store;
30192 memory = 0;
30193 break;
30194
30195 case INT_FTYPE_VOID:
30196 case UINT64_FTYPE_VOID:
30197 case UNSIGNED_FTYPE_VOID:
30198 nargs = 0;
30199 klass = load;
30200 memory = 0;
30201 break;
30202 case UINT64_FTYPE_PUNSIGNED:
30203 case V2DI_FTYPE_PV2DI:
30204 case V4DI_FTYPE_PV4DI:
30205 case V32QI_FTYPE_PCCHAR:
30206 case V16QI_FTYPE_PCCHAR:
30207 case V8SF_FTYPE_PCV4SF:
30208 case V8SF_FTYPE_PCFLOAT:
30209 case V4SF_FTYPE_PCFLOAT:
30210 case V4DF_FTYPE_PCV2DF:
30211 case V4DF_FTYPE_PCDOUBLE:
30212 case V2DF_FTYPE_PCDOUBLE:
30213 case VOID_FTYPE_PVOID:
30214 nargs = 1;
30215 klass = load;
30216 memory = 0;
30217 break;
30218 case VOID_FTYPE_PV2SF_V4SF:
30219 case VOID_FTYPE_PV4DI_V4DI:
30220 case VOID_FTYPE_PV2DI_V2DI:
30221 case VOID_FTYPE_PCHAR_V32QI:
30222 case VOID_FTYPE_PCHAR_V16QI:
30223 case VOID_FTYPE_PFLOAT_V8SF:
30224 case VOID_FTYPE_PFLOAT_V4SF:
30225 case VOID_FTYPE_PDOUBLE_V4DF:
30226 case VOID_FTYPE_PDOUBLE_V2DF:
30227 case VOID_FTYPE_PLONGLONG_LONGLONG:
30228 case VOID_FTYPE_PULONGLONG_ULONGLONG:
30229 case VOID_FTYPE_PINT_INT:
30230 nargs = 1;
30231 klass = store;
30232 /* Reserve memory operand for target. */
30233 memory = ARRAY_SIZE (args);
30234 break;
30235 case V4SF_FTYPE_V4SF_PCV2SF:
30236 case V2DF_FTYPE_V2DF_PCDOUBLE:
30237 nargs = 2;
30238 klass = load;
30239 memory = 1;
30240 break;
30241 case V8SF_FTYPE_PCV8SF_V8SI:
30242 case V4DF_FTYPE_PCV4DF_V4DI:
30243 case V4SF_FTYPE_PCV4SF_V4SI:
30244 case V2DF_FTYPE_PCV2DF_V2DI:
30245 case V8SI_FTYPE_PCV8SI_V8SI:
30246 case V4DI_FTYPE_PCV4DI_V4DI:
30247 case V4SI_FTYPE_PCV4SI_V4SI:
30248 case V2DI_FTYPE_PCV2DI_V2DI:
30249 nargs = 2;
30250 klass = load;
30251 memory = 0;
30252 break;
30253 case VOID_FTYPE_PV8SF_V8SI_V8SF:
30254 case VOID_FTYPE_PV4DF_V4DI_V4DF:
30255 case VOID_FTYPE_PV4SF_V4SI_V4SF:
30256 case VOID_FTYPE_PV2DF_V2DI_V2DF:
30257 case VOID_FTYPE_PV8SI_V8SI_V8SI:
30258 case VOID_FTYPE_PV4DI_V4DI_V4DI:
30259 case VOID_FTYPE_PV4SI_V4SI_V4SI:
30260 case VOID_FTYPE_PV2DI_V2DI_V2DI:
30261 nargs = 2;
30262 klass = store;
30263 /* Reserve memory operand for target. */
30264 memory = ARRAY_SIZE (args);
30265 break;
30266 case VOID_FTYPE_UINT_UINT_UINT:
30267 case VOID_FTYPE_UINT64_UINT_UINT:
30268 case UCHAR_FTYPE_UINT_UINT_UINT:
30269 case UCHAR_FTYPE_UINT64_UINT_UINT:
30270 nargs = 3;
30271 klass = load;
30272 memory = ARRAY_SIZE (args);
30273 last_arg_constant = true;
30274 break;
30275 default:
30276 gcc_unreachable ();
30277 }
30278
30279 gcc_assert (nargs <= ARRAY_SIZE (args));
30280
30281 if (klass == store)
30282 {
30283 arg = CALL_EXPR_ARG (exp, 0);
30284 op = expand_normal (arg);
30285 gcc_assert (target == 0);
30286 if (memory)
30287 {
30288 if (GET_MODE (op) != Pmode)
30289 op = convert_to_mode (Pmode, op, 1);
30290 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
30291 }
30292 else
30293 target = force_reg (tmode, op);
30294 arg_adjust = 1;
30295 }
30296 else
30297 {
30298 arg_adjust = 0;
30299 if (optimize
30300 || target == 0
30301 || !register_operand (target, tmode)
30302 || GET_MODE (target) != tmode)
30303 target = gen_reg_rtx (tmode);
30304 }
30305
30306 for (i = 0; i < nargs; i++)
30307 {
30308 enum machine_mode mode = insn_p->operand[i + 1].mode;
30309 bool match;
30310
30311 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
30312 op = expand_normal (arg);
30313 match = insn_p->operand[i + 1].predicate (op, mode);
30314
30315 if (last_arg_constant && (i + 1) == nargs)
30316 {
30317 if (!match)
30318 {
30319 if (icode == CODE_FOR_lwp_lwpvalsi3
30320 || icode == CODE_FOR_lwp_lwpinssi3
30321 || icode == CODE_FOR_lwp_lwpvaldi3
30322 || icode == CODE_FOR_lwp_lwpinsdi3)
30323 error ("the last argument must be a 32-bit immediate");
30324 else
30325 error ("the last argument must be an 8-bit immediate");
30326 return const0_rtx;
30327 }
30328 }
30329 else
30330 {
30331 if (i == memory)
30332 {
30333 /* This must be the memory operand. */
30334 if (GET_MODE (op) != Pmode)
30335 op = convert_to_mode (Pmode, op, 1);
30336 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
30337 gcc_assert (GET_MODE (op) == mode
30338 || GET_MODE (op) == VOIDmode);
30339 }
30340 else
30341 {
30342 /* This must be a register. */
30343 if (VECTOR_MODE_P (mode))
30344 op = safe_vector_operand (op, mode);
30345
30346 gcc_assert (GET_MODE (op) == mode
30347 || GET_MODE (op) == VOIDmode);
30348 op = copy_to_mode_reg (mode, op);
30349 }
30350 }
30351
30352 args[i].op = op;
30353 args[i].mode = mode;
30354 }
30355
30356 switch (nargs)
30357 {
30358 case 0:
30359 pat = GEN_FCN (icode) (target);
30360 break;
30361 case 1:
30362 pat = GEN_FCN (icode) (target, args[0].op);
30363 break;
30364 case 2:
30365 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30366 break;
30367 case 3:
30368 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30369 break;
30370 default:
30371 gcc_unreachable ();
30372 }
30373
30374 if (! pat)
30375 return 0;
30376 emit_insn (pat);
30377 return klass == store ? 0 : target;
30378 }
30379
30380 /* Return the integer constant in ARG. Constrain it to be in the range
30381 of the subparts of VEC_TYPE; issue an error if not. */
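/* For example, for a four-element vector type TYPE_VECTOR_SUBPARTS is
4, so the only accepted selectors are the integer constants 0..3;
anything else reports the error below and falls back to element 0.  */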
30382
30383 static int
30384 get_element_number (tree vec_type, tree arg)
30385 {
30386 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
30387
30388 if (!host_integerp (arg, 1)
30389 || (elt = tree_low_cst (arg, 1), elt > max))
30390 {
30391 error ("selector must be an integer constant in the range 0..%wi", max);
30392 return 0;
30393 }
30394
30395 return elt;
30396 }
30397
30398 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30399 ix86_expand_vector_init. We DO have language-level syntax for this, in
30400 the form of (type){ init-list }. Except that since we can't place emms
30401 instructions from inside the compiler, we can't allow the use of MMX
30402 registers unless the user explicitly asks for it. So we do *not* define
30403 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
30404 we have builtins, invoked by mmintrin.h, that give us license to emit
30405 these sorts of instructions. */
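/* For instance, the V2SI case is what _mm_set_pi32 in mmintrin.h
funnels its two int arguments through (via
__builtin_ia32_vec_init_v2si), ending up in ix86_expand_vector_init
just as the (type){ init-list } syntax does for the SSE modes.  */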
30406
30407 static rtx
30408 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
30409 {
30410 enum machine_mode tmode = TYPE_MODE (type);
30411 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
30412 int i, n_elt = GET_MODE_NUNITS (tmode);
30413 rtvec v = rtvec_alloc (n_elt);
30414
30415 gcc_assert (VECTOR_MODE_P (tmode));
30416 gcc_assert (call_expr_nargs (exp) == n_elt);
30417
30418 for (i = 0; i < n_elt; ++i)
30419 {
30420 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
30421 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
30422 }
30423
30424 if (!target || !register_operand (target, tmode))
30425 target = gen_reg_rtx (tmode);
30426
30427 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
30428 return target;
30429 }
30430
30431 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30432 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
30433 had a language-level syntax for referencing vector elements. */
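/* E.g. __builtin_ia32_vec_ext_v4sf (v, 2) extracts element 2 of a
V4SF value; the selector must be a compile-time integer constant,
which get_element_number enforces before ix86_expand_vector_extract
does the real work.  */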
30434
30435 static rtx
30436 ix86_expand_vec_ext_builtin (tree exp, rtx target)
30437 {
30438 enum machine_mode tmode, mode0;
30439 tree arg0, arg1;
30440 int elt;
30441 rtx op0;
30442
30443 arg0 = CALL_EXPR_ARG (exp, 0);
30444 arg1 = CALL_EXPR_ARG (exp, 1);
30445
30446 op0 = expand_normal (arg0);
30447 elt = get_element_number (TREE_TYPE (arg0), arg1);
30448
30449 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30450 mode0 = TYPE_MODE (TREE_TYPE (arg0));
30451 gcc_assert (VECTOR_MODE_P (mode0));
30452
30453 op0 = force_reg (mode0, op0);
30454
30455 if (optimize || !target || !register_operand (target, tmode))
30456 target = gen_reg_rtx (tmode);
30457
30458 ix86_expand_vector_extract (true, target, op0, elt);
30459
30460 return target;
30461 }
30462
30463 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30464 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
30465 a language-level syntax for referencing vector elements. */
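/* Unlike the extract case, the source vector is not modified in
place: a copy is made, the element is written into the copy with
ix86_expand_vector_set, and the copy is returned, so e.g.
__builtin_ia32_vec_set_v8hi (v, x, 3) yields a new vector equal to V
except in element 3.  */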
30466
30467 static rtx
30468 ix86_expand_vec_set_builtin (tree exp)
30469 {
30470 enum machine_mode tmode, mode1;
30471 tree arg0, arg1, arg2;
30472 int elt;
30473 rtx op0, op1, target;
30474
30475 arg0 = CALL_EXPR_ARG (exp, 0);
30476 arg1 = CALL_EXPR_ARG (exp, 1);
30477 arg2 = CALL_EXPR_ARG (exp, 2);
30478
30479 tmode = TYPE_MODE (TREE_TYPE (arg0));
30480 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30481 gcc_assert (VECTOR_MODE_P (tmode));
30482
30483 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
30484 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
30485 elt = get_element_number (TREE_TYPE (arg0), arg2);
30486
30487 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
30488 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
30489
30490 op0 = force_reg (tmode, op0);
30491 op1 = force_reg (mode1, op1);
30492
30493 /* OP0 is the source of these builtin functions and shouldn't be
30494 modified. Create a copy, use it and return it as target. */
30495 target = gen_reg_rtx (tmode);
30496 emit_move_insn (target, op0);
30497 ix86_expand_vector_set (true, target, op1, elt);
30498
30499 return target;
30500 }
30501
30502 /* Expand an expression EXP that calls a built-in function,
30503 with result going to TARGET if that's convenient
30504 (and in mode MODE if that's convenient).
30505 SUBTARGET may be used as the target for computing one of EXP's operands.
30506 IGNORE is nonzero if the value is to be ignored. */
30507
30508 static rtx
30509 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
30510 enum machine_mode mode ATTRIBUTE_UNUSED,
30511 int ignore ATTRIBUTE_UNUSED)
30512 {
30513 const struct builtin_description *d;
30514 size_t i;
30515 enum insn_code icode;
30516 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
30517 tree arg0, arg1, arg2, arg3, arg4;
30518 rtx op0, op1, op2, op3, op4, pat, insn;
30519 enum machine_mode mode0, mode1, mode2, mode3, mode4;
30520 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
30521
30522 /* For CPU builtins that can be folded, fold first and expand the fold. */
30523 switch (fcode)
30524 {
30525 case IX86_BUILTIN_CPU_INIT:
30526 {
30527 /* Make it call __cpu_indicator_init in libgcc. */
30528 tree call_expr, fndecl, type;
30529 type = build_function_type_list (integer_type_node, NULL_TREE);
30530 fndecl = build_fn_decl ("__cpu_indicator_init", type);
30531 call_expr = build_call_expr (fndecl, 0);
30532 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
30533 }
30534 case IX86_BUILTIN_CPU_IS:
30535 case IX86_BUILTIN_CPU_SUPPORTS:
30536 {
30537 tree arg0 = CALL_EXPR_ARG (exp, 0);
30538 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
30539 gcc_assert (fold_expr != NULL_TREE);
30540 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
30541 }
30542 }
30543
30544 /* Determine whether the builtin function is available under the current ISA.
30545 Originally the builtin was not created if it wasn't applicable to the
30546 current ISA based on the command line switches. With function specific
30547 options, we need to check in the context of the function making the call
30548 whether it is supported. */
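/* So, roughly, calling an AVX builtin from a function compiled
without AVX enabled is rejected here at expansion time with a
diagnostic along the lines of "needs isa option -mavx", instead of
silently emitting an unsupported instruction.  */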
30549 if (ix86_builtins_isa[fcode].isa
30550 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
30551 {
30552 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
30553 NULL, (enum fpmath_unit) 0, false);
30554
30555 if (!opts)
30556 error ("%qE needs unknown isa option", fndecl);
30557 else
30558 {
30559 gcc_assert (opts != NULL);
30560 error ("%qE needs isa option %s", fndecl, opts);
30561 free (opts);
30562 }
30563 return const0_rtx;
30564 }
30565
30566 switch (fcode)
30567 {
30568 case IX86_BUILTIN_MASKMOVQ:
30569 case IX86_BUILTIN_MASKMOVDQU:
30570 icode = (fcode == IX86_BUILTIN_MASKMOVQ
30571 ? CODE_FOR_mmx_maskmovq
30572 : CODE_FOR_sse2_maskmovdqu);
30573 /* Note the arg order is different from the operand order. */
30574 arg1 = CALL_EXPR_ARG (exp, 0);
30575 arg2 = CALL_EXPR_ARG (exp, 1);
30576 arg0 = CALL_EXPR_ARG (exp, 2);
30577 op0 = expand_normal (arg0);
30578 op1 = expand_normal (arg1);
30579 op2 = expand_normal (arg2);
30580 mode0 = insn_data[icode].operand[0].mode;
30581 mode1 = insn_data[icode].operand[1].mode;
30582 mode2 = insn_data[icode].operand[2].mode;
30583
30584 if (GET_MODE (op0) != Pmode)
30585 op0 = convert_to_mode (Pmode, op0, 1);
30586 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
30587
30588 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30589 op0 = copy_to_mode_reg (mode0, op0);
30590 if (!insn_data[icode].operand[1].predicate (op1, mode1))
30591 op1 = copy_to_mode_reg (mode1, op1);
30592 if (!insn_data[icode].operand[2].predicate (op2, mode2))
30593 op2 = copy_to_mode_reg (mode2, op2);
30594 pat = GEN_FCN (icode) (op0, op1, op2);
30595 if (! pat)
30596 return 0;
30597 emit_insn (pat);
30598 return 0;
30599
30600 case IX86_BUILTIN_LDMXCSR:
30601 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
30602 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30603 emit_move_insn (target, op0);
30604 emit_insn (gen_sse_ldmxcsr (target));
30605 return 0;
30606
30607 case IX86_BUILTIN_STMXCSR:
30608 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30609 emit_insn (gen_sse_stmxcsr (target));
30610 return copy_to_mode_reg (SImode, target);
30611
30612 case IX86_BUILTIN_CLFLUSH:
30613 arg0 = CALL_EXPR_ARG (exp, 0);
30614 op0 = expand_normal (arg0);
30615 icode = CODE_FOR_sse2_clflush;
30616 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30617 {
30618 if (GET_MODE (op0) != Pmode)
30619 op0 = convert_to_mode (Pmode, op0, 1);
30620 op0 = force_reg (Pmode, op0);
30621 }
30622
30623 emit_insn (gen_sse2_clflush (op0));
30624 return 0;
30625
30626 case IX86_BUILTIN_MONITOR:
30627 arg0 = CALL_EXPR_ARG (exp, 0);
30628 arg1 = CALL_EXPR_ARG (exp, 1);
30629 arg2 = CALL_EXPR_ARG (exp, 2);
30630 op0 = expand_normal (arg0);
30631 op1 = expand_normal (arg1);
30632 op2 = expand_normal (arg2);
30633 if (!REG_P (op0))
30634 {
30635 if (GET_MODE (op0) != Pmode)
30636 op0 = convert_to_mode (Pmode, op0, 1);
30637 op0 = force_reg (Pmode, op0);
30638 }
30639 if (!REG_P (op1))
30640 op1 = copy_to_mode_reg (SImode, op1);
30641 if (!REG_P (op2))
30642 op2 = copy_to_mode_reg (SImode, op2);
30643 emit_insn (ix86_gen_monitor (op0, op1, op2));
30644 return 0;
30645
30646 case IX86_BUILTIN_MWAIT:
30647 arg0 = CALL_EXPR_ARG (exp, 0);
30648 arg1 = CALL_EXPR_ARG (exp, 1);
30649 op0 = expand_normal (arg0);
30650 op1 = expand_normal (arg1);
30651 if (!REG_P (op0))
30652 op0 = copy_to_mode_reg (SImode, op0);
30653 if (!REG_P (op1))
30654 op1 = copy_to_mode_reg (SImode, op1);
30655 emit_insn (gen_sse3_mwait (op0, op1));
30656 return 0;
30657
30658 case IX86_BUILTIN_VEC_INIT_V2SI:
30659 case IX86_BUILTIN_VEC_INIT_V4HI:
30660 case IX86_BUILTIN_VEC_INIT_V8QI:
30661 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
30662
30663 case IX86_BUILTIN_VEC_EXT_V2DF:
30664 case IX86_BUILTIN_VEC_EXT_V2DI:
30665 case IX86_BUILTIN_VEC_EXT_V4SF:
30666 case IX86_BUILTIN_VEC_EXT_V4SI:
30667 case IX86_BUILTIN_VEC_EXT_V8HI:
30668 case IX86_BUILTIN_VEC_EXT_V2SI:
30669 case IX86_BUILTIN_VEC_EXT_V4HI:
30670 case IX86_BUILTIN_VEC_EXT_V16QI:
30671 return ix86_expand_vec_ext_builtin (exp, target);
30672
30673 case IX86_BUILTIN_VEC_SET_V2DI:
30674 case IX86_BUILTIN_VEC_SET_V4SF:
30675 case IX86_BUILTIN_VEC_SET_V4SI:
30676 case IX86_BUILTIN_VEC_SET_V8HI:
30677 case IX86_BUILTIN_VEC_SET_V4HI:
30678 case IX86_BUILTIN_VEC_SET_V16QI:
30679 return ix86_expand_vec_set_builtin (exp);
30680
30681 case IX86_BUILTIN_INFQ:
30682 case IX86_BUILTIN_HUGE_VALQ:
30683 {
30684 REAL_VALUE_TYPE inf;
30685 rtx tmp;
30686
30687 real_inf (&inf);
30688 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
30689
30690 tmp = validize_mem (force_const_mem (mode, tmp));
30691
30692 if (target == 0)
30693 target = gen_reg_rtx (mode);
30694
30695 emit_move_insn (target, tmp);
30696 return target;
30697 }
30698
30699 case IX86_BUILTIN_RDPMC:
30700 case IX86_BUILTIN_RDTSC:
30701 case IX86_BUILTIN_RDTSCP:
30702
30703 op0 = gen_reg_rtx (DImode);
30704 op1 = gen_reg_rtx (DImode);
30705
30706 if (fcode == IX86_BUILTIN_RDPMC)
30707 {
30708 arg0 = CALL_EXPR_ARG (exp, 0);
30709 op2 = expand_normal (arg0);
30710 if (!register_operand (op2, SImode))
30711 op2 = copy_to_mode_reg (SImode, op2);
30712
30713 insn = (TARGET_64BIT
30714 ? gen_rdpmc_rex64 (op0, op1, op2)
30715 : gen_rdpmc (op0, op2));
30716 emit_insn (insn);
30717 }
30718 else if (fcode == IX86_BUILTIN_RDTSC)
30719 {
30720 insn = (TARGET_64BIT
30721 ? gen_rdtsc_rex64 (op0, op1)
30722 : gen_rdtsc (op0));
30723 emit_insn (insn);
30724 }
30725 else
30726 {
30727 op2 = gen_reg_rtx (SImode);
30728
30729 insn = (TARGET_64BIT
30730 ? gen_rdtscp_rex64 (op0, op1, op2)
30731 : gen_rdtscp (op0, op2));
30732 emit_insn (insn);
30733
30734 arg0 = CALL_EXPR_ARG (exp, 0);
30735 op4 = expand_normal (arg0);
30736 if (!address_operand (op4, VOIDmode))
30737 {
30738 op4 = convert_memory_address (Pmode, op4);
30739 op4 = copy_addr_to_reg (op4);
30740 }
30741 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
30742 }
30743
30744 if (target == 0)
30745 target = gen_reg_rtx (mode);
30746
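/* The counter comes back as two 32-bit halves in EDX:EAX; on 64-bit
targets glue them into a single DImode value before copying it to
TARGET.  */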
30747 if (TARGET_64BIT)
30748 {
30749 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
30750 op1, 1, OPTAB_DIRECT);
30751 op0 = expand_simple_binop (DImode, IOR, op0, op1,
30752 op0, 1, OPTAB_DIRECT);
30753 }
30754
30755 emit_move_insn (target, op0);
30756 return target;
30757
30758 case IX86_BUILTIN_FXSAVE:
30759 case IX86_BUILTIN_FXRSTOR:
30760 case IX86_BUILTIN_FXSAVE64:
30761 case IX86_BUILTIN_FXRSTOR64:
30762 switch (fcode)
30763 {
30764 case IX86_BUILTIN_FXSAVE:
30765 icode = CODE_FOR_fxsave;
30766 break;
30767 case IX86_BUILTIN_FXRSTOR:
30768 icode = CODE_FOR_fxrstor;
30769 break;
30770 case IX86_BUILTIN_FXSAVE64:
30771 icode = CODE_FOR_fxsave64;
30772 break;
30773 case IX86_BUILTIN_FXRSTOR64:
30774 icode = CODE_FOR_fxrstor64;
30775 break;
30776 default:
30777 gcc_unreachable ();
30778 }
30779
30780 arg0 = CALL_EXPR_ARG (exp, 0);
30781 op0 = expand_normal (arg0);
30782
30783 if (!address_operand (op0, VOIDmode))
30784 {
30785 op0 = convert_memory_address (Pmode, op0);
30786 op0 = copy_addr_to_reg (op0);
30787 }
30788 op0 = gen_rtx_MEM (BLKmode, op0);
30789
30790 pat = GEN_FCN (icode) (op0);
30791 if (pat)
30792 emit_insn (pat);
30793 return 0;
30794
30795 case IX86_BUILTIN_XSAVE:
30796 case IX86_BUILTIN_XRSTOR:
30797 case IX86_BUILTIN_XSAVE64:
30798 case IX86_BUILTIN_XRSTOR64:
30799 case IX86_BUILTIN_XSAVEOPT:
30800 case IX86_BUILTIN_XSAVEOPT64:
30801 arg0 = CALL_EXPR_ARG (exp, 0);
30802 arg1 = CALL_EXPR_ARG (exp, 1);
30803 op0 = expand_normal (arg0);
30804 op1 = expand_normal (arg1);
30805
30806 if (!address_operand (op0, VOIDmode))
30807 {
30808 op0 = convert_memory_address (Pmode, op0);
30809 op0 = copy_addr_to_reg (op0);
30810 }
30811 op0 = gen_rtx_MEM (BLKmode, op0);
30812
30813 op1 = force_reg (DImode, op1);
30814
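/* The XSAVE family takes its feature mask in EDX:EAX, so on 64-bit
targets split the DImode mask into its two SImode halves before
emitting the pattern.  */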
30815 if (TARGET_64BIT)
30816 {
30817 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
30818 NULL, 1, OPTAB_DIRECT);
30819 switch (fcode)
30820 {
30821 case IX86_BUILTIN_XSAVE:
30822 icode = CODE_FOR_xsave_rex64;
30823 break;
30824 case IX86_BUILTIN_XRSTOR:
30825 icode = CODE_FOR_xrstor_rex64;
30826 break;
30827 case IX86_BUILTIN_XSAVE64:
30828 icode = CODE_FOR_xsave64;
30829 break;
30830 case IX86_BUILTIN_XRSTOR64:
30831 icode = CODE_FOR_xrstor64;
30832 break;
30833 case IX86_BUILTIN_XSAVEOPT:
30834 icode = CODE_FOR_xsaveopt_rex64;
30835 break;
30836 case IX86_BUILTIN_XSAVEOPT64:
30837 icode = CODE_FOR_xsaveopt64;
30838 break;
30839 default:
30840 gcc_unreachable ();
30841 }
30842
30843 op2 = gen_lowpart (SImode, op2);
30844 op1 = gen_lowpart (SImode, op1);
30845 pat = GEN_FCN (icode) (op0, op1, op2);
30846 }
30847 else
30848 {
30849 switch (fcode)
30850 {
30851 case IX86_BUILTIN_XSAVE:
30852 icode = CODE_FOR_xsave;
30853 break;
30854 case IX86_BUILTIN_XRSTOR:
30855 icode = CODE_FOR_xrstor;
30856 break;
30857 case IX86_BUILTIN_XSAVEOPT:
30858 icode = CODE_FOR_xsaveopt;
30859 break;
30860 default:
30861 gcc_unreachable ();
30862 }
30863 pat = GEN_FCN (icode) (op0, op1);
30864 }
30865
30866 if (pat)
30867 emit_insn (pat);
30868 return 0;
30869
30870 case IX86_BUILTIN_LLWPCB:
30871 arg0 = CALL_EXPR_ARG (exp, 0);
30872 op0 = expand_normal (arg0);
30873 icode = CODE_FOR_lwp_llwpcb;
30874 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30875 {
30876 if (GET_MODE (op0) != Pmode)
30877 op0 = convert_to_mode (Pmode, op0, 1);
30878 op0 = force_reg (Pmode, op0);
30879 }
30880 emit_insn (gen_lwp_llwpcb (op0));
30881 return 0;
30882
30883 case IX86_BUILTIN_SLWPCB:
30884 icode = CODE_FOR_lwp_slwpcb;
30885 if (!target
30886 || !insn_data[icode].operand[0].predicate (target, Pmode))
30887 target = gen_reg_rtx (Pmode);
30888 emit_insn (gen_lwp_slwpcb (target));
30889 return target;
30890
30891 case IX86_BUILTIN_BEXTRI32:
30892 case IX86_BUILTIN_BEXTRI64:
30893 arg0 = CALL_EXPR_ARG (exp, 0);
30894 arg1 = CALL_EXPR_ARG (exp, 1);
30895 op0 = expand_normal (arg0);
30896 op1 = expand_normal (arg1);
30897 icode = (fcode == IX86_BUILTIN_BEXTRI32
30898 ? CODE_FOR_tbm_bextri_si
30899 : CODE_FOR_tbm_bextri_di);
30900 if (!CONST_INT_P (op1))
30901 {
30902 error ("last argument must be an immediate");
30903 return const0_rtx;
30904 }
30905 else
30906 {
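/* The BEXTR immediate encodes the starting bit position in bits 7:0
and the field length in bits 15:8; split the user's immediate
accordingly.  */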
30907 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30908 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30909 op1 = GEN_INT (length);
30910 op2 = GEN_INT (lsb_index);
30911 pat = GEN_FCN (icode) (target, op0, op1, op2);
30912 if (pat)
30913 emit_insn (pat);
30914 return target;
30915 }
30916
30917 case IX86_BUILTIN_RDRAND16_STEP:
30918 icode = CODE_FOR_rdrandhi_1;
30919 mode0 = HImode;
30920 goto rdrand_step;
30921
30922 case IX86_BUILTIN_RDRAND32_STEP:
30923 icode = CODE_FOR_rdrandsi_1;
30924 mode0 = SImode;
30925 goto rdrand_step;
30926
30927 case IX86_BUILTIN_RDRAND64_STEP:
30928 icode = CODE_FOR_rdranddi_1;
30929 mode0 = DImode;
30930
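/* Each *_step builtin stores the hardware random value through its
pointer argument and returns nonzero only when the carry flag
reported the value as valid; the conditional move emitted below
builds that result from the CCCmode flags set by the rdrand
pattern.  */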
30931 rdrand_step:
30932 op0 = gen_reg_rtx (mode0);
30933 emit_insn (GEN_FCN (icode) (op0));
30934
30935 arg0 = CALL_EXPR_ARG (exp, 0);
30936 op1 = expand_normal (arg0);
30937 if (!address_operand (op1, VOIDmode))
30938 {
30939 op1 = convert_memory_address (Pmode, op1);
30940 op1 = copy_addr_to_reg (op1);
30941 }
30942 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30943
30944 op1 = gen_reg_rtx (SImode);
30945 emit_move_insn (op1, CONST1_RTX (SImode));
30946
30947 /* Emit SImode conditional move. */
30948 if (mode0 == HImode)
30949 {
30950 op2 = gen_reg_rtx (SImode);
30951 emit_insn (gen_zero_extendhisi2 (op2, op0));
30952 }
30953 else if (mode0 == SImode)
30954 op2 = op0;
30955 else
30956 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30957
30958 if (target == 0)
30959 target = gen_reg_rtx (SImode);
30960
30961 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30962 const0_rtx);
30963 emit_insn (gen_rtx_SET (VOIDmode, target,
30964 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30965 return target;
30966
30967 case IX86_BUILTIN_RDSEED16_STEP:
30968 icode = CODE_FOR_rdseedhi_1;
30969 mode0 = HImode;
30970 goto rdseed_step;
30971
30972 case IX86_BUILTIN_RDSEED32_STEP:
30973 icode = CODE_FOR_rdseedsi_1;
30974 mode0 = SImode;
30975 goto rdseed_step;
30976
30977 case IX86_BUILTIN_RDSEED64_STEP:
30978 icode = CODE_FOR_rdseeddi_1;
30979 mode0 = DImode;
30980
30981 rdseed_step:
30982 op0 = gen_reg_rtx (mode0);
30983 emit_insn (GEN_FCN (icode) (op0));
30984
30985 arg0 = CALL_EXPR_ARG (exp, 0);
30986 op1 = expand_normal (arg0);
30987 if (!address_operand (op1, VOIDmode))
30988 {
30989 op1 = convert_memory_address (Pmode, op1);
30990 op1 = copy_addr_to_reg (op1);
30991 }
30992 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30993
30994 op2 = gen_reg_rtx (QImode);
30995
30996 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
30997 const0_rtx);
30998 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
30999
31000 if (target == 0)
31001 target = gen_reg_rtx (SImode);
31002
31003 emit_insn (gen_zero_extendqisi2 (target, op2));
31004 return target;
31005
31006 case IX86_BUILTIN_ADDCARRYX32:
31007 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31008 mode0 = SImode;
31009 goto addcarryx;
31010
31011 case IX86_BUILTIN_ADDCARRYX64:
31012 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31013 mode0 = DImode;
31014
31015 addcarryx:
31016 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31017 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31018 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31019 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31020
31021 op0 = gen_reg_rtx (QImode);
31022
31023 /* Generate CF from input operand. */
31024 op1 = expand_normal (arg0);
31025 if (GET_MODE (op1) != QImode)
31026 op1 = convert_to_mode (QImode, op1, 1);
31027 op1 = copy_to_mode_reg (QImode, op1);
31028 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
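	  /* Adding 0xff (-1 in QImode) to the carry-in byte carries out exactly
	     when the byte is nonzero, so this add recreates the incoming carry
	     in the CF flag for the add-with-carry pattern below. */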
31029
31030 /* Gen ADCX instruction to compute X+Y+CF. */
31031 op2 = expand_normal (arg1);
31032 op3 = expand_normal (arg2);
31033
31034 if (!REG_P (op2))
31035 op2 = copy_to_mode_reg (mode0, op2);
31036 if (!REG_P (op3))
31037 op3 = copy_to_mode_reg (mode0, op3);
31038
31039 op0 = gen_reg_rtx (mode0);
31040
31041 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
31042 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
31043 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
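	  /* The LTU test on CCCmode used as the carry-in above is reused below,
	     with its mode changed to QImode, to read the carry-out back from
	     FLAGS_REG as the c_out result. */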
31044
31045 /* Store the result. */
31046 op4 = expand_normal (arg3);
31047 if (!address_operand (op4, VOIDmode))
31048 {
31049 op4 = convert_memory_address (Pmode, op4);
31050 op4 = copy_addr_to_reg (op4);
31051 }
31052 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
31053
31054 /* Return current CF value. */
31055 if (target == 0)
31056 target = gen_reg_rtx (QImode);
31057
31058 PUT_MODE (pat, QImode);
31059 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
31060 return target;
31061
31062 case IX86_BUILTIN_GATHERSIV2DF:
31063 icode = CODE_FOR_avx2_gathersiv2df;
31064 goto gather_gen;
31065 case IX86_BUILTIN_GATHERSIV4DF:
31066 icode = CODE_FOR_avx2_gathersiv4df;
31067 goto gather_gen;
31068 case IX86_BUILTIN_GATHERDIV2DF:
31069 icode = CODE_FOR_avx2_gatherdiv2df;
31070 goto gather_gen;
31071 case IX86_BUILTIN_GATHERDIV4DF:
31072 icode = CODE_FOR_avx2_gatherdiv4df;
31073 goto gather_gen;
31074 case IX86_BUILTIN_GATHERSIV4SF:
31075 icode = CODE_FOR_avx2_gathersiv4sf;
31076 goto gather_gen;
31077 case IX86_BUILTIN_GATHERSIV8SF:
31078 icode = CODE_FOR_avx2_gathersiv8sf;
31079 goto gather_gen;
31080 case IX86_BUILTIN_GATHERDIV4SF:
31081 icode = CODE_FOR_avx2_gatherdiv4sf;
31082 goto gather_gen;
31083 case IX86_BUILTIN_GATHERDIV8SF:
31084 icode = CODE_FOR_avx2_gatherdiv8sf;
31085 goto gather_gen;
31086 case IX86_BUILTIN_GATHERSIV2DI:
31087 icode = CODE_FOR_avx2_gathersiv2di;
31088 goto gather_gen;
31089 case IX86_BUILTIN_GATHERSIV4DI:
31090 icode = CODE_FOR_avx2_gathersiv4di;
31091 goto gather_gen;
31092 case IX86_BUILTIN_GATHERDIV2DI:
31093 icode = CODE_FOR_avx2_gatherdiv2di;
31094 goto gather_gen;
31095 case IX86_BUILTIN_GATHERDIV4DI:
31096 icode = CODE_FOR_avx2_gatherdiv4di;
31097 goto gather_gen;
31098 case IX86_BUILTIN_GATHERSIV4SI:
31099 icode = CODE_FOR_avx2_gathersiv4si;
31100 goto gather_gen;
31101 case IX86_BUILTIN_GATHERSIV8SI:
31102 icode = CODE_FOR_avx2_gathersiv8si;
31103 goto gather_gen;
31104 case IX86_BUILTIN_GATHERDIV4SI:
31105 icode = CODE_FOR_avx2_gatherdiv4si;
31106 goto gather_gen;
31107 case IX86_BUILTIN_GATHERDIV8SI:
31108 icode = CODE_FOR_avx2_gatherdiv8si;
31109 goto gather_gen;
31110 case IX86_BUILTIN_GATHERALTSIV4DF:
31111 icode = CODE_FOR_avx2_gathersiv4df;
31112 goto gather_gen;
31113 case IX86_BUILTIN_GATHERALTDIV8SF:
31114 icode = CODE_FOR_avx2_gatherdiv8sf;
31115 goto gather_gen;
31116 case IX86_BUILTIN_GATHERALTSIV4DI:
31117 icode = CODE_FOR_avx2_gathersiv4di;
31118 goto gather_gen;
31119 case IX86_BUILTIN_GATHERALTDIV8SI:
31120 icode = CODE_FOR_avx2_gatherdiv8si;
31121 goto gather_gen;
31122
31123 gather_gen:
31124 arg0 = CALL_EXPR_ARG (exp, 0);
31125 arg1 = CALL_EXPR_ARG (exp, 1);
31126 arg2 = CALL_EXPR_ARG (exp, 2);
31127 arg3 = CALL_EXPR_ARG (exp, 3);
31128 arg4 = CALL_EXPR_ARG (exp, 4);
31129 op0 = expand_normal (arg0);
31130 op1 = expand_normal (arg1);
31131 op2 = expand_normal (arg2);
31132 op3 = expand_normal (arg3);
31133 op4 = expand_normal (arg4);
31134 /* Note the arg order is different from the operand order. */
31135 mode0 = insn_data[icode].operand[1].mode;
31136 mode2 = insn_data[icode].operand[3].mode;
31137 mode3 = insn_data[icode].operand[4].mode;
31138 mode4 = insn_data[icode].operand[5].mode;
31139
31140 if (target == NULL_RTX
31141 || GET_MODE (target) != insn_data[icode].operand[0].mode)
31142 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
31143 else
31144 subtarget = target;
31145
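      /* The GATHERALT variants are passed operands whose element counts do not
	 match the underlying instruction: the ...ALTSIV4DF/4DI forms get a V8SI
	 index, and the ...ALTDIV8SF/8SI forms get 8-element source and mask
	 operands, so extract the low halves actually used below. */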
31146 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
31147 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
31148 {
31149 rtx half = gen_reg_rtx (V4SImode);
31150 if (!nonimmediate_operand (op2, V8SImode))
31151 op2 = copy_to_mode_reg (V8SImode, op2);
31152 emit_insn (gen_vec_extract_lo_v8si (half, op2));
31153 op2 = half;
31154 }
31155 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
31156 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
31157 {
31158 rtx (*gen) (rtx, rtx);
31159 rtx half = gen_reg_rtx (mode0);
31160 if (mode0 == V4SFmode)
31161 gen = gen_vec_extract_lo_v8sf;
31162 else
31163 gen = gen_vec_extract_lo_v8si;
31164 if (!nonimmediate_operand (op0, GET_MODE (op0)))
31165 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
31166 emit_insn (gen (half, op0));
31167 op0 = half;
31168 if (!nonimmediate_operand (op3, GET_MODE (op3)))
31169 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
31170 emit_insn (gen (half, op3));
31171 op3 = half;
31172 }
31173
 31174 	      /* Force the memory operand to use only a base register here.  We
 31175 		 do not want to do this for the memory operands of other builtin
 31176 		 functions. */
31177 if (GET_MODE (op1) != Pmode)
31178 op1 = convert_to_mode (Pmode, op1, 1);
31179 op1 = force_reg (Pmode, op1);
31180
31181 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31182 op0 = copy_to_mode_reg (mode0, op0);
31183 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
31184 op1 = copy_to_mode_reg (Pmode, op1);
31185 if (!insn_data[icode].operand[3].predicate (op2, mode2))
31186 op2 = copy_to_mode_reg (mode2, op2);
31187 if (!insn_data[icode].operand[4].predicate (op3, mode3))
31188 op3 = copy_to_mode_reg (mode3, op3);
31189 if (!insn_data[icode].operand[5].predicate (op4, mode4))
31190 {
31191 error ("last argument must be scale 1, 2, 4, 8");
31192 return const0_rtx;
31193 }
31194
31195 /* Optimize. If mask is known to have all high bits set,
31196 replace op0 with pc_rtx to signal that the instruction
31197 overwrites the whole destination and doesn't use its
31198 previous contents. */
31199 if (optimize)
31200 {
31201 if (TREE_CODE (arg3) == VECTOR_CST)
31202 {
31203 unsigned int negative = 0;
31204 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
31205 {
31206 tree cst = VECTOR_CST_ELT (arg3, i);
31207 if (TREE_CODE (cst) == INTEGER_CST
31208 && tree_int_cst_sign_bit (cst))
31209 negative++;
31210 else if (TREE_CODE (cst) == REAL_CST
31211 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
31212 negative++;
31213 }
31214 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
31215 op0 = pc_rtx;
31216 }
31217 else if (TREE_CODE (arg3) == SSA_NAME)
31218 {
31219 /* Recognize also when mask is like:
31220 __v2df src = _mm_setzero_pd ();
31221 __v2df mask = _mm_cmpeq_pd (src, src);
31222 or
31223 __v8sf src = _mm256_setzero_ps ();
31224 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
31225 as that is a cheaper way to load all ones into
31226 a register than having to load a constant from
31227 memory. */
31228 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
31229 if (is_gimple_call (def_stmt))
31230 {
31231 tree fndecl = gimple_call_fndecl (def_stmt);
31232 if (fndecl
31233 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31234 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
31235 {
31236 case IX86_BUILTIN_CMPPD:
31237 case IX86_BUILTIN_CMPPS:
31238 case IX86_BUILTIN_CMPPD256:
31239 case IX86_BUILTIN_CMPPS256:
31240 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
31241 break;
31242 /* FALLTHRU */
31243 case IX86_BUILTIN_CMPEQPD:
31244 case IX86_BUILTIN_CMPEQPS:
31245 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
31246 && initializer_zerop (gimple_call_arg (def_stmt,
31247 1)))
31248 op0 = pc_rtx;
31249 break;
31250 default:
31251 break;
31252 }
31253 }
31254 }
31255 }
31256
31257 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
31258 if (! pat)
31259 return const0_rtx;
31260 emit_insn (pat);
31261
31262 if (fcode == IX86_BUILTIN_GATHERDIV8SF
31263 || fcode == IX86_BUILTIN_GATHERDIV8SI)
31264 {
31265 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
31266 ? V4SFmode : V4SImode;
31267 if (target == NULL_RTX)
31268 target = gen_reg_rtx (tmode);
31269 if (tmode == V4SFmode)
31270 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
31271 else
31272 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
31273 }
31274 else
31275 target = subtarget;
31276
31277 return target;
31278
31279 case IX86_BUILTIN_XABORT:
31280 icode = CODE_FOR_xabort;
31281 arg0 = CALL_EXPR_ARG (exp, 0);
31282 op0 = expand_normal (arg0);
31283 mode0 = insn_data[icode].operand[0].mode;
31284 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31285 {
31286 error ("the xabort's argument must be an 8-bit immediate");
31287 return const0_rtx;
31288 }
31289 emit_insn (gen_xabort (op0));
31290 return 0;
31291
31292 default:
31293 break;
31294 }
31295
31296 for (i = 0, d = bdesc_special_args;
31297 i < ARRAY_SIZE (bdesc_special_args);
31298 i++, d++)
31299 if (d->code == fcode)
31300 return ix86_expand_special_args_builtin (d, exp, target);
31301
31302 for (i = 0, d = bdesc_args;
31303 i < ARRAY_SIZE (bdesc_args);
31304 i++, d++)
31305 if (d->code == fcode)
31306 switch (fcode)
31307 {
31308 case IX86_BUILTIN_FABSQ:
31309 case IX86_BUILTIN_COPYSIGNQ:
31310 if (!TARGET_SSE)
31311 /* Emit a normal call if SSE isn't available. */
31312 return expand_call (exp, target, ignore);
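	  /* FALLTHRU */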
31313 default:
31314 return ix86_expand_args_builtin (d, exp, target);
31315 }
31316
31317 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31318 if (d->code == fcode)
31319 return ix86_expand_sse_comi (d, exp, target);
31320
31321 for (i = 0, d = bdesc_pcmpestr;
31322 i < ARRAY_SIZE (bdesc_pcmpestr);
31323 i++, d++)
31324 if (d->code == fcode)
31325 return ix86_expand_sse_pcmpestr (d, exp, target);
31326
31327 for (i = 0, d = bdesc_pcmpistr;
31328 i < ARRAY_SIZE (bdesc_pcmpistr);
31329 i++, d++)
31330 if (d->code == fcode)
31331 return ix86_expand_sse_pcmpistr (d, exp, target);
31332
31333 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31334 if (d->code == fcode)
31335 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
31336 (enum ix86_builtin_func_type)
31337 d->flag, d->comparison);
31338
31339 gcc_unreachable ();
31340 }
31341
 31342 /* Returns a function decl for a vectorized version of the builtin function
 31343    FNDECL, producing vectors of type TYPE_OUT from inputs of type TYPE_IN,
 31344    or NULL_TREE if it is not available. */
31345
31346 static tree
31347 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
31348 tree type_in)
31349 {
31350 enum machine_mode in_mode, out_mode;
31351 int in_n, out_n;
31352 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
31353
31354 if (TREE_CODE (type_out) != VECTOR_TYPE
31355 || TREE_CODE (type_in) != VECTOR_TYPE
31356 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
31357 return NULL_TREE;
31358
31359 out_mode = TYPE_MODE (TREE_TYPE (type_out));
31360 out_n = TYPE_VECTOR_SUBPARTS (type_out);
31361 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31362 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31363
31364 switch (fn)
31365 {
31366 case BUILT_IN_SQRT:
31367 if (out_mode == DFmode && in_mode == DFmode)
31368 {
31369 if (out_n == 2 && in_n == 2)
31370 return ix86_builtins[IX86_BUILTIN_SQRTPD];
31371 else if (out_n == 4 && in_n == 4)
31372 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
31373 }
31374 break;
31375
31376 case BUILT_IN_SQRTF:
31377 if (out_mode == SFmode && in_mode == SFmode)
31378 {
31379 if (out_n == 4 && in_n == 4)
31380 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
31381 else if (out_n == 8 && in_n == 8)
31382 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
31383 }
31384 break;
31385
31386 case BUILT_IN_IFLOOR:
31387 case BUILT_IN_LFLOOR:
31388 case BUILT_IN_LLFLOOR:
31389 /* The round insn does not trap on denormals. */
31390 if (flag_trapping_math || !TARGET_ROUND)
31391 break;
31392
31393 if (out_mode == SImode && in_mode == DFmode)
31394 {
31395 if (out_n == 4 && in_n == 2)
31396 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
31397 else if (out_n == 8 && in_n == 4)
31398 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
31399 }
31400 break;
31401
31402 case BUILT_IN_IFLOORF:
31403 case BUILT_IN_LFLOORF:
31404 case BUILT_IN_LLFLOORF:
31405 /* The round insn does not trap on denormals. */
31406 if (flag_trapping_math || !TARGET_ROUND)
31407 break;
31408
31409 if (out_mode == SImode && in_mode == SFmode)
31410 {
31411 if (out_n == 4 && in_n == 4)
31412 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
31413 else if (out_n == 8 && in_n == 8)
31414 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
31415 }
31416 break;
31417
31418 case BUILT_IN_ICEIL:
31419 case BUILT_IN_LCEIL:
31420 case BUILT_IN_LLCEIL:
31421 /* The round insn does not trap on denormals. */
31422 if (flag_trapping_math || !TARGET_ROUND)
31423 break;
31424
31425 if (out_mode == SImode && in_mode == DFmode)
31426 {
31427 if (out_n == 4 && in_n == 2)
31428 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
31429 else if (out_n == 8 && in_n == 4)
31430 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
31431 }
31432 break;
31433
31434 case BUILT_IN_ICEILF:
31435 case BUILT_IN_LCEILF:
31436 case BUILT_IN_LLCEILF:
31437 /* The round insn does not trap on denormals. */
31438 if (flag_trapping_math || !TARGET_ROUND)
31439 break;
31440
31441 if (out_mode == SImode && in_mode == SFmode)
31442 {
31443 if (out_n == 4 && in_n == 4)
31444 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
31445 else if (out_n == 8 && in_n == 8)
31446 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
31447 }
31448 break;
31449
31450 case BUILT_IN_IRINT:
31451 case BUILT_IN_LRINT:
31452 case BUILT_IN_LLRINT:
31453 if (out_mode == SImode && in_mode == DFmode)
31454 {
31455 if (out_n == 4 && in_n == 2)
31456 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
31457 else if (out_n == 8 && in_n == 4)
31458 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
31459 }
31460 break;
31461
31462 case BUILT_IN_IRINTF:
31463 case BUILT_IN_LRINTF:
31464 case BUILT_IN_LLRINTF:
31465 if (out_mode == SImode && in_mode == SFmode)
31466 {
31467 if (out_n == 4 && in_n == 4)
31468 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
31469 else if (out_n == 8 && in_n == 8)
31470 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
31471 }
31472 break;
31473
31474 case BUILT_IN_IROUND:
31475 case BUILT_IN_LROUND:
31476 case BUILT_IN_LLROUND:
31477 /* The round insn does not trap on denormals. */
31478 if (flag_trapping_math || !TARGET_ROUND)
31479 break;
31480
31481 if (out_mode == SImode && in_mode == DFmode)
31482 {
31483 if (out_n == 4 && in_n == 2)
31484 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
31485 else if (out_n == 8 && in_n == 4)
31486 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
31487 }
31488 break;
31489
31490 case BUILT_IN_IROUNDF:
31491 case BUILT_IN_LROUNDF:
31492 case BUILT_IN_LLROUNDF:
31493 /* The round insn does not trap on denormals. */
31494 if (flag_trapping_math || !TARGET_ROUND)
31495 break;
31496
31497 if (out_mode == SImode && in_mode == SFmode)
31498 {
31499 if (out_n == 4 && in_n == 4)
31500 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
31501 else if (out_n == 8 && in_n == 8)
31502 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
31503 }
31504 break;
31505
31506 case BUILT_IN_COPYSIGN:
31507 if (out_mode == DFmode && in_mode == DFmode)
31508 {
31509 if (out_n == 2 && in_n == 2)
31510 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
31511 else if (out_n == 4 && in_n == 4)
31512 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
31513 }
31514 break;
31515
31516 case BUILT_IN_COPYSIGNF:
31517 if (out_mode == SFmode && in_mode == SFmode)
31518 {
31519 if (out_n == 4 && in_n == 4)
31520 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
31521 else if (out_n == 8 && in_n == 8)
31522 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
31523 }
31524 break;
31525
31526 case BUILT_IN_FLOOR:
31527 /* The round insn does not trap on denormals. */
31528 if (flag_trapping_math || !TARGET_ROUND)
31529 break;
31530
31531 if (out_mode == DFmode && in_mode == DFmode)
31532 {
31533 if (out_n == 2 && in_n == 2)
31534 return ix86_builtins[IX86_BUILTIN_FLOORPD];
31535 else if (out_n == 4 && in_n == 4)
31536 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
31537 }
31538 break;
31539
31540 case BUILT_IN_FLOORF:
31541 /* The round insn does not trap on denormals. */
31542 if (flag_trapping_math || !TARGET_ROUND)
31543 break;
31544
31545 if (out_mode == SFmode && in_mode == SFmode)
31546 {
31547 if (out_n == 4 && in_n == 4)
31548 return ix86_builtins[IX86_BUILTIN_FLOORPS];
31549 else if (out_n == 8 && in_n == 8)
31550 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
31551 }
31552 break;
31553
31554 case BUILT_IN_CEIL:
31555 /* The round insn does not trap on denormals. */
31556 if (flag_trapping_math || !TARGET_ROUND)
31557 break;
31558
31559 if (out_mode == DFmode && in_mode == DFmode)
31560 {
31561 if (out_n == 2 && in_n == 2)
31562 return ix86_builtins[IX86_BUILTIN_CEILPD];
31563 else if (out_n == 4 && in_n == 4)
31564 return ix86_builtins[IX86_BUILTIN_CEILPD256];
31565 }
31566 break;
31567
31568 case BUILT_IN_CEILF:
31569 /* The round insn does not trap on denormals. */
31570 if (flag_trapping_math || !TARGET_ROUND)
31571 break;
31572
31573 if (out_mode == SFmode && in_mode == SFmode)
31574 {
31575 if (out_n == 4 && in_n == 4)
31576 return ix86_builtins[IX86_BUILTIN_CEILPS];
31577 else if (out_n == 8 && in_n == 8)
31578 return ix86_builtins[IX86_BUILTIN_CEILPS256];
31579 }
31580 break;
31581
31582 case BUILT_IN_TRUNC:
31583 /* The round insn does not trap on denormals. */
31584 if (flag_trapping_math || !TARGET_ROUND)
31585 break;
31586
31587 if (out_mode == DFmode && in_mode == DFmode)
31588 {
31589 if (out_n == 2 && in_n == 2)
31590 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
31591 else if (out_n == 4 && in_n == 4)
31592 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
31593 }
31594 break;
31595
31596 case BUILT_IN_TRUNCF:
31597 /* The round insn does not trap on denormals. */
31598 if (flag_trapping_math || !TARGET_ROUND)
31599 break;
31600
31601 if (out_mode == SFmode && in_mode == SFmode)
31602 {
31603 if (out_n == 4 && in_n == 4)
31604 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
31605 else if (out_n == 8 && in_n == 8)
31606 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
31607 }
31608 break;
31609
31610 case BUILT_IN_RINT:
31611 /* The round insn does not trap on denormals. */
31612 if (flag_trapping_math || !TARGET_ROUND)
31613 break;
31614
31615 if (out_mode == DFmode && in_mode == DFmode)
31616 {
31617 if (out_n == 2 && in_n == 2)
31618 return ix86_builtins[IX86_BUILTIN_RINTPD];
31619 else if (out_n == 4 && in_n == 4)
31620 return ix86_builtins[IX86_BUILTIN_RINTPD256];
31621 }
31622 break;
31623
31624 case BUILT_IN_RINTF:
31625 /* The round insn does not trap on denormals. */
31626 if (flag_trapping_math || !TARGET_ROUND)
31627 break;
31628
31629 if (out_mode == SFmode && in_mode == SFmode)
31630 {
31631 if (out_n == 4 && in_n == 4)
31632 return ix86_builtins[IX86_BUILTIN_RINTPS];
31633 else if (out_n == 8 && in_n == 8)
31634 return ix86_builtins[IX86_BUILTIN_RINTPS256];
31635 }
31636 break;
31637
31638 case BUILT_IN_ROUND:
31639 /* The round insn does not trap on denormals. */
31640 if (flag_trapping_math || !TARGET_ROUND)
31641 break;
31642
31643 if (out_mode == DFmode && in_mode == DFmode)
31644 {
31645 if (out_n == 2 && in_n == 2)
31646 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
31647 else if (out_n == 4 && in_n == 4)
31648 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
31649 }
31650 break;
31651
31652 case BUILT_IN_ROUNDF:
31653 /* The round insn does not trap on denormals. */
31654 if (flag_trapping_math || !TARGET_ROUND)
31655 break;
31656
31657 if (out_mode == SFmode && in_mode == SFmode)
31658 {
31659 if (out_n == 4 && in_n == 4)
31660 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
31661 else if (out_n == 8 && in_n == 8)
31662 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
31663 }
31664 break;
31665
31666 case BUILT_IN_FMA:
31667 if (out_mode == DFmode && in_mode == DFmode)
31668 {
31669 if (out_n == 2 && in_n == 2)
31670 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
31671 if (out_n == 4 && in_n == 4)
31672 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
31673 }
31674 break;
31675
31676 case BUILT_IN_FMAF:
31677 if (out_mode == SFmode && in_mode == SFmode)
31678 {
31679 if (out_n == 4 && in_n == 4)
31680 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
31681 if (out_n == 8 && in_n == 8)
31682 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
31683 }
31684 break;
31685
31686 default:
31687 break;
31688 }
31689
31690 /* Dispatch to a handler for a vectorization library. */
31691 if (ix86_veclib_handler)
31692 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
31693 type_in);
31694
31695 return NULL_TREE;
31696 }
31697
31698 /* Handler for an SVML-style interface to
31699 a library with vectorized intrinsics. */
31700
31701 static tree
31702 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
31703 {
31704 char name[20];
31705 tree fntype, new_fndecl, args;
31706 unsigned arity;
31707 const char *bname;
31708 enum machine_mode el_mode, in_mode;
31709 int n, in_n;
31710
 31711   /* The SVML library is suitable for unsafe math only. */
31712 if (!flag_unsafe_math_optimizations)
31713 return NULL_TREE;
31714
31715 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31716 n = TYPE_VECTOR_SUBPARTS (type_out);
31717 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31718 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31719 if (el_mode != in_mode
31720 || n != in_n)
31721 return NULL_TREE;
31722
31723 switch (fn)
31724 {
31725 case BUILT_IN_EXP:
31726 case BUILT_IN_LOG:
31727 case BUILT_IN_LOG10:
31728 case BUILT_IN_POW:
31729 case BUILT_IN_TANH:
31730 case BUILT_IN_TAN:
31731 case BUILT_IN_ATAN:
31732 case BUILT_IN_ATAN2:
31733 case BUILT_IN_ATANH:
31734 case BUILT_IN_CBRT:
31735 case BUILT_IN_SINH:
31736 case BUILT_IN_SIN:
31737 case BUILT_IN_ASINH:
31738 case BUILT_IN_ASIN:
31739 case BUILT_IN_COSH:
31740 case BUILT_IN_COS:
31741 case BUILT_IN_ACOSH:
31742 case BUILT_IN_ACOS:
31743 if (el_mode != DFmode || n != 2)
31744 return NULL_TREE;
31745 break;
31746
31747 case BUILT_IN_EXPF:
31748 case BUILT_IN_LOGF:
31749 case BUILT_IN_LOG10F:
31750 case BUILT_IN_POWF:
31751 case BUILT_IN_TANHF:
31752 case BUILT_IN_TANF:
31753 case BUILT_IN_ATANF:
31754 case BUILT_IN_ATAN2F:
31755 case BUILT_IN_ATANHF:
31756 case BUILT_IN_CBRTF:
31757 case BUILT_IN_SINHF:
31758 case BUILT_IN_SINF:
31759 case BUILT_IN_ASINHF:
31760 case BUILT_IN_ASINF:
31761 case BUILT_IN_COSHF:
31762 case BUILT_IN_COSF:
31763 case BUILT_IN_ACOSHF:
31764 case BUILT_IN_ACOSF:
31765 if (el_mode != SFmode || n != 4)
31766 return NULL_TREE;
31767 break;
31768
31769 default:
31770 return NULL_TREE;
31771 }
31772
31773 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31774
31775 if (fn == BUILT_IN_LOGF)
31776 strcpy (name, "vmlsLn4");
31777 else if (fn == BUILT_IN_LOG)
31778 strcpy (name, "vmldLn2");
31779 else if (n == 4)
31780 {
31781 sprintf (name, "vmls%s", bname+10);
31782 name[strlen (name)-1] = '4';
31783 }
31784 else
31785 sprintf (name, "vmld%s2", bname+10);
31786
31787 /* Convert to uppercase. */
31788 name[4] &= ~0x20;
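  /* The result is the SVML entry point name, e.g. "vmlsSin4" for sinf on
     V4SFmode and "vmldPow2" for pow on V2DFmode. */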
31789
31790 arity = 0;
31791 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31792 args;
31793 args = TREE_CHAIN (args))
31794 arity++;
31795
31796 if (arity == 1)
31797 fntype = build_function_type_list (type_out, type_in, NULL);
31798 else
31799 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31800
31801 /* Build a function declaration for the vectorized function. */
31802 new_fndecl = build_decl (BUILTINS_LOCATION,
31803 FUNCTION_DECL, get_identifier (name), fntype);
31804 TREE_PUBLIC (new_fndecl) = 1;
31805 DECL_EXTERNAL (new_fndecl) = 1;
31806 DECL_IS_NOVOPS (new_fndecl) = 1;
31807 TREE_READONLY (new_fndecl) = 1;
31808
31809 return new_fndecl;
31810 }
31811
31812 /* Handler for an ACML-style interface to
31813 a library with vectorized intrinsics. */
31814
31815 static tree
31816 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
31817 {
31818 char name[20] = "__vr.._";
31819 tree fntype, new_fndecl, args;
31820 unsigned arity;
31821 const char *bname;
31822 enum machine_mode el_mode, in_mode;
31823 int n, in_n;
31824
 31825   /* The ACML library is 64-bit only and suitable for unsafe math only,
 31826      as it does not correctly support parts of IEEE (such as denormals)
 31827      with the required precision. */
31828 if (!TARGET_64BIT
31829 || !flag_unsafe_math_optimizations)
31830 return NULL_TREE;
31831
31832 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31833 n = TYPE_VECTOR_SUBPARTS (type_out);
31834 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31835 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31836 if (el_mode != in_mode
31837 || n != in_n)
31838 return NULL_TREE;
31839
31840 switch (fn)
31841 {
31842 case BUILT_IN_SIN:
31843 case BUILT_IN_COS:
31844 case BUILT_IN_EXP:
31845 case BUILT_IN_LOG:
31846 case BUILT_IN_LOG2:
31847 case BUILT_IN_LOG10:
31848 name[4] = 'd';
31849 name[5] = '2';
31850 if (el_mode != DFmode
31851 || n != 2)
31852 return NULL_TREE;
31853 break;
31854
31855 case BUILT_IN_SINF:
31856 case BUILT_IN_COSF:
31857 case BUILT_IN_EXPF:
31858 case BUILT_IN_POWF:
31859 case BUILT_IN_LOGF:
31860 case BUILT_IN_LOG2F:
31861 case BUILT_IN_LOG10F:
31862 name[4] = 's';
31863 name[5] = '4';
31864 if (el_mode != SFmode
31865 || n != 4)
31866 return NULL_TREE;
31867 break;
31868
31869 default:
31870 return NULL_TREE;
31871 }
31872
31873 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31874 sprintf (name + 7, "%s", bname+10);
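  /* The result is the ACML entry point name, e.g. "__vrd2_sin" for sin on
     V2DFmode and "__vrs4_sinf" for sinf on V4SFmode. */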
31875
31876 arity = 0;
31877 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31878 args;
31879 args = TREE_CHAIN (args))
31880 arity++;
31881
31882 if (arity == 1)
31883 fntype = build_function_type_list (type_out, type_in, NULL);
31884 else
31885 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31886
31887 /* Build a function declaration for the vectorized function. */
31888 new_fndecl = build_decl (BUILTINS_LOCATION,
31889 FUNCTION_DECL, get_identifier (name), fntype);
31890 TREE_PUBLIC (new_fndecl) = 1;
31891 DECL_EXTERNAL (new_fndecl) = 1;
31892 DECL_IS_NOVOPS (new_fndecl) = 1;
31893 TREE_READONLY (new_fndecl) = 1;
31894
31895 return new_fndecl;
31896 }
31897
 31898 /* Returns a decl of a function that implements a gather load with
 31899    memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
 31900    Return NULL_TREE if it is not available. */
31901
31902 static tree
31903 ix86_vectorize_builtin_gather (const_tree mem_vectype,
31904 const_tree index_type, int scale)
31905 {
31906 bool si;
31907 enum ix86_builtins code;
31908
31909 if (! TARGET_AVX2)
31910 return NULL_TREE;
31911
31912 if ((TREE_CODE (index_type) != INTEGER_TYPE
31913 && !POINTER_TYPE_P (index_type))
31914 || (TYPE_MODE (index_type) != SImode
31915 && TYPE_MODE (index_type) != DImode))
31916 return NULL_TREE;
31917
31918 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
31919 return NULL_TREE;
31920
31921 /* v*gather* insn sign extends index to pointer mode. */
31922 if (TYPE_PRECISION (index_type) < POINTER_SIZE
31923 && TYPE_UNSIGNED (index_type))
31924 return NULL_TREE;
31925
31926 if (scale <= 0
31927 || scale > 8
31928 || (scale & (scale - 1)) != 0)
31929 return NULL_TREE;
31930
31931 si = TYPE_MODE (index_type) == SImode;
31932 switch (TYPE_MODE (mem_vectype))
31933 {
31934 case V2DFmode:
31935 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
31936 break;
31937 case V4DFmode:
31938 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
31939 break;
31940 case V2DImode:
31941 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
31942 break;
31943 case V4DImode:
31944 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
31945 break;
31946 case V4SFmode:
31947 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
31948 break;
31949 case V8SFmode:
31950 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
31951 break;
31952 case V4SImode:
31953 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
31954 break;
31955 case V8SImode:
31956 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
31957 break;
31958 default:
31959 return NULL_TREE;
31960 }
31961
31962 return ix86_builtins[code];
31963 }
31964
 31965 /* Returns a decl of a target-specific builtin that implements the
 31966    reciprocal of the function FN, or NULL_TREE if not available. */
31967
31968 static tree
31969 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31970 bool sqrt ATTRIBUTE_UNUSED)
31971 {
31972 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31973 && flag_finite_math_only && !flag_trapping_math
31974 && flag_unsafe_math_optimizations))
31975 return NULL_TREE;
31976
31977 if (md_fn)
31978 /* Machine dependent builtins. */
31979 switch (fn)
31980 {
31981 /* Vectorized version of sqrt to rsqrt conversion. */
31982 case IX86_BUILTIN_SQRTPS_NR:
31983 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31984
31985 case IX86_BUILTIN_SQRTPS_NR256:
31986 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31987
31988 default:
31989 return NULL_TREE;
31990 }
31991 else
31992 /* Normal builtins. */
31993 switch (fn)
31994 {
31995 /* Sqrt to rsqrt conversion. */
31996 case BUILT_IN_SQRTF:
31997 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31998
31999 default:
32000 return NULL_TREE;
32001 }
32002 }
32003 \f
32004 /* Helper for avx_vpermilps256_operand et al. This is also used by
32005 the expansion functions to turn the parallel back into a mask.
32006 The return value is 0 for no match and the imm8+1 for a match. */
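
/* For example, in V4SFmode each element index occupies two bits of the
   immediate, so the parallel (1 0 3 2) encodes the imm8 0xb1 and the
   function returns 0xb2. */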
32007
32008 int
32009 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32010 {
32011 unsigned i, nelt = GET_MODE_NUNITS (mode);
32012 unsigned mask = 0;
32013 unsigned char ipar[8];
32014
32015 if (XVECLEN (par, 0) != (int) nelt)
32016 return 0;
32017
32018 /* Validate that all of the elements are constants, and not totally
32019 out of range. Copy the data into an integral array to make the
32020 subsequent checks easier. */
32021 for (i = 0; i < nelt; ++i)
32022 {
32023 rtx er = XVECEXP (par, 0, i);
32024 unsigned HOST_WIDE_INT ei;
32025
32026 if (!CONST_INT_P (er))
32027 return 0;
32028 ei = INTVAL (er);
32029 if (ei >= nelt)
32030 return 0;
32031 ipar[i] = ei;
32032 }
32033
32034 switch (mode)
32035 {
32036 case V4DFmode:
32037 /* In the 256-bit DFmode case, we can only move elements within
32038 a 128-bit lane. */
32039 for (i = 0; i < 2; ++i)
32040 {
32041 if (ipar[i] >= 2)
32042 return 0;
32043 mask |= ipar[i] << i;
32044 }
32045 for (i = 2; i < 4; ++i)
32046 {
32047 if (ipar[i] < 2)
32048 return 0;
32049 mask |= (ipar[i] - 2) << i;
32050 }
32051 break;
32052
32053 case V8SFmode:
32054 /* In the 256-bit SFmode case, we have full freedom of movement
32055 within the low 128-bit lane, but the high 128-bit lane must
32056 mirror the exact same pattern. */
32057 for (i = 0; i < 4; ++i)
32058 if (ipar[i] + 4 != ipar[i + 4])
32059 return 0;
32060 nelt = 4;
32061 /* FALLTHRU */
32062
32063 case V2DFmode:
32064 case V4SFmode:
 32065 	  /* In the 128-bit case, we have full freedom in the placement of
 32066 	     the elements from the source operand. */
32067 for (i = 0; i < nelt; ++i)
32068 mask |= ipar[i] << (i * (nelt / 2));
32069 break;
32070
32071 default:
32072 gcc_unreachable ();
32073 }
32074
32075 /* Make sure success has a non-zero value by adding one. */
32076 return mask + 1;
32077 }
32078
32079 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
32080 the expansion functions to turn the parallel back into a mask.
32081 The return value is 0 for no match and the imm8+1 for a match. */
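
/* For example, a V8SFmode parallel (8 9 10 11 4 5 6 7) takes its low result
   half from 128-bit lane 2 and its high half from lane 1 of the concatenated
   sources, encoding the imm8 0x12; the function returns 0x13. */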
32082
32083 int
32084 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
32085 {
32086 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
32087 unsigned mask = 0;
32088 unsigned char ipar[8];
32089
32090 if (XVECLEN (par, 0) != (int) nelt)
32091 return 0;
32092
32093 /* Validate that all of the elements are constants, and not totally
32094 out of range. Copy the data into an integral array to make the
32095 subsequent checks easier. */
32096 for (i = 0; i < nelt; ++i)
32097 {
32098 rtx er = XVECEXP (par, 0, i);
32099 unsigned HOST_WIDE_INT ei;
32100
32101 if (!CONST_INT_P (er))
32102 return 0;
32103 ei = INTVAL (er);
32104 if (ei >= 2 * nelt)
32105 return 0;
32106 ipar[i] = ei;
32107 }
32108
 32109   /* Validate that each half of the permute selects consecutive elements. */
32110 for (i = 0; i < nelt2 - 1; ++i)
32111 if (ipar[i] + 1 != ipar[i + 1])
32112 return 0;
32113 for (i = nelt2; i < nelt - 1; ++i)
32114 if (ipar[i] + 1 != ipar[i + 1])
32115 return 0;
32116
32117 /* Reconstruct the mask. */
32118 for (i = 0; i < 2; ++i)
32119 {
32120 unsigned e = ipar[i * nelt2];
32121 if (e % nelt2)
32122 return 0;
32123 e /= nelt2;
32124 mask |= e << (i * 4);
32125 }
32126
32127 /* Make sure success has a non-zero value by adding one. */
32128 return mask + 1;
32129 }
32130 \f
32131 /* Store OPERAND to the memory after reload is completed. This means
32132 that we can't easily use assign_stack_local. */
32133 rtx
32134 ix86_force_to_memory (enum machine_mode mode, rtx operand)
32135 {
32136 rtx result;
32137
32138 gcc_assert (reload_completed);
32139 if (ix86_using_red_zone ())
32140 {
32141 result = gen_rtx_MEM (mode,
32142 gen_rtx_PLUS (Pmode,
32143 stack_pointer_rtx,
32144 GEN_INT (-RED_ZONE_SIZE)));
32145 emit_move_insn (result, operand);
32146 }
32147 else if (TARGET_64BIT)
32148 {
32149 switch (mode)
32150 {
32151 case HImode:
32152 case SImode:
32153 operand = gen_lowpart (DImode, operand);
32154 /* FALLTHRU */
32155 case DImode:
32156 emit_insn (
32157 gen_rtx_SET (VOIDmode,
32158 gen_rtx_MEM (DImode,
32159 gen_rtx_PRE_DEC (DImode,
32160 stack_pointer_rtx)),
32161 operand));
32162 break;
32163 default:
32164 gcc_unreachable ();
32165 }
32166 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32167 }
32168 else
32169 {
32170 switch (mode)
32171 {
32172 case DImode:
32173 {
32174 rtx operands[2];
32175 split_double_mode (mode, &operand, 1, operands, operands + 1);
32176 emit_insn (
32177 gen_rtx_SET (VOIDmode,
32178 gen_rtx_MEM (SImode,
32179 gen_rtx_PRE_DEC (Pmode,
32180 stack_pointer_rtx)),
32181 operands[1]));
32182 emit_insn (
32183 gen_rtx_SET (VOIDmode,
32184 gen_rtx_MEM (SImode,
32185 gen_rtx_PRE_DEC (Pmode,
32186 stack_pointer_rtx)),
32187 operands[0]));
32188 }
32189 break;
32190 case HImode:
32191 /* Store HImodes as SImodes. */
32192 operand = gen_lowpart (SImode, operand);
32193 /* FALLTHRU */
32194 case SImode:
32195 emit_insn (
32196 gen_rtx_SET (VOIDmode,
32197 gen_rtx_MEM (GET_MODE (operand),
32198 gen_rtx_PRE_DEC (SImode,
32199 stack_pointer_rtx)),
32200 operand));
32201 break;
32202 default:
32203 gcc_unreachable ();
32204 }
32205 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32206 }
32207 return result;
32208 }
32209
32210 /* Free operand from the memory. */
32211 void
32212 ix86_free_from_memory (enum machine_mode mode)
32213 {
32214 if (!ix86_using_red_zone ())
32215 {
32216 int size;
32217
32218 if (mode == DImode || TARGET_64BIT)
32219 size = 8;
32220 else
32221 size = 4;
 32222       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
 32223 	 to a pop or add instruction if registers are available. */
32224 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
32225 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
32226 GEN_INT (size))));
32227 }
32228 }
32229
32230 /* Return true if we use LRA instead of reload pass. */
32231 static bool
32232 ix86_lra_p (void)
32233 {
32234 return true;
32235 }
32236
32237 /* Return a register priority for hard reg REGNO. */
32238 static int
32239 ix86_register_priority (int hard_regno)
32240 {
 32241   /* ebp and r13 as the base always want a displacement, and r12 as the
 32242      base always wants an index.  So discourage their use in an
 32243      address. */
32244 if (hard_regno == R12_REG || hard_regno == R13_REG)
32245 return 0;
32246 if (hard_regno == BP_REG)
32247 return 1;
32248 /* New x86-64 int registers result in bigger code size. Discourage
32249 them. */
32250 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
32251 return 2;
32252 /* New x86-64 SSE registers result in bigger code size. Discourage
32253 them. */
32254 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
32255 return 2;
32256 /* Usage of AX register results in smaller code. Prefer it. */
32257 if (hard_regno == 0)
32258 return 4;
32259 return 3;
32260 }
32261
32262 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
32263
32264 Put float CONST_DOUBLE in the constant pool instead of fp regs.
32265 QImode must go into class Q_REGS.
32266 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
32267 movdf to do mem-to-mem moves through integer regs. */
32268
32269 static reg_class_t
32270 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
32271 {
32272 enum machine_mode mode = GET_MODE (x);
32273
32274 /* We're only allowed to return a subclass of CLASS. Many of the
32275 following checks fail for NO_REGS, so eliminate that early. */
32276 if (regclass == NO_REGS)
32277 return NO_REGS;
32278
32279 /* All classes can load zeros. */
32280 if (x == CONST0_RTX (mode))
32281 return regclass;
32282
32283 /* Force constants into memory if we are loading a (nonzero) constant into
32284 an MMX or SSE register. This is because there are no MMX/SSE instructions
32285 to load from a constant. */
32286 if (CONSTANT_P (x)
32287 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
32288 return NO_REGS;
32289
32290 /* Prefer SSE regs only, if we can use them for math. */
32291 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
32292 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
32293
32294 /* Floating-point constants need more complex checks. */
32295 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
32296 {
32297 /* General regs can load everything. */
32298 if (reg_class_subset_p (regclass, GENERAL_REGS))
32299 return regclass;
32300
32301 /* Floats can load 0 and 1 plus some others. Note that we eliminated
32302 zero above. We only want to wind up preferring 80387 registers if
32303 we plan on doing computation with them. */
32304 if (TARGET_80387
32305 && standard_80387_constant_p (x) > 0)
32306 {
32307 /* Limit class to non-sse. */
32308 if (regclass == FLOAT_SSE_REGS)
32309 return FLOAT_REGS;
32310 if (regclass == FP_TOP_SSE_REGS)
32311 return FP_TOP_REG;
32312 if (regclass == FP_SECOND_SSE_REGS)
32313 return FP_SECOND_REG;
32314 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
32315 return regclass;
32316 }
32317
32318 return NO_REGS;
32319 }
32320
32321 /* Generally when we see PLUS here, it's the function invariant
32322 (plus soft-fp const_int). Which can only be computed into general
32323 regs. */
32324 if (GET_CODE (x) == PLUS)
32325 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
32326
32327 /* QImode constants are easy to load, but non-constant QImode data
32328 must go into Q_REGS. */
32329 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
32330 {
32331 if (reg_class_subset_p (regclass, Q_REGS))
32332 return regclass;
32333 if (reg_class_subset_p (Q_REGS, regclass))
32334 return Q_REGS;
32335 return NO_REGS;
32336 }
32337
32338 return regclass;
32339 }
32340
32341 /* Discourage putting floating-point values in SSE registers unless
32342 SSE math is being used, and likewise for the 387 registers. */
32343 static reg_class_t
32344 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
32345 {
32346 enum machine_mode mode = GET_MODE (x);
32347
32348 /* Restrict the output reload class to the register bank that we are doing
32349 math on. If we would like not to return a subset of CLASS, reject this
32350 alternative: if reload cannot do this, it will still use its choice. */
32351 mode = GET_MODE (x);
32352 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
32353 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
32354
32355 if (X87_FLOAT_MODE_P (mode))
32356 {
32357 if (regclass == FP_TOP_SSE_REGS)
32358 return FP_TOP_REG;
32359 else if (regclass == FP_SECOND_SSE_REGS)
32360 return FP_SECOND_REG;
32361 else
32362 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
32363 }
32364
32365 return regclass;
32366 }
32367
32368 static reg_class_t
32369 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
32370 enum machine_mode mode, secondary_reload_info *sri)
32371 {
32372 /* Double-word spills from general registers to non-offsettable memory
32373 references (zero-extended addresses) require special handling. */
32374 if (TARGET_64BIT
32375 && MEM_P (x)
32376 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
32377 && rclass == GENERAL_REGS
32378 && !offsettable_memref_p (x))
32379 {
32380 sri->icode = (in_p
32381 ? CODE_FOR_reload_noff_load
32382 : CODE_FOR_reload_noff_store);
32383 /* Add the cost of moving address to a temporary. */
32384 sri->extra_cost = 1;
32385
32386 return NO_REGS;
32387 }
32388
32389 /* QImode spills from non-QI registers require
32390 intermediate register on 32bit targets. */
32391 if (!TARGET_64BIT
32392 && !in_p && mode == QImode
32393 && (rclass == GENERAL_REGS
32394 || rclass == LEGACY_REGS
32395 || rclass == NON_Q_REGS
32396 || rclass == SIREG
32397 || rclass == DIREG
32398 || rclass == INDEX_REGS))
32399 {
32400 int regno;
32401
32402 if (REG_P (x))
32403 regno = REGNO (x);
32404 else
32405 regno = -1;
32406
32407 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
32408 regno = true_regnum (x);
32409
32410 /* Return Q_REGS if the operand is in memory. */
32411 if (regno == -1)
32412 return Q_REGS;
32413 }
32414
32415 /* This condition handles corner case where an expression involving
32416 pointers gets vectorized. We're trying to use the address of a
32417 stack slot as a vector initializer.
32418
32419 (set (reg:V2DI 74 [ vect_cst_.2 ])
32420 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
32421
32422 Eventually frame gets turned into sp+offset like this:
32423
32424 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32425 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32426 (const_int 392 [0x188]))))
32427
32428 That later gets turned into:
32429
32430 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32431 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32432 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
32433
32434 We'll have the following reload recorded:
32435
32436 Reload 0: reload_in (DI) =
32437 (plus:DI (reg/f:DI 7 sp)
32438 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
32439 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32440 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
32441 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
32442 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32443 reload_reg_rtx: (reg:V2DI 22 xmm1)
32444
32445 Which isn't going to work since SSE instructions can't handle scalar
32446 additions. Returning GENERAL_REGS forces the addition into integer
32447 register and reload can handle subsequent reloads without problems. */
32448
32449 if (in_p && GET_CODE (x) == PLUS
32450 && SSE_CLASS_P (rclass)
32451 && SCALAR_INT_MODE_P (mode))
32452 return GENERAL_REGS;
32453
32454 return NO_REGS;
32455 }
32456
32457 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
32458
32459 static bool
32460 ix86_class_likely_spilled_p (reg_class_t rclass)
32461 {
32462 switch (rclass)
32463 {
32464 case AREG:
32465 case DREG:
32466 case CREG:
32467 case BREG:
32468 case AD_REGS:
32469 case SIREG:
32470 case DIREG:
32471 case SSE_FIRST_REG:
32472 case FP_TOP_REG:
32473 case FP_SECOND_REG:
32474 return true;
32475
32476 default:
32477 break;
32478 }
32479
32480 return false;
32481 }
32482
32483 /* If we are copying between general and FP registers, we need a memory
32484 location. The same is true for SSE and MMX registers.
32485
32486 To optimize register_move_cost performance, allow inline variant.
32487
 32488    The macro can't work reliably when one of the CLASSES is a class containing
 32489    registers from multiple units (SSE, MMX, integer).  We avoid this by never
 32490    combining those units in a single alternative in the machine description.
 32491    Ensure that this constraint holds to avoid unexpected surprises.
32492
32493 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
32494 enforce these sanity checks. */
32495
32496 static inline bool
32497 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32498 enum machine_mode mode, int strict)
32499 {
32500 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
32501 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
32502 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
32503 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
32504 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
32505 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
32506 {
32507 gcc_assert (!strict || lra_in_progress);
32508 return true;
32509 }
32510
32511 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
32512 return true;
32513
 32514   /* ??? This is a lie.  We do have moves between mmx/general, and between
 32515      mmx/sse2.  But by saying we need secondary memory we discourage the
 32516      register allocator from using the mmx registers unless needed. */
32517 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
32518 return true;
32519
32520 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32521 {
32522 /* SSE1 doesn't have any direct moves from other classes. */
32523 if (!TARGET_SSE2)
32524 return true;
32525
32526 /* If the target says that inter-unit moves are more expensive
32527 than moving through memory, then don't generate them. */
32528 if (!TARGET_INTER_UNIT_MOVES)
32529 return true;
32530
32531 /* Between SSE and general, we have moves no larger than word size. */
32532 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32533 return true;
32534 }
32535
32536 return false;
32537 }
32538
32539 bool
32540 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32541 enum machine_mode mode, int strict)
32542 {
32543 return inline_secondary_memory_needed (class1, class2, mode, strict);
32544 }
32545
32546 /* Implement the TARGET_CLASS_MAX_NREGS hook.
32547
32548 On the 80386, this is the size of MODE in words,
32549 except in the FP regs, where a single reg is always enough. */
32550
32551 static unsigned char
32552 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
32553 {
32554 if (MAYBE_INTEGER_CLASS_P (rclass))
32555 {
32556 if (mode == XFmode)
32557 return (TARGET_64BIT ? 2 : 3);
32558 else if (mode == XCmode)
32559 return (TARGET_64BIT ? 4 : 6);
32560 else
32561 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
32562 }
32563 else
32564 {
32565 if (COMPLEX_MODE_P (mode))
32566 return 2;
32567 else
32568 return 1;
32569 }
32570 }
32571
32572 /* Return true if the registers in CLASS cannot represent the change from
32573 modes FROM to TO. */
32574
32575 bool
32576 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
32577 enum reg_class regclass)
32578 {
32579 if (from == to)
32580 return false;
32581
32582 /* x87 registers can't do subreg at all, as all values are reformatted
32583 to extended precision. */
32584 if (MAYBE_FLOAT_CLASS_P (regclass))
32585 return true;
32586
32587 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
32588 {
32589 /* Vector registers do not support QI or HImode loads. If we don't
32590 disallow a change to these modes, reload will assume it's ok to
32591 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
32592 the vec_dupv4hi pattern. */
32593 if (GET_MODE_SIZE (from) < 4)
32594 return true;
32595
32596 /* Vector registers do not support subreg with nonzero offsets, which
32597 are otherwise valid for integer registers. Since we can't see
32598 whether we have a nonzero offset from here, prohibit all
32599 nonparadoxical subregs changing size. */
32600 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
32601 return true;
32602 }
32603
32604 return false;
32605 }
32606
32607 /* Return the cost of moving data of mode M between a
32608 register and memory. A value of 2 is the default; this cost is
32609 relative to those in `REGISTER_MOVE_COST'.
32610
 32611    This function is used extensively by register_move_cost, which is used to
 32612    build tables at startup, so make it inline.
 32613    When IN is 2, return the maximum of the in and out move costs.
32614
32615 If moving between registers and memory is more expensive than
32616 between two registers, you should define this macro to express the
32617 relative cost.
32618
 32619    Also model the increased cost of moving QImode values in register
 32620    classes other than Q_REGS.
 32621  */
32622 static inline int
32623 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
32624 int in)
32625 {
32626 int cost;
32627 if (FLOAT_CLASS_P (regclass))
32628 {
32629 int index;
32630 switch (mode)
32631 {
32632 case SFmode:
32633 index = 0;
32634 break;
32635 case DFmode:
32636 index = 1;
32637 break;
32638 case XFmode:
32639 index = 2;
32640 break;
32641 default:
32642 return 100;
32643 }
32644 if (in == 2)
32645 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
32646 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
32647 }
32648 if (SSE_CLASS_P (regclass))
32649 {
32650 int index;
32651 switch (GET_MODE_SIZE (mode))
32652 {
32653 case 4:
32654 index = 0;
32655 break;
32656 case 8:
32657 index = 1;
32658 break;
32659 case 16:
32660 index = 2;
32661 break;
32662 default:
32663 return 100;
32664 }
32665 if (in == 2)
32666 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
32667 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
32668 }
32669 if (MMX_CLASS_P (regclass))
32670 {
32671 int index;
32672 switch (GET_MODE_SIZE (mode))
32673 {
32674 case 4:
32675 index = 0;
32676 break;
32677 case 8:
32678 index = 1;
32679 break;
32680 default:
32681 return 100;
32682 }
 32683       if (in == 2)
32684 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
32685 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
32686 }
32687 switch (GET_MODE_SIZE (mode))
32688 {
32689 case 1:
32690 if (Q_CLASS_P (regclass) || TARGET_64BIT)
32691 {
32692 if (!in)
32693 return ix86_cost->int_store[0];
32694 if (TARGET_PARTIAL_REG_DEPENDENCY
32695 && optimize_function_for_speed_p (cfun))
32696 cost = ix86_cost->movzbl_load;
32697 else
32698 cost = ix86_cost->int_load[0];
32699 if (in == 2)
32700 return MAX (cost, ix86_cost->int_store[0]);
32701 return cost;
32702 }
32703 else
32704 {
32705 if (in == 2)
32706 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
32707 if (in)
32708 return ix86_cost->movzbl_load;
32709 else
32710 return ix86_cost->int_store[0] + 4;
32711 }
32712 break;
32713 case 2:
32714 if (in == 2)
32715 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
32716 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
32717 default:
32718 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
32719 if (mode == TFmode)
32720 mode = XFmode;
32721 if (in == 2)
32722 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
32723 else if (in)
32724 cost = ix86_cost->int_load[2];
32725 else
32726 cost = ix86_cost->int_store[2];
32727 return (cost * (((int) GET_MODE_SIZE (mode)
32728 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
32729 }
32730 }
32731
32732 static int
32733 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
32734 bool in)
32735 {
32736 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
32737 }
32738
32739
32740 /* Return the cost of moving data from a register in class CLASS1 to
32741 one in class CLASS2.
32742
32743 It is not required that the cost always equal 2 when FROM is the same as TO;
32744 on some machines it is expensive to move between registers if they are not
32745 general registers. */
32746
32747 static int
32748 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
32749 reg_class_t class2_i)
32750 {
32751 enum reg_class class1 = (enum reg_class) class1_i;
32752 enum reg_class class2 = (enum reg_class) class2_i;
32753
 32754   /* In case we require secondary memory, compute the cost of the store
 32755      followed by the load.  In order to avoid bad register allocation choices,
 32756      this needs to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
32757
32758 if (inline_secondary_memory_needed (class1, class2, mode, 0))
32759 {
32760 int cost = 1;
32761
32762 cost += inline_memory_move_cost (mode, class1, 2);
32763 cost += inline_memory_move_cost (mode, class2, 2);
32764
 32765       /* When copying from a general purpose register we may emit multiple
 32766 	 stores followed by a single load, causing a memory size mismatch
 32767 	 stall.  Count this as an arbitrarily high cost of 20. */
32768 if (targetm.class_max_nregs (class1, mode)
32769 > targetm.class_max_nregs (class2, mode))
32770 cost += 20;
32771
32772 /* In the case of FP/MMX moves, the registers actually overlap, and we
32773 have to switch modes in order to treat them differently. */
32774 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
32775 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
32776 cost += 20;
32777
32778 return cost;
32779 }
32780
32781 /* Moves between SSE/MMX and integer unit are expensive. */
32782 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
32783 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32784
 32785     /* ??? By keeping the returned value relatively high, we limit the number
 32786        of moves between integer and MMX/SSE registers for all targets.
 32787        Additionally, a high value prevents a problem with x86_modes_tieable_p(),
 32788        where integer modes in MMX/SSE registers are not tieable
 32789        because of missing QImode and HImode moves to, from or between
 32790        MMX/SSE registers. */
32791 return MAX (8, ix86_cost->mmxsse_to_integer);
32792
32793 if (MAYBE_FLOAT_CLASS_P (class1))
32794 return ix86_cost->fp_move;
32795 if (MAYBE_SSE_CLASS_P (class1))
32796 return ix86_cost->sse_move;
32797 if (MAYBE_MMX_CLASS_P (class1))
32798 return ix86_cost->mmx_move;
32799 return 2;
32800 }
32801
32802 /* Return TRUE if hard register REGNO can hold a value of machine-mode
32803 MODE. */
32804
32805 bool
32806 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
32807 {
 32808   /* Flags, and only flags, can hold CCmode values, and nothing else. */
32809 if (CC_REGNO_P (regno))
32810 return GET_MODE_CLASS (mode) == MODE_CC;
32811 if (GET_MODE_CLASS (mode) == MODE_CC
32812 || GET_MODE_CLASS (mode) == MODE_RANDOM
32813 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
32814 return false;
32815 if (STACK_REGNO_P (regno))
32816 return VALID_FP_MODE_P (mode);
32817 if (SSE_REGNO_P (regno))
32818 {
32819 /* We implement the move patterns for all vector modes into and
32820 out of SSE registers, even when no operation instructions
32821 are available. OImode move is available only when AVX is
32822 enabled. */
32823 return ((TARGET_AVX && mode == OImode)
32824 || VALID_AVX256_REG_MODE (mode)
32825 || VALID_SSE_REG_MODE (mode)
32826 || VALID_SSE2_REG_MODE (mode)
32827 || VALID_MMX_REG_MODE (mode)
32828 || VALID_MMX_REG_MODE_3DNOW (mode));
32829 }
32830 if (MMX_REGNO_P (regno))
32831 {
32832 /* We implement the move patterns for 3DNOW modes even in MMX mode,
32833 so if the register is available at all, then we can move data of
32834 the given mode into or out of it. */
32835 return (VALID_MMX_REG_MODE (mode)
32836 || VALID_MMX_REG_MODE_3DNOW (mode));
32837 }
32838
32839 if (mode == QImode)
32840 {
 32841       /* Take care with QImode values - they can be in non-QI regs,
 32842 	 but then they cause partial register stalls. */
32843 if (TARGET_64BIT || QI_REGNO_P (regno))
32844 return true;
32845 if (!TARGET_PARTIAL_REG_STALL)
32846 return true;
32847 return !can_create_pseudo_p ();
32848 }
 32849   /* We handle both integers and floats in the general purpose registers.  */
32850 else if (VALID_INT_MODE_P (mode))
32851 return true;
32852 else if (VALID_FP_MODE_P (mode))
32853 return true;
32854 else if (VALID_DFP_MODE_P (mode))
32855 return true;
32856 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
32857 on to use that value in smaller contexts, this can easily force a
32858 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
32859 supporting DImode, allow it. */
32860 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
32861 return true;
32862
32863 return false;
32864 }
32865
32866 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
32867 tieable integer mode. */
32868
32869 static bool
32870 ix86_tieable_integer_mode_p (enum machine_mode mode)
32871 {
32872 switch (mode)
32873 {
32874 case HImode:
32875 case SImode:
32876 return true;
32877
32878 case QImode:
32879 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
32880
32881 case DImode:
32882 return TARGET_64BIT;
32883
32884 default:
32885 return false;
32886 }
32887 }
32888
32889 /* Return true if MODE1 is accessible in a register that can hold MODE2
32890 without copying. That is, all register classes that can hold MODE2
32891 can also hold MODE1. */
32892
32893 bool
32894 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
32895 {
32896 if (mode1 == mode2)
32897 return true;
32898
32899 if (ix86_tieable_integer_mode_p (mode1)
32900 && ix86_tieable_integer_mode_p (mode2))
32901 return true;
32902
32903 /* MODE2 being XFmode implies fp stack or general regs, which means we
32904 can tie any smaller floating point modes to it. Note that we do not
32905 tie this with TFmode. */
32906 if (mode2 == XFmode)
32907 return mode1 == SFmode || mode1 == DFmode;
32908
32909 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
32910 that we can tie it with SFmode. */
32911 if (mode2 == DFmode)
32912 return mode1 == SFmode;
32913
32914 /* If MODE2 is only appropriate for an SSE register, then tie with
32915 any other mode acceptable to SSE registers. */
32916 if (GET_MODE_SIZE (mode2) == 32
32917 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32918 return (GET_MODE_SIZE (mode1) == 32
32919 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32920 if (GET_MODE_SIZE (mode2) == 16
32921 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32922 return (GET_MODE_SIZE (mode1) == 16
32923 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32924
32925 /* If MODE2 is appropriate for an MMX register, then tie
32926 with any other mode acceptable to MMX registers. */
32927 if (GET_MODE_SIZE (mode2) == 8
32928 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
32929 return (GET_MODE_SIZE (mode1) == 8
32930 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
32931
32932 return false;
32933 }
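
/* Illustrative consequences of the rules above (added for exposition):
   SImode and HImode always tie; DImode ties with them only on 64-bit
   targets.  XFmode ties with SFmode and DFmode but not with TFmode.
   Two 16-byte vector modes such as V4SFmode and V2DImode tie whenever
   both are acceptable in an SSE register, and likewise for 8-byte MMX
   modes and 32-byte AVX modes.  */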
32934
32935 /* Return the cost of moving between two registers of mode MODE. */
32936
32937 static int
32938 ix86_set_reg_reg_cost (enum machine_mode mode)
32939 {
32940 unsigned int units = UNITS_PER_WORD;
32941
32942 switch (GET_MODE_CLASS (mode))
32943 {
32944 default:
32945 break;
32946
32947 case MODE_CC:
32948 units = GET_MODE_SIZE (CCmode);
32949 break;
32950
32951 case MODE_FLOAT:
32952 if ((TARGET_SSE && mode == TFmode)
32953 || (TARGET_80387 && mode == XFmode)
32954 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
32955 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
32956 units = GET_MODE_SIZE (mode);
32957 break;
32958
32959 case MODE_COMPLEX_FLOAT:
32960 if ((TARGET_SSE && mode == TCmode)
32961 || (TARGET_80387 && mode == XCmode)
32962 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
32963 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
32964 units = GET_MODE_SIZE (mode);
32965 break;
32966
32967 case MODE_VECTOR_INT:
32968 case MODE_VECTOR_FLOAT:
32969 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
32970 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
32971 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
32972 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
32973 units = GET_MODE_SIZE (mode);
32974 }
32975
32976 /* Return the cost of moving between two registers of mode MODE,
32977 assuming that the move will be in pieces of at most UNITS bytes. */
32978 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
32979 }
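
/* Worked example (added for exposition, assuming a 32-bit target with
   UNITS_PER_WORD == 4): a DImode register-to-register set is split into
   word-sized pieces, so it costs COSTS_N_INSNS ((8 + 4 - 1) / 4)
   = COSTS_N_INSNS (2).  A V4SFmode set with SSE enabled has
   units == GET_MODE_SIZE (V4SFmode) == 16 and therefore costs
   COSTS_N_INSNS (1).  */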
32980
32981 /* Compute a (partial) cost for rtx X. Return true if the complete
32982 cost has been computed, and false if subexpressions should be
32983 scanned. In either case, *TOTAL contains the cost result. */
32984
32985 static bool
32986 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
32987 bool speed)
32988 {
32989 enum rtx_code code = (enum rtx_code) code_i;
32990 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
32991 enum machine_mode mode = GET_MODE (x);
32992 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32993
32994 switch (code)
32995 {
32996 case SET:
32997 if (register_operand (SET_DEST (x), VOIDmode)
32998 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32999 {
33000 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
33001 return true;
33002 }
33003 return false;
33004
33005 case CONST_INT:
33006 case CONST:
33007 case LABEL_REF:
33008 case SYMBOL_REF:
33009 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33010 *total = 3;
33011 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33012 *total = 2;
33013 else if (flag_pic && SYMBOLIC_CONST (x)
33014 && (!TARGET_64BIT
 33015 		       || (GET_CODE (x) != LABEL_REF
33016 && (GET_CODE (x) != SYMBOL_REF
33017 || !SYMBOL_REF_LOCAL_P (x)))))
33018 *total = 1;
33019 else
33020 *total = 0;
33021 return true;
33022
33023 case CONST_DOUBLE:
33024 if (mode == VOIDmode)
33025 {
33026 *total = 0;
33027 return true;
33028 }
33029 switch (standard_80387_constant_p (x))
33030 {
33031 case 1: /* 0.0 */
33032 *total = 1;
33033 return true;
33034 default: /* Other constants */
33035 *total = 2;
33036 return true;
33037 case 0:
33038 case -1:
33039 break;
33040 }
33041 if (SSE_FLOAT_MODE_P (mode))
33042 {
33043 case CONST_VECTOR:
33044 switch (standard_sse_constant_p (x))
33045 {
33046 case 0:
33047 break;
33048 case 1: /* 0: xor eliminates false dependency */
33049 *total = 0;
33050 return true;
33051 default: /* -1: cmp contains false dependency */
33052 *total = 1;
33053 return true;
33054 }
33055 }
33056 /* Fall back to (MEM (SYMBOL_REF)), since that's where
33057 it'll probably end up. Add a penalty for size. */
33058 *total = (COSTS_N_INSNS (1)
33059 + (flag_pic != 0 && !TARGET_64BIT)
33060 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
33061 return true;
33062
33063 case ZERO_EXTEND:
 33064       /* The zero extension is often completely free on x86_64, so make
33065 it as cheap as possible. */
33066 if (TARGET_64BIT && mode == DImode
33067 && GET_MODE (XEXP (x, 0)) == SImode)
33068 *total = 1;
33069 else if (TARGET_ZERO_EXTEND_WITH_AND)
33070 *total = cost->add;
33071 else
33072 *total = cost->movzx;
33073 return false;
33074
33075 case SIGN_EXTEND:
33076 *total = cost->movsx;
33077 return false;
33078
33079 case ASHIFT:
33080 if (SCALAR_INT_MODE_P (mode)
33081 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
33082 && CONST_INT_P (XEXP (x, 1)))
33083 {
33084 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33085 if (value == 1)
33086 {
33087 *total = cost->add;
33088 return false;
33089 }
33090 if ((value == 2 || value == 3)
33091 && cost->lea <= cost->shift_const)
33092 {
33093 *total = cost->lea;
33094 return false;
33095 }
33096 }
33097 /* FALLTHRU */
33098
33099 case ROTATE:
33100 case ASHIFTRT:
33101 case LSHIFTRT:
33102 case ROTATERT:
33103 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33104 {
33105 /* ??? Should be SSE vector operation cost. */
33106 /* At least for published AMD latencies, this really is the same
33107 as the latency for a simple fpu operation like fabs. */
33108 /* V*QImode is emulated with 1-11 insns. */
33109 if (mode == V16QImode || mode == V32QImode)
33110 {
33111 int count = 11;
33112 if (TARGET_XOP && mode == V16QImode)
33113 {
33114 /* For XOP we use vpshab, which requires a broadcast of the
33115 value to the variable shift insn. For constants this
 33116 		         means a V16QImode constant in memory; even when we can perform the
 33117 		         shift with one insn, set the cost so as to prefer paddb.  */
33118 if (CONSTANT_P (XEXP (x, 1)))
33119 {
33120 *total = (cost->fabs
33121 + rtx_cost (XEXP (x, 0), code, 0, speed)
33122 + (speed ? 2 : COSTS_N_BYTES (16)));
33123 return true;
33124 }
33125 count = 3;
33126 }
33127 else if (TARGET_SSSE3)
33128 count = 7;
33129 *total = cost->fabs * count;
33130 }
33131 else
33132 *total = cost->fabs;
33133 }
33134 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33135 {
33136 if (CONST_INT_P (XEXP (x, 1)))
33137 {
33138 if (INTVAL (XEXP (x, 1)) > 32)
33139 *total = cost->shift_const + COSTS_N_INSNS (2);
33140 else
33141 *total = cost->shift_const * 2;
33142 }
33143 else
33144 {
33145 if (GET_CODE (XEXP (x, 1)) == AND)
33146 *total = cost->shift_var * 2;
33147 else
33148 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
33149 }
33150 }
33151 else
33152 {
33153 if (CONST_INT_P (XEXP (x, 1)))
33154 *total = cost->shift_const;
33155 else
33156 *total = cost->shift_var;
33157 }
33158 return false;
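
      /* Rough magnitudes for the V*QImode shift emulation costed above
         (added for exposition; assume cost->fabs == COSTS_N_INSNS (1) for
         the sake of the example): a variable V16QImode shift is charged
         like 11 simple FP ops on a plain SSE2 target, 7 with SSSE3, and 3
         with XOP; a constant shift on an XOP target instead takes the
         separate path above that prefers the paddb sequence.  */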
33159
33160 case FMA:
33161 {
33162 rtx sub;
33163
33164 gcc_assert (FLOAT_MODE_P (mode));
33165 gcc_assert (TARGET_FMA || TARGET_FMA4);
33166
33167 /* ??? SSE scalar/vector cost should be used here. */
33168 /* ??? Bald assumption that fma has the same cost as fmul. */
33169 *total = cost->fmul;
33170 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
33171
33172 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
33173 sub = XEXP (x, 0);
33174 if (GET_CODE (sub) == NEG)
33175 sub = XEXP (sub, 0);
33176 *total += rtx_cost (sub, FMA, 0, speed);
33177
33178 sub = XEXP (x, 2);
33179 if (GET_CODE (sub) == NEG)
33180 sub = XEXP (sub, 0);
33181 *total += rtx_cost (sub, FMA, 2, speed);
33182 return true;
33183 }
33184
33185 case MULT:
33186 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33187 {
33188 /* ??? SSE scalar cost should be used here. */
33189 *total = cost->fmul;
33190 return false;
33191 }
33192 else if (X87_FLOAT_MODE_P (mode))
33193 {
33194 *total = cost->fmul;
33195 return false;
33196 }
33197 else if (FLOAT_MODE_P (mode))
33198 {
33199 /* ??? SSE vector cost should be used here. */
33200 *total = cost->fmul;
33201 return false;
33202 }
33203 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33204 {
33205 /* V*QImode is emulated with 7-13 insns. */
33206 if (mode == V16QImode || mode == V32QImode)
33207 {
33208 int extra = 11;
33209 if (TARGET_XOP && mode == V16QImode)
33210 extra = 5;
33211 else if (TARGET_SSSE3)
33212 extra = 6;
33213 *total = cost->fmul * 2 + cost->fabs * extra;
33214 }
33215 /* V*DImode is emulated with 5-8 insns. */
33216 else if (mode == V2DImode || mode == V4DImode)
33217 {
33218 if (TARGET_XOP && mode == V2DImode)
33219 *total = cost->fmul * 2 + cost->fabs * 3;
33220 else
33221 *total = cost->fmul * 3 + cost->fabs * 5;
33222 }
33223 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
33224 insns, including two PMULUDQ. */
33225 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
33226 *total = cost->fmul * 2 + cost->fabs * 5;
33227 else
33228 *total = cost->fmul;
33229 return false;
33230 }
33231 else
33232 {
33233 rtx op0 = XEXP (x, 0);
33234 rtx op1 = XEXP (x, 1);
33235 int nbits;
33236 if (CONST_INT_P (XEXP (x, 1)))
33237 {
33238 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33239 for (nbits = 0; value != 0; value &= value - 1)
33240 nbits++;
33241 }
33242 else
33243 /* This is arbitrary. */
33244 nbits = 7;
33245
33246 /* Compute costs correctly for widening multiplication. */
33247 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
33248 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
33249 == GET_MODE_SIZE (mode))
33250 {
33251 int is_mulwiden = 0;
33252 enum machine_mode inner_mode = GET_MODE (op0);
33253
33254 if (GET_CODE (op0) == GET_CODE (op1))
33255 is_mulwiden = 1, op1 = XEXP (op1, 0);
33256 else if (CONST_INT_P (op1))
33257 {
33258 if (GET_CODE (op0) == SIGN_EXTEND)
33259 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
33260 == INTVAL (op1);
33261 else
33262 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
33263 }
33264
33265 if (is_mulwiden)
33266 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
33267 }
33268
33269 *total = (cost->mult_init[MODE_INDEX (mode)]
33270 + nbits * cost->mult_bit
33271 + rtx_cost (op0, outer_code, opno, speed)
33272 + rtx_cost (op1, outer_code, opno, speed));
33273
33274 return true;
33275 }
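
      /* Worked example for the scalar multiply cost above (added for
         exposition): multiplying by the constant 10 (binary 1010) gives
         nbits == 2, so an SImode multiply is charged
         cost->mult_init[MODE_INDEX (SImode)] + 2 * cost->mult_bit plus the
         operand costs; a non-constant multiplier uses the arbitrary
         nbits == 7.  A widening multiply is costed in the narrower mode of
         its extended operand.  */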
33276
33277 case DIV:
33278 case UDIV:
33279 case MOD:
33280 case UMOD:
33281 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33282 /* ??? SSE cost should be used here. */
33283 *total = cost->fdiv;
33284 else if (X87_FLOAT_MODE_P (mode))
33285 *total = cost->fdiv;
33286 else if (FLOAT_MODE_P (mode))
33287 /* ??? SSE vector cost should be used here. */
33288 *total = cost->fdiv;
33289 else
33290 *total = cost->divide[MODE_INDEX (mode)];
33291 return false;
33292
33293 case PLUS:
33294 if (GET_MODE_CLASS (mode) == MODE_INT
33295 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
33296 {
33297 if (GET_CODE (XEXP (x, 0)) == PLUS
33298 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
33299 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
33300 && CONSTANT_P (XEXP (x, 1)))
33301 {
33302 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
33303 if (val == 2 || val == 4 || val == 8)
33304 {
33305 *total = cost->lea;
33306 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33307 outer_code, opno, speed);
33308 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
33309 outer_code, opno, speed);
33310 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33311 return true;
33312 }
33313 }
33314 else if (GET_CODE (XEXP (x, 0)) == MULT
33315 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
33316 {
33317 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
33318 if (val == 2 || val == 4 || val == 8)
33319 {
33320 *total = cost->lea;
33321 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33322 outer_code, opno, speed);
33323 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33324 return true;
33325 }
33326 }
33327 else if (GET_CODE (XEXP (x, 0)) == PLUS)
33328 {
33329 *total = cost->lea;
33330 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33331 outer_code, opno, speed);
33332 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33333 outer_code, opno, speed);
33334 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33335 return true;
33336 }
33337 }
33338 /* FALLTHRU */
33339
33340 case MINUS:
33341 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33342 {
33343 /* ??? SSE cost should be used here. */
33344 *total = cost->fadd;
33345 return false;
33346 }
33347 else if (X87_FLOAT_MODE_P (mode))
33348 {
33349 *total = cost->fadd;
33350 return false;
33351 }
33352 else if (FLOAT_MODE_P (mode))
33353 {
33354 /* ??? SSE vector cost should be used here. */
33355 *total = cost->fadd;
33356 return false;
33357 }
33358 /* FALLTHRU */
33359
33360 case AND:
33361 case IOR:
33362 case XOR:
33363 if (GET_MODE_CLASS (mode) == MODE_INT
33364 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33365 {
33366 *total = (cost->add * 2
33367 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
33368 << (GET_MODE (XEXP (x, 0)) != DImode))
33369 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
33370 << (GET_MODE (XEXP (x, 1)) != DImode)));
33371 return true;
33372 }
33373 /* FALLTHRU */
33374
33375 case NEG:
33376 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33377 {
33378 /* ??? SSE cost should be used here. */
33379 *total = cost->fchs;
33380 return false;
33381 }
33382 else if (X87_FLOAT_MODE_P (mode))
33383 {
33384 *total = cost->fchs;
33385 return false;
33386 }
33387 else if (FLOAT_MODE_P (mode))
33388 {
33389 /* ??? SSE vector cost should be used here. */
33390 *total = cost->fchs;
33391 return false;
33392 }
33393 /* FALLTHRU */
33394
33395 case NOT:
33396 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33397 {
33398 /* ??? Should be SSE vector operation cost. */
33399 /* At least for published AMD latencies, this really is the same
33400 as the latency for a simple fpu operation like fabs. */
33401 *total = cost->fabs;
33402 }
33403 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33404 *total = cost->add * 2;
33405 else
33406 *total = cost->add;
33407 return false;
33408
33409 case COMPARE:
33410 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
33411 && XEXP (XEXP (x, 0), 1) == const1_rtx
33412 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
33413 && XEXP (x, 1) == const0_rtx)
33414 {
33415 /* This kind of construct is implemented using test[bwl].
33416 Treat it as if we had an AND. */
33417 *total = (cost->add
33418 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
33419 + rtx_cost (const1_rtx, outer_code, opno, speed));
33420 return true;
33421 }
33422 return false;
33423
33424 case FLOAT_EXTEND:
33425 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
33426 *total = 0;
33427 return false;
33428
33429 case ABS:
33430 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33431 /* ??? SSE cost should be used here. */
33432 *total = cost->fabs;
33433 else if (X87_FLOAT_MODE_P (mode))
33434 *total = cost->fabs;
33435 else if (FLOAT_MODE_P (mode))
33436 /* ??? SSE vector cost should be used here. */
33437 *total = cost->fabs;
33438 return false;
33439
33440 case SQRT:
33441 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33442 /* ??? SSE cost should be used here. */
33443 *total = cost->fsqrt;
33444 else if (X87_FLOAT_MODE_P (mode))
33445 *total = cost->fsqrt;
33446 else if (FLOAT_MODE_P (mode))
33447 /* ??? SSE vector cost should be used here. */
33448 *total = cost->fsqrt;
33449 return false;
33450
33451 case UNSPEC:
33452 if (XINT (x, 1) == UNSPEC_TP)
33453 *total = 0;
33454 return false;
33455
33456 case VEC_SELECT:
33457 case VEC_CONCAT:
33458 case VEC_MERGE:
33459 case VEC_DUPLICATE:
33460 /* ??? Assume all of these vector manipulation patterns are
 33461 	 recognizable, in which case they all pretty much have the
33462 same cost. */
33463 *total = cost->fabs;
33464 return true;
33465
33466 default:
33467 return false;
33468 }
33469 }
33470
33471 #if TARGET_MACHO
33472
33473 static int current_machopic_label_num;
33474
33475 /* Given a symbol name and its associated stub, write out the
33476 definition of the stub. */
33477
33478 void
33479 machopic_output_stub (FILE *file, const char *symb, const char *stub)
33480 {
33481 unsigned int length;
33482 char *binder_name, *symbol_name, lazy_ptr_name[32];
33483 int label = ++current_machopic_label_num;
33484
33485 /* For 64-bit we shouldn't get here. */
33486 gcc_assert (!TARGET_64BIT);
33487
33488 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
33489 symb = targetm.strip_name_encoding (symb);
33490
33491 length = strlen (stub);
33492 binder_name = XALLOCAVEC (char, length + 32);
33493 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
33494
33495 length = strlen (symb);
33496 symbol_name = XALLOCAVEC (char, length + 32);
33497 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
33498
33499 sprintf (lazy_ptr_name, "L%d$lz", label);
33500
33501 if (MACHOPIC_ATT_STUB)
33502 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
33503 else if (MACHOPIC_PURE)
33504 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
33505 else
33506 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
33507
33508 fprintf (file, "%s:\n", stub);
33509 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33510
33511 if (MACHOPIC_ATT_STUB)
33512 {
33513 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
33514 }
33515 else if (MACHOPIC_PURE)
33516 {
33517 /* PIC stub. */
33518 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33519 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
33520 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
33521 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
33522 label, lazy_ptr_name, label);
33523 fprintf (file, "\tjmp\t*%%ecx\n");
33524 }
33525 else
33526 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
33527
33528 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
33529 it needs no stub-binding-helper. */
33530 if (MACHOPIC_ATT_STUB)
33531 return;
33532
33533 fprintf (file, "%s:\n", binder_name);
33534
33535 if (MACHOPIC_PURE)
33536 {
33537 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
33538 fprintf (file, "\tpushl\t%%ecx\n");
33539 }
33540 else
33541 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
33542
33543 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
33544
33545 /* N.B. Keep the correspondence of these
33546 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
33547 old-pic/new-pic/non-pic stubs; altering this will break
33548 compatibility with existing dylibs. */
33549 if (MACHOPIC_PURE)
33550 {
33551 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33552 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
33553 }
33554 else
33555 /* 16-byte -mdynamic-no-pic stub. */
33556 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
33557
33558 fprintf (file, "%s:\n", lazy_ptr_name);
33559 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33560 fprintf (file, ASM_LONG "%s\n", binder_name);
33561 }
33562 #endif /* TARGET_MACHO */
33563
33564 /* Order the registers for register allocator. */
33565
33566 void
33567 x86_order_regs_for_local_alloc (void)
33568 {
33569 int pos = 0;
33570 int i;
33571
33572 /* First allocate the local general purpose registers. */
33573 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33574 if (GENERAL_REGNO_P (i) && call_used_regs[i])
33575 reg_alloc_order [pos++] = i;
33576
33577 /* Global general purpose registers. */
33578 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33579 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
33580 reg_alloc_order [pos++] = i;
33581
33582 /* x87 registers come first in case we are doing FP math
33583 using them. */
33584 if (!TARGET_SSE_MATH)
33585 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33586 reg_alloc_order [pos++] = i;
33587
33588 /* SSE registers. */
33589 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
33590 reg_alloc_order [pos++] = i;
33591 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
33592 reg_alloc_order [pos++] = i;
33593
33594 /* x87 registers. */
33595 if (TARGET_SSE_MATH)
33596 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33597 reg_alloc_order [pos++] = i;
33598
33599 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
33600 reg_alloc_order [pos++] = i;
33601
 33602    /* Initialize the rest of the array, as we do not allocate some registers
33603 at all. */
33604 while (pos < FIRST_PSEUDO_REGISTER)
33605 reg_alloc_order [pos++] = 0;
33606 }
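
/* Illustration of the resulting preference order (added for exposition,
   for a hypothetical 32-bit target doing x87 math): the call-clobbered
   general registers come first, then the call-saved general registers,
   then st(0)-st(7), then the SSE registers, and finally the MMX
   registers; with SSE math the x87 stack is instead placed after the SSE
   registers.  Slots for registers that are never allocated are filled
   with 0.  */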
33607
33608 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
 33609    in struct attribute_spec.handler.  */
33610 static tree
33611 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
33612 tree args,
33613 int flags ATTRIBUTE_UNUSED,
33614 bool *no_add_attrs)
33615 {
33616 if (TREE_CODE (*node) != FUNCTION_TYPE
33617 && TREE_CODE (*node) != METHOD_TYPE
33618 && TREE_CODE (*node) != FIELD_DECL
33619 && TREE_CODE (*node) != TYPE_DECL)
33620 {
33621 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33622 name);
33623 *no_add_attrs = true;
33624 return NULL_TREE;
33625 }
33626 if (TARGET_64BIT)
33627 {
33628 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
33629 name);
33630 *no_add_attrs = true;
33631 return NULL_TREE;
33632 }
33633 if (is_attribute_p ("callee_pop_aggregate_return", name))
33634 {
33635 tree cst;
33636
33637 cst = TREE_VALUE (args);
33638 if (TREE_CODE (cst) != INTEGER_CST)
33639 {
33640 warning (OPT_Wattributes,
33641 "%qE attribute requires an integer constant argument",
33642 name);
33643 *no_add_attrs = true;
33644 }
33645 else if (compare_tree_int (cst, 0) != 0
33646 && compare_tree_int (cst, 1) != 0)
33647 {
33648 warning (OPT_Wattributes,
33649 "argument to %qE attribute is neither zero, nor one",
33650 name);
33651 *no_add_attrs = true;
33652 }
33653
33654 return NULL_TREE;
33655 }
33656
33657 return NULL_TREE;
33658 }
33659
 33660 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
33661 struct attribute_spec.handler. */
33662 static tree
33663 ix86_handle_abi_attribute (tree *node, tree name,
33664 tree args ATTRIBUTE_UNUSED,
33665 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33666 {
33667 if (TREE_CODE (*node) != FUNCTION_TYPE
33668 && TREE_CODE (*node) != METHOD_TYPE
33669 && TREE_CODE (*node) != FIELD_DECL
33670 && TREE_CODE (*node) != TYPE_DECL)
33671 {
33672 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33673 name);
33674 *no_add_attrs = true;
33675 return NULL_TREE;
33676 }
33677
33678 /* Can combine regparm with all attributes but fastcall. */
33679 if (is_attribute_p ("ms_abi", name))
33680 {
33681 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
33682 {
33683 error ("ms_abi and sysv_abi attributes are not compatible");
33684 }
33685
33686 return NULL_TREE;
33687 }
33688 else if (is_attribute_p ("sysv_abi", name))
33689 {
33690 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
33691 {
33692 error ("ms_abi and sysv_abi attributes are not compatible");
33693 }
33694
33695 return NULL_TREE;
33696 }
33697
33698 return NULL_TREE;
33699 }
33700
33701 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
33702 struct attribute_spec.handler. */
33703 static tree
33704 ix86_handle_struct_attribute (tree *node, tree name,
33705 tree args ATTRIBUTE_UNUSED,
33706 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33707 {
33708 tree *type = NULL;
33709 if (DECL_P (*node))
33710 {
33711 if (TREE_CODE (*node) == TYPE_DECL)
33712 type = &TREE_TYPE (*node);
33713 }
33714 else
33715 type = node;
33716
33717 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
33718 {
33719 warning (OPT_Wattributes, "%qE attribute ignored",
33720 name);
33721 *no_add_attrs = true;
33722 }
33723
33724 else if ((is_attribute_p ("ms_struct", name)
33725 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
33726 || ((is_attribute_p ("gcc_struct", name)
33727 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
33728 {
33729 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
33730 name);
33731 *no_add_attrs = true;
33732 }
33733
33734 return NULL_TREE;
33735 }
33736
33737 static tree
33738 ix86_handle_fndecl_attribute (tree *node, tree name,
33739 tree args ATTRIBUTE_UNUSED,
33740 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33741 {
33742 if (TREE_CODE (*node) != FUNCTION_DECL)
33743 {
33744 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33745 name);
33746 *no_add_attrs = true;
33747 }
33748 return NULL_TREE;
33749 }
33750
33751 static bool
33752 ix86_ms_bitfield_layout_p (const_tree record_type)
33753 {
33754 return ((TARGET_MS_BITFIELD_LAYOUT
33755 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
33756 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
33757 }
33758
33759 /* Returns an expression indicating where the this parameter is
33760 located on entry to the FUNCTION. */
33761
33762 static rtx
33763 x86_this_parameter (tree function)
33764 {
33765 tree type = TREE_TYPE (function);
33766 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
33767 int nregs;
33768
33769 if (TARGET_64BIT)
33770 {
33771 const int *parm_regs;
33772
33773 if (ix86_function_type_abi (type) == MS_ABI)
33774 parm_regs = x86_64_ms_abi_int_parameter_registers;
33775 else
33776 parm_regs = x86_64_int_parameter_registers;
33777 return gen_rtx_REG (Pmode, parm_regs[aggr]);
33778 }
33779
33780 nregs = ix86_function_regparm (type, function);
33781
33782 if (nregs > 0 && !stdarg_p (type))
33783 {
33784 int regno;
33785 unsigned int ccvt = ix86_get_callcvt (type);
33786
33787 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
33788 regno = aggr ? DX_REG : CX_REG;
33789 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
33790 {
33791 regno = CX_REG;
33792 if (aggr)
33793 return gen_rtx_MEM (SImode,
33794 plus_constant (Pmode, stack_pointer_rtx, 4));
33795 }
33796 else
33797 {
33798 regno = AX_REG;
33799 if (aggr)
33800 {
33801 regno = DX_REG;
33802 if (nregs == 1)
33803 return gen_rtx_MEM (SImode,
33804 plus_constant (Pmode,
33805 stack_pointer_rtx, 4));
33806 }
33807 }
33808 return gen_rtx_REG (SImode, regno);
33809 }
33810
33811 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
33812 aggr ? 8 : 4));
33813 }
33814
33815 /* Determine whether x86_output_mi_thunk can succeed. */
33816
33817 static bool
33818 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
33819 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
33820 HOST_WIDE_INT vcall_offset, const_tree function)
33821 {
33822 /* 64-bit can handle anything. */
33823 if (TARGET_64BIT)
33824 return true;
33825
33826 /* For 32-bit, everything's fine if we have one free register. */
33827 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
33828 return true;
33829
33830 /* Need a free register for vcall_offset. */
33831 if (vcall_offset)
33832 return false;
33833
33834 /* Need a free register for GOT references. */
33835 if (flag_pic && !targetm.binds_local_p (function))
33836 return false;
33837
33838 /* Otherwise ok. */
33839 return true;
33840 }
33841
33842 /* Output the assembler code for a thunk function. THUNK_DECL is the
33843 declaration for the thunk function itself, FUNCTION is the decl for
33844 the target function. DELTA is an immediate constant offset to be
33845 added to THIS. If VCALL_OFFSET is nonzero, the word at
33846 *(*this + vcall_offset) should be added to THIS. */
33847
33848 static void
33849 x86_output_mi_thunk (FILE *file,
33850 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
33851 HOST_WIDE_INT vcall_offset, tree function)
33852 {
33853 rtx this_param = x86_this_parameter (function);
33854 rtx this_reg, tmp, fnaddr;
33855 unsigned int tmp_regno;
33856
33857 if (TARGET_64BIT)
33858 tmp_regno = R10_REG;
33859 else
33860 {
33861 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
33862 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
33863 tmp_regno = AX_REG;
33864 else
33865 tmp_regno = CX_REG;
33866 }
33867
33868 emit_note (NOTE_INSN_PROLOGUE_END);
33869
33870 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
33871 pull it in now and let DELTA benefit. */
33872 if (REG_P (this_param))
33873 this_reg = this_param;
33874 else if (vcall_offset)
33875 {
33876 /* Put the this parameter into %eax. */
33877 this_reg = gen_rtx_REG (Pmode, AX_REG);
33878 emit_move_insn (this_reg, this_param);
33879 }
33880 else
33881 this_reg = NULL_RTX;
33882
33883 /* Adjust the this parameter by a fixed constant. */
33884 if (delta)
33885 {
33886 rtx delta_rtx = GEN_INT (delta);
33887 rtx delta_dst = this_reg ? this_reg : this_param;
33888
33889 if (TARGET_64BIT)
33890 {
33891 if (!x86_64_general_operand (delta_rtx, Pmode))
33892 {
33893 tmp = gen_rtx_REG (Pmode, tmp_regno);
33894 emit_move_insn (tmp, delta_rtx);
33895 delta_rtx = tmp;
33896 }
33897 }
33898
33899 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
33900 }
33901
33902 /* Adjust the this parameter by a value stored in the vtable. */
33903 if (vcall_offset)
33904 {
33905 rtx vcall_addr, vcall_mem, this_mem;
33906
33907 tmp = gen_rtx_REG (Pmode, tmp_regno);
33908
33909 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
33910 if (Pmode != ptr_mode)
33911 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
33912 emit_move_insn (tmp, this_mem);
33913
33914 /* Adjust the this parameter. */
33915 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
33916 if (TARGET_64BIT
33917 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
33918 {
33919 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
33920 emit_move_insn (tmp2, GEN_INT (vcall_offset));
33921 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
33922 }
33923
33924 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
33925 if (Pmode != ptr_mode)
33926 emit_insn (gen_addsi_1_zext (this_reg,
33927 gen_rtx_REG (ptr_mode,
33928 REGNO (this_reg)),
33929 vcall_mem));
33930 else
33931 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
33932 }
33933
33934 /* If necessary, drop THIS back to its stack slot. */
33935 if (this_reg && this_reg != this_param)
33936 emit_move_insn (this_param, this_reg);
33937
33938 fnaddr = XEXP (DECL_RTL (function), 0);
33939 if (TARGET_64BIT)
33940 {
33941 if (!flag_pic || targetm.binds_local_p (function)
33942 || cfun->machine->call_abi == MS_ABI)
33943 ;
33944 else
33945 {
33946 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
33947 tmp = gen_rtx_CONST (Pmode, tmp);
33948 fnaddr = gen_rtx_MEM (Pmode, tmp);
33949 }
33950 }
33951 else
33952 {
33953 if (!flag_pic || targetm.binds_local_p (function))
33954 ;
33955 #if TARGET_MACHO
33956 else if (TARGET_MACHO)
33957 {
33958 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
33959 fnaddr = XEXP (fnaddr, 0);
33960 }
33961 #endif /* TARGET_MACHO */
33962 else
33963 {
33964 tmp = gen_rtx_REG (Pmode, CX_REG);
33965 output_set_got (tmp, NULL_RTX);
33966
33967 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
33968 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
33969 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
33970 }
33971 }
33972
33973 /* Our sibling call patterns do not allow memories, because we have no
33974 predicate that can distinguish between frame and non-frame memory.
33975 For our purposes here, we can get away with (ab)using a jump pattern,
33976 because we're going to do no optimization. */
33977 if (MEM_P (fnaddr))
33978 emit_jump_insn (gen_indirect_jump (fnaddr));
33979 else
33980 {
33981 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
33982 fnaddr = legitimize_pic_address (fnaddr,
33983 gen_rtx_REG (Pmode, tmp_regno));
33984
33985 if (!sibcall_insn_operand (fnaddr, word_mode))
33986 {
33987 tmp = gen_rtx_REG (word_mode, tmp_regno);
33988 if (GET_MODE (fnaddr) != word_mode)
33989 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
33990 emit_move_insn (tmp, fnaddr);
33991 fnaddr = tmp;
33992 }
33993
33994 tmp = gen_rtx_MEM (QImode, fnaddr);
33995 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
33996 tmp = emit_call_insn (tmp);
33997 SIBLING_CALL_P (tmp) = 1;
33998 }
33999 emit_barrier ();
34000
34001 /* Emit just enough of rest_of_compilation to get the insns emitted.
34002 Note that use_thunk calls assemble_start_function et al. */
34003 tmp = get_insns ();
34004 shorten_branches (tmp);
34005 final_start_function (tmp, file, 1);
34006 final (tmp, file, 1);
34007 final_end_function ();
34008 }
34009
34010 static void
34011 x86_file_start (void)
34012 {
34013 default_file_start ();
34014 #if TARGET_MACHO
34015 darwin_file_start ();
34016 #endif
34017 if (X86_FILE_START_VERSION_DIRECTIVE)
34018 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34019 if (X86_FILE_START_FLTUSED)
34020 fputs ("\t.global\t__fltused\n", asm_out_file);
34021 if (ix86_asm_dialect == ASM_INTEL)
34022 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34023 }
34024
34025 int
34026 x86_field_alignment (tree field, int computed)
34027 {
34028 enum machine_mode mode;
34029 tree type = TREE_TYPE (field);
34030
34031 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34032 return computed;
34033 mode = TYPE_MODE (strip_array_types (type));
34034 if (mode == DFmode || mode == DCmode
34035 || GET_MODE_CLASS (mode) == MODE_INT
34036 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34037 return MIN (32, computed);
34038 return computed;
34039 }
34040
34041 /* Output assembler code to FILE to increment profiler label # LABELNO
34042 for profiling a function entry. */
34043 void
34044 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34045 {
34046 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34047 : MCOUNT_NAME);
34048
34049 if (TARGET_64BIT)
34050 {
34051 #ifndef NO_PROFILE_COUNTERS
34052 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
34053 #endif
34054
34055 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
34056 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
34057 else
34058 fprintf (file, "\tcall\t%s\n", mcount_name);
34059 }
34060 else if (flag_pic)
34061 {
34062 #ifndef NO_PROFILE_COUNTERS
34063 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
34064 LPREFIX, labelno);
34065 #endif
34066 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
34067 }
34068 else
34069 {
34070 #ifndef NO_PROFILE_COUNTERS
34071 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
34072 LPREFIX, labelno);
34073 #endif
34074 fprintf (file, "\tcall\t%s\n", mcount_name);
34075 }
34076 }
34077
34078 /* We don't have exact information about the insn sizes, but we may assume
34079 quite safely that we are informed about all 1 byte insns and memory
34080 address sizes. This is enough to eliminate unnecessary padding in
34081 99% of cases. */
34082
34083 static int
34084 min_insn_size (rtx insn)
34085 {
34086 int l = 0, len;
34087
34088 if (!INSN_P (insn) || !active_insn_p (insn))
34089 return 0;
34090
 34091   /* Discard alignments we've emitted, and jump instructions.  */
34092 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
34093 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
34094 return 0;
34095 if (JUMP_TABLE_DATA_P (insn))
34096 return 0;
34097
34098 /* Important case - calls are always 5 bytes.
 34099      It is common to have many calls in a row.  */
34100 if (CALL_P (insn)
34101 && symbolic_reference_mentioned_p (PATTERN (insn))
34102 && !SIBLING_CALL_P (insn))
34103 return 5;
34104 len = get_attr_length (insn);
34105 if (len <= 1)
34106 return 1;
34107
34108 /* For normal instructions we rely on get_attr_length being exact,
34109 with a few exceptions. */
34110 if (!JUMP_P (insn))
34111 {
34112 enum attr_type type = get_attr_type (insn);
34113
34114 switch (type)
34115 {
34116 case TYPE_MULTI:
34117 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
34118 || asm_noperands (PATTERN (insn)) >= 0)
34119 return 0;
34120 break;
34121 case TYPE_OTHER:
34122 case TYPE_FCMP:
34123 break;
34124 default:
34125 /* Otherwise trust get_attr_length. */
34126 return len;
34127 }
34128
34129 l = get_attr_length_address (insn);
34130 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
34131 l = 4;
34132 }
34133 if (l)
34134 return 1+l;
34135 else
34136 return 2;
34137 }
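
/* Examples of the conservative estimates above (added for exposition):
   a direct call to a named function counts as 5 bytes; a jump is counted
   as only 2 bytes; a TYPE_OTHER insn whose pattern mentions a symbol has
   its address part rounded up to at least 4 bytes, giving 1 + 4 = 5;
   asm statements and previously emitted alignments count as 0.  */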
34138
34139 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34140
34141 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
34142 window. */
34143
34144 static void
34145 ix86_avoid_jump_mispredicts (void)
34146 {
34147 rtx insn, start = get_insns ();
34148 int nbytes = 0, njumps = 0;
34149 int isjump = 0;
34150
34151 /* Look for all minimal intervals of instructions containing 4 jumps.
34152 The intervals are bounded by START and INSN. NBYTES is the total
34153 size of instructions in the interval including INSN and not including
 34154      START.  When NBYTES is smaller than 16 bytes, it is possible
 34155      that START and INSN end up in the same 16-byte window.
 34156
 34157      The smallest offset in the window at which INSN can start occurs when
 34158      START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
 34159      We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
34160 */
34161 for (insn = start; insn; insn = NEXT_INSN (insn))
34162 {
34163 int min_size;
34164
34165 if (LABEL_P (insn))
34166 {
34167 int align = label_to_alignment (insn);
34168 int max_skip = label_to_max_skip (insn);
34169
34170 if (max_skip > 15)
34171 max_skip = 15;
34172 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
34173 already in the current 16 byte page, because otherwise
34174 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
34175 bytes to reach 16 byte boundary. */
34176 if (align <= 0
34177 || (align <= 3 && max_skip != (1 << align) - 1))
34178 max_skip = 0;
34179 if (dump_file)
34180 fprintf (dump_file, "Label %i with max_skip %i\n",
34181 INSN_UID (insn), max_skip);
34182 if (max_skip)
34183 {
34184 while (nbytes + max_skip >= 16)
34185 {
34186 start = NEXT_INSN (start);
34187 if ((JUMP_P (start)
34188 && GET_CODE (PATTERN (start)) != ADDR_VEC
34189 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34190 || CALL_P (start))
34191 njumps--, isjump = 1;
34192 else
34193 isjump = 0;
34194 nbytes -= min_insn_size (start);
34195 }
34196 }
34197 continue;
34198 }
34199
34200 min_size = min_insn_size (insn);
34201 nbytes += min_size;
34202 if (dump_file)
34203 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
34204 INSN_UID (insn), min_size);
34205 if ((JUMP_P (insn)
34206 && GET_CODE (PATTERN (insn)) != ADDR_VEC
34207 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
34208 || CALL_P (insn))
34209 njumps++;
34210 else
34211 continue;
34212
34213 while (njumps > 3)
34214 {
34215 start = NEXT_INSN (start);
34216 if ((JUMP_P (start)
34217 && GET_CODE (PATTERN (start)) != ADDR_VEC
34218 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34219 || CALL_P (start))
34220 njumps--, isjump = 1;
34221 else
34222 isjump = 0;
34223 nbytes -= min_insn_size (start);
34224 }
34225 gcc_assert (njumps >= 0);
34226 if (dump_file)
34227 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
34228 INSN_UID (start), INSN_UID (insn), nbytes);
34229
34230 if (njumps == 3 && isjump && nbytes < 16)
34231 {
34232 int padsize = 15 - nbytes + min_insn_size (insn);
34233
34234 if (dump_file)
34235 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
34236 INSN_UID (insn), padsize);
34237 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
34238 }
34239 }
34240 }
34241 #endif
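
/* Worked example for the padding computed above (added for exposition):
   when INSN would be the fourth jump in fewer than 16 bytes, with
   nbytes == 12 and min_insn_size (insn) == 2, then
   padsize = 15 - 12 + 2 = 5, and a 5-byte pad is emitted before INSN so
   that it cannot share a 16-byte window with the previous three
   jumps.  */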
34242
34243 /* AMD Athlon works faster
 34244    when RET is not the destination of a conditional jump and is not directly
 34245    preceded by another jump instruction.  We avoid the penalty by inserting a
 34246    NOP just before such RET instructions.  */
34247 static void
34248 ix86_pad_returns (void)
34249 {
34250 edge e;
34251 edge_iterator ei;
34252
34253 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34254 {
34255 basic_block bb = e->src;
34256 rtx ret = BB_END (bb);
34257 rtx prev;
34258 bool replace = false;
34259
34260 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
34261 || optimize_bb_for_size_p (bb))
34262 continue;
34263 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
34264 if (active_insn_p (prev) || LABEL_P (prev))
34265 break;
34266 if (prev && LABEL_P (prev))
34267 {
34268 edge e;
34269 edge_iterator ei;
34270
34271 FOR_EACH_EDGE (e, ei, bb->preds)
34272 if (EDGE_FREQUENCY (e) && e->src->index >= 0
34273 && !(e->flags & EDGE_FALLTHRU))
34274 replace = true;
34275 }
34276 if (!replace)
34277 {
34278 prev = prev_active_insn (ret);
34279 if (prev
34280 && ((JUMP_P (prev) && any_condjump_p (prev))
34281 || CALL_P (prev)))
34282 replace = true;
 34283 	  /* Empty functions get a branch mispredict even when
34284 the jump destination is not visible to us. */
34285 if (!prev && !optimize_function_for_size_p (cfun))
34286 replace = true;
34287 }
34288 if (replace)
34289 {
34290 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
34291 delete_insn (ret);
34292 }
34293 }
34294 }
34295
34296 /* Count the minimum number of instructions in BB. Return 4 if the
34297 number of instructions >= 4. */
34298
34299 static int
34300 ix86_count_insn_bb (basic_block bb)
34301 {
34302 rtx insn;
34303 int insn_count = 0;
34304
34305 /* Count number of instructions in this block. Return 4 if the number
34306 of instructions >= 4. */
34307 FOR_BB_INSNS (bb, insn)
34308 {
 34309       /* This only happens in exit blocks.  */
34310 if (JUMP_P (insn)
34311 && ANY_RETURN_P (PATTERN (insn)))
34312 break;
34313
34314 if (NONDEBUG_INSN_P (insn)
34315 && GET_CODE (PATTERN (insn)) != USE
34316 && GET_CODE (PATTERN (insn)) != CLOBBER)
34317 {
34318 insn_count++;
34319 if (insn_count >= 4)
34320 return insn_count;
34321 }
34322 }
34323
34324 return insn_count;
34325 }
34326
34327
34328 /* Count the minimum number of instructions in code path in BB.
34329 Return 4 if the number of instructions >= 4. */
34330
34331 static int
34332 ix86_count_insn (basic_block bb)
34333 {
34334 edge e;
34335 edge_iterator ei;
34336 int min_prev_count;
34337
34338 /* Only bother counting instructions along paths with no
34339 more than 2 basic blocks between entry and exit. Given
34340 that BB has an edge to exit, determine if a predecessor
34341 of BB has an edge from entry. If so, compute the number
34342 of instructions in the predecessor block. If there
34343 happen to be multiple such blocks, compute the minimum. */
34344 min_prev_count = 4;
34345 FOR_EACH_EDGE (e, ei, bb->preds)
34346 {
34347 edge prev_e;
34348 edge_iterator prev_ei;
34349
34350 if (e->src == ENTRY_BLOCK_PTR)
34351 {
34352 min_prev_count = 0;
34353 break;
34354 }
34355 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
34356 {
34357 if (prev_e->src == ENTRY_BLOCK_PTR)
34358 {
34359 int count = ix86_count_insn_bb (e->src);
34360 if (count < min_prev_count)
34361 min_prev_count = count;
34362 break;
34363 }
34364 }
34365 }
34366
34367 if (min_prev_count < 4)
34368 min_prev_count += ix86_count_insn_bb (bb);
34369
34370 return min_prev_count;
34371 }
34372
34373 /* Pad short function to 4 instructions. */
34374
34375 static void
34376 ix86_pad_short_function (void)
34377 {
34378 edge e;
34379 edge_iterator ei;
34380
34381 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34382 {
34383 rtx ret = BB_END (e->src);
34384 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
34385 {
34386 int insn_count = ix86_count_insn (e->src);
34387
34388 /* Pad short function. */
34389 if (insn_count < 4)
34390 {
34391 rtx insn = ret;
34392
34393 /* Find epilogue. */
34394 while (insn
34395 && (!NOTE_P (insn)
34396 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
34397 insn = PREV_INSN (insn);
34398
34399 if (!insn)
34400 insn = ret;
34401
34402 /* Two NOPs count as one instruction. */
34403 insn_count = 2 * (4 - insn_count);
34404 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
34405 }
34406 }
34407 }
34408 }
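
/* Worked example (added for exposition): if the shortest path from the
   function entry to this return contains only two real instructions,
   insn_count becomes 2 * (4 - 2) == 4 and gen_nops emits four NOPs just
   before the epilogue, which the target counts as the two missing
   instructions.  */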
34409
34410 /* Implement machine specific optimizations. We implement padding of returns
 34411    for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window.  */
34412 static void
34413 ix86_reorg (void)
34414 {
34415 /* We are freeing block_for_insn in the toplev to keep compatibility
34416 with old MDEP_REORGS that are not CFG based. Recompute it now. */
34417 compute_bb_for_insn ();
34418
34419 /* Run the vzeroupper optimization if needed. */
34420 if (TARGET_VZEROUPPER)
34421 move_or_delete_vzeroupper ();
34422
34423 if (optimize && optimize_function_for_speed_p (cfun))
34424 {
34425 if (TARGET_PAD_SHORT_FUNCTION)
34426 ix86_pad_short_function ();
34427 else if (TARGET_PAD_RETURNS)
34428 ix86_pad_returns ();
34429 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34430 if (TARGET_FOUR_JUMP_LIMIT)
34431 ix86_avoid_jump_mispredicts ();
34432 #endif
34433 }
34434 }
34435
 34436 /* Return nonzero when a QImode register that must be represented via a REX
 34437    prefix is used.  */
34438 bool
34439 x86_extended_QIreg_mentioned_p (rtx insn)
34440 {
34441 int i;
34442 extract_insn_cached (insn);
34443 for (i = 0; i < recog_data.n_operands; i++)
34444 if (GENERAL_REG_P (recog_data.operand[i])
34445 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
34446 return true;
34447 return false;
34448 }
34449
 34450 /* Return nonzero when P points to a register encoded via a REX prefix.
34451 Called via for_each_rtx. */
34452 static int
34453 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
34454 {
34455 unsigned int regno;
34456 if (!REG_P (*p))
34457 return 0;
34458 regno = REGNO (*p);
34459 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
34460 }
34461
34462 /* Return true when INSN mentions register that must be encoded using REX
34463 prefix. */
34464 bool
34465 x86_extended_reg_mentioned_p (rtx insn)
34466 {
34467 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
34468 extended_reg_mentioned_1, NULL);
34469 }
34470
34471 /* If profitable, negate (without causing overflow) integer constant
34472 of mode MODE at location LOC. Return true in this case. */
34473 bool
34474 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
34475 {
34476 HOST_WIDE_INT val;
34477
34478 if (!CONST_INT_P (*loc))
34479 return false;
34480
34481 switch (mode)
34482 {
34483 case DImode:
34484 /* DImode x86_64 constants must fit in 32 bits. */
34485 gcc_assert (x86_64_immediate_operand (*loc, mode));
34486
34487 mode = SImode;
34488 break;
34489
34490 case SImode:
34491 case HImode:
34492 case QImode:
34493 break;
34494
34495 default:
34496 gcc_unreachable ();
34497 }
34498
34499 /* Avoid overflows. */
34500 if (mode_signbit_p (mode, *loc))
34501 return false;
34502
34503 val = INTVAL (*loc);
34504
34505 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
34506 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
34507 if ((val < 0 && val != -128)
34508 || val == 128)
34509 {
34510 *loc = GEN_INT (-val);
34511 return true;
34512 }
34513
34514 return false;
34515 }
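
/* Examples (added for exposition): an addition of the constant -4 is
   rewritten as a subtraction of 4, and a comparison against 128 is
   rewritten to use -128, because -128 fits in a sign-extended 8-bit
   immediate while +128 does not; -128 itself is left untouched since
   negating it would lose that short encoding, and the most negative
   value of MODE is rejected because its negation overflows.  */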
34516
34517 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
34518 optabs would emit if we didn't have TFmode patterns. */
34519
34520 void
34521 x86_emit_floatuns (rtx operands[2])
34522 {
34523 rtx neglab, donelab, i0, i1, f0, in, out;
34524 enum machine_mode mode, inmode;
34525
34526 inmode = GET_MODE (operands[1]);
34527 gcc_assert (inmode == SImode || inmode == DImode);
34528
34529 out = operands[0];
34530 in = force_reg (inmode, operands[1]);
34531 mode = GET_MODE (out);
34532 neglab = gen_label_rtx ();
34533 donelab = gen_label_rtx ();
34534 f0 = gen_reg_rtx (mode);
34535
34536 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
34537
34538 expand_float (out, in, 0);
34539
34540 emit_jump_insn (gen_jump (donelab));
34541 emit_barrier ();
34542
34543 emit_label (neglab);
34544
34545 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
34546 1, OPTAB_DIRECT);
34547 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
34548 1, OPTAB_DIRECT);
34549 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
34550
34551 expand_float (f0, i0, 0);
34552
34553 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
34554
34555 emit_label (donelab);
34556 }
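
/* Worked example (added for exposition, assuming an SImode input and an
   SFmode result): for the input 0xFFFFFFFF the negative path computes
   i0 = (0xFFFFFFFF >> 1) | (0xFFFFFFFF & 1) = 0x7FFFFFFF, converts it as
   a signed value to 2147483648.0f (the nearest float), and doubles it to
   4294967296.0f, which is exactly the correctly rounded single-precision
   value of 4294967295.  ORing the low bit back in keeps the final
   rounding unbiased.  */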
34557 \f
34558 /* AVX2 does support 32-byte integer vector operations,
34559 thus the longest vector we are faced with is V32QImode. */
34560 #define MAX_VECT_LEN 32
34561
34562 struct expand_vec_perm_d
34563 {
34564 rtx target, op0, op1;
34565 unsigned char perm[MAX_VECT_LEN];
34566 enum machine_mode vmode;
34567 unsigned char nelt;
34568 bool one_operand_p;
34569 bool testing_p;
34570 };
34571
34572 static bool canonicalize_perm (struct expand_vec_perm_d *d);
34573 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
34574 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
34575
34576 /* Get a vector mode of the same size as the original but with elements
34577 twice as wide. This is only guaranteed to apply to integral vectors. */
34578
34579 static inline enum machine_mode
34580 get_mode_wider_vector (enum machine_mode o)
34581 {
34582 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
34583 enum machine_mode n = GET_MODE_WIDER_MODE (o);
34584 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
34585 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
34586 return n;
34587 }
34588
34589 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34590 with all elements equal to VAR. Return true if successful. */
34591
34592 static bool
34593 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
34594 rtx target, rtx val)
34595 {
34596 bool ok;
34597
34598 switch (mode)
34599 {
34600 case V2SImode:
34601 case V2SFmode:
34602 if (!mmx_ok)
34603 return false;
34604 /* FALLTHRU */
34605
34606 case V4DFmode:
34607 case V4DImode:
34608 case V8SFmode:
34609 case V8SImode:
34610 case V2DFmode:
34611 case V2DImode:
34612 case V4SFmode:
34613 case V4SImode:
34614 {
34615 rtx insn, dup;
34616
34617 /* First attempt to recognize VAL as-is. */
34618 dup = gen_rtx_VEC_DUPLICATE (mode, val);
34619 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
34620 if (recog_memoized (insn) < 0)
34621 {
34622 rtx seq;
34623 /* If that fails, force VAL into a register. */
34624
34625 start_sequence ();
34626 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
34627 seq = get_insns ();
34628 end_sequence ();
34629 if (seq)
34630 emit_insn_before (seq, insn);
34631
34632 ok = recog_memoized (insn) >= 0;
34633 gcc_assert (ok);
34634 }
34635 }
34636 return true;
34637
34638 case V4HImode:
34639 if (!mmx_ok)
34640 return false;
34641 if (TARGET_SSE || TARGET_3DNOW_A)
34642 {
34643 rtx x;
34644
34645 val = gen_lowpart (SImode, val);
34646 x = gen_rtx_TRUNCATE (HImode, val);
34647 x = gen_rtx_VEC_DUPLICATE (mode, x);
34648 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34649 return true;
34650 }
34651 goto widen;
34652
34653 case V8QImode:
34654 if (!mmx_ok)
34655 return false;
34656 goto widen;
34657
34658 case V8HImode:
34659 if (TARGET_SSE2)
34660 {
34661 struct expand_vec_perm_d dperm;
34662 rtx tmp1, tmp2;
34663
34664 permute:
34665 memset (&dperm, 0, sizeof (dperm));
34666 dperm.target = target;
34667 dperm.vmode = mode;
34668 dperm.nelt = GET_MODE_NUNITS (mode);
34669 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
34670 dperm.one_operand_p = true;
34671
34672 /* Extend to SImode using a paradoxical SUBREG. */
34673 tmp1 = gen_reg_rtx (SImode);
34674 emit_move_insn (tmp1, gen_lowpart (SImode, val));
34675
34676 /* Insert the SImode value as low element of a V4SImode vector. */
34677 tmp2 = gen_lowpart (V4SImode, dperm.op0);
34678 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
34679
34680 ok = (expand_vec_perm_1 (&dperm)
34681 || expand_vec_perm_broadcast_1 (&dperm));
34682 gcc_assert (ok);
34683 return ok;
34684 }
34685 goto widen;
34686
34687 case V16QImode:
34688 if (TARGET_SSE2)
34689 goto permute;
34690 goto widen;
34691
34692 widen:
34693 /* Replicate the value once into the next wider mode and recurse. */
34694 {
34695 enum machine_mode smode, wsmode, wvmode;
34696 rtx x;
34697
34698 smode = GET_MODE_INNER (mode);
34699 wvmode = get_mode_wider_vector (mode);
34700 wsmode = GET_MODE_INNER (wvmode);
34701
34702 val = convert_modes (wsmode, smode, val, true);
34703 x = expand_simple_binop (wsmode, ASHIFT, val,
34704 GEN_INT (GET_MODE_BITSIZE (smode)),
34705 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34706 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
34707
34708 x = gen_lowpart (wvmode, target);
34709 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
34710 gcc_assert (ok);
34711 return ok;
34712 }
34713
34714 case V16HImode:
34715 case V32QImode:
34716 {
34717 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
34718 rtx x = gen_reg_rtx (hvmode);
34719
34720 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
34721 gcc_assert (ok);
34722
34723 x = gen_rtx_VEC_CONCAT (mode, x, x);
34724 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34725 }
34726 return true;
34727
34728 default:
34729 return false;
34730 }
34731 }
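
/* Worked example for the "widen" strategy above (added for exposition):
   broadcasting the QImode value 0xAB into V8QImode without a direct
   duplicate pattern first builds the HImode value
   (0xAB << 8) | 0xAB = 0xABAB and then recursively broadcasts that value
   into a V4HImode view of the target obtained with gen_lowpart, which
   leaves the desired V8QImode vector in place.  */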
34732
34733 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34734 whose ONE_VAR element is VAR, and other elements are zero. Return true
34735 if successful. */
34736
34737 static bool
34738 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
34739 rtx target, rtx var, int one_var)
34740 {
34741 enum machine_mode vsimode;
34742 rtx new_target;
34743 rtx x, tmp;
34744 bool use_vector_set = false;
34745
34746 switch (mode)
34747 {
34748 case V2DImode:
34749 /* For SSE4.1, we normally use vector set. But if the second
34750 element is zero and inter-unit moves are OK, we use movq
34751 instead. */
34752 use_vector_set = (TARGET_64BIT
34753 && TARGET_SSE4_1
34754 && !(TARGET_INTER_UNIT_MOVES
34755 && one_var == 0));
34756 break;
34757 case V16QImode:
34758 case V4SImode:
34759 case V4SFmode:
34760 use_vector_set = TARGET_SSE4_1;
34761 break;
34762 case V8HImode:
34763 use_vector_set = TARGET_SSE2;
34764 break;
34765 case V4HImode:
34766 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
34767 break;
34768 case V32QImode:
34769 case V16HImode:
34770 case V8SImode:
34771 case V8SFmode:
34772 case V4DFmode:
34773 use_vector_set = TARGET_AVX;
34774 break;
34775 case V4DImode:
34776 /* Use ix86_expand_vector_set in 64bit mode only. */
34777 use_vector_set = TARGET_AVX && TARGET_64BIT;
34778 break;
34779 default:
34780 break;
34781 }
34782
34783 if (use_vector_set)
34784 {
34785 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
34786 var = force_reg (GET_MODE_INNER (mode), var);
34787 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34788 return true;
34789 }
34790
34791 switch (mode)
34792 {
34793 case V2SFmode:
34794 case V2SImode:
34795 if (!mmx_ok)
34796 return false;
34797 /* FALLTHRU */
34798
34799 case V2DFmode:
34800 case V2DImode:
34801 if (one_var != 0)
34802 return false;
34803 var = force_reg (GET_MODE_INNER (mode), var);
34804 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
34805 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34806 return true;
34807
34808 case V4SFmode:
34809 case V4SImode:
34810 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
34811 new_target = gen_reg_rtx (mode);
34812 else
34813 new_target = target;
34814 var = force_reg (GET_MODE_INNER (mode), var);
34815 x = gen_rtx_VEC_DUPLICATE (mode, var);
34816 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
34817 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
34818 if (one_var != 0)
34819 {
34820 /* We need to shuffle the value to the correct position, so
34821 create a new pseudo to store the intermediate result. */
34822
34823 /* With SSE2, we can use the integer shuffle insns. */
34824 if (mode != V4SFmode && TARGET_SSE2)
34825 {
34826 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
34827 const1_rtx,
34828 GEN_INT (one_var == 1 ? 0 : 1),
34829 GEN_INT (one_var == 2 ? 0 : 1),
34830 GEN_INT (one_var == 3 ? 0 : 1)));
34831 if (target != new_target)
34832 emit_move_insn (target, new_target);
34833 return true;
34834 }
34835
34836 /* Otherwise convert the intermediate result to V4SFmode and
34837 use the SSE1 shuffle instructions. */
34838 if (mode != V4SFmode)
34839 {
34840 tmp = gen_reg_rtx (V4SFmode);
34841 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
34842 }
34843 else
34844 tmp = new_target;
34845
34846 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
34847 const1_rtx,
34848 GEN_INT (one_var == 1 ? 0 : 1),
34849 GEN_INT (one_var == 2 ? 0+4 : 1+4),
34850 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
34851
34852 if (mode != V4SFmode)
34853 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
34854 else if (tmp != target)
34855 emit_move_insn (target, tmp);
34856 }
34857 else if (target != new_target)
34858 emit_move_insn (target, new_target);
34859 return true;
34860
34861 case V8HImode:
34862 case V16QImode:
34863 vsimode = V4SImode;
34864 goto widen;
34865 case V4HImode:
34866 case V8QImode:
34867 if (!mmx_ok)
34868 return false;
34869 vsimode = V2SImode;
34870 goto widen;
34871 widen:
34872 if (one_var != 0)
34873 return false;
34874
34875 /* Zero extend the variable element to SImode and recurse. */
34876 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
34877
34878 x = gen_reg_rtx (vsimode);
34879 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
34880 var, one_var))
34881 gcc_unreachable ();
34882
34883 emit_move_insn (target, gen_lowpart (mode, x));
34884 return true;
34885
34886 default:
34887 return false;
34888 }
34889 }
34890
34891 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34892 consisting of the values in VALS. It is known that all elements
34893 except ONE_VAR are constants. Return true if successful. */
34894
34895 static bool
34896 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
34897 rtx target, rtx vals, int one_var)
34898 {
34899 rtx var = XVECEXP (vals, 0, one_var);
34900 enum machine_mode wmode;
34901 rtx const_vec, x;
34902
34903 const_vec = copy_rtx (vals);
34904 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
34905 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
34906
34907 switch (mode)
34908 {
34909 case V2DFmode:
34910 case V2DImode:
34911 case V2SFmode:
34912 case V2SImode:
34913 /* For the two element vectors, it's just as easy to use
34914 the general case. */
34915 return false;
34916
34917 case V4DImode:
34918 /* Use ix86_expand_vector_set in 64bit mode only. */
34919 if (!TARGET_64BIT)
34920 return false;
34921 case V4DFmode:
34922 case V8SFmode:
34923 case V8SImode:
34924 case V16HImode:
34925 case V32QImode:
34926 case V4SFmode:
34927 case V4SImode:
34928 case V8HImode:
34929 case V4HImode:
34930 break;
34931
34932 case V16QImode:
34933 if (TARGET_SSE4_1)
34934 break;
34935 wmode = V8HImode;
34936 goto widen;
34937 case V8QImode:
34938 wmode = V4HImode;
34939 goto widen;
34940 widen:
34941 /* There's no way to set one QImode entry easily. Combine
34942 the variable value with its adjacent constant value, and
34943 promote to an HImode set. */
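/* Illustrative scalar sketch (not emitted code): for a variable byte V
   at an odd index whose constant neighbour is C, the combined HImode
   value is roughly

     unsigned short pair = ((unsigned short) V << 8) | (C & 0xff);

   while for an even index the roles are swapped, pair = (C << 8) | V.
   The pair is then stored with a single HImode vector set.  */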
34944 x = XVECEXP (vals, 0, one_var ^ 1);
34945 if (one_var & 1)
34946 {
34947 var = convert_modes (HImode, QImode, var, true);
34948 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
34949 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34950 x = GEN_INT (INTVAL (x) & 0xff);
34951 }
34952 else
34953 {
34954 var = convert_modes (HImode, QImode, var, true);
34955 x = gen_int_mode (INTVAL (x) << 8, HImode);
34956 }
34957 if (x != const0_rtx)
34958 var = expand_simple_binop (HImode, IOR, var, x, var,
34959 1, OPTAB_LIB_WIDEN);
34960
34961 x = gen_reg_rtx (wmode);
34962 emit_move_insn (x, gen_lowpart (wmode, const_vec));
34963 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
34964
34965 emit_move_insn (target, gen_lowpart (mode, x));
34966 return true;
34967
34968 default:
34969 return false;
34970 }
34971
34972 emit_move_insn (target, const_vec);
34973 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34974 return true;
34975 }
34976
34977 /* A subroutine of ix86_expand_vector_init_general. Use vector
34978 concatenate to handle the most general case: all values variable,
34979 and none identical. */
34980
34981 static void
34982 ix86_expand_vector_init_concat (enum machine_mode mode,
34983 rtx target, rtx *ops, int n)
34984 {
34985 enum machine_mode cmode, hmode = VOIDmode;
34986 rtx first[8], second[4];
34987 rtvec v;
34988 int i, j;
34989
34990 switch (n)
34991 {
34992 case 2:
34993 switch (mode)
34994 {
34995 case V8SImode:
34996 cmode = V4SImode;
34997 break;
34998 case V8SFmode:
34999 cmode = V4SFmode;
35000 break;
35001 case V4DImode:
35002 cmode = V2DImode;
35003 break;
35004 case V4DFmode:
35005 cmode = V2DFmode;
35006 break;
35007 case V4SImode:
35008 cmode = V2SImode;
35009 break;
35010 case V4SFmode:
35011 cmode = V2SFmode;
35012 break;
35013 case V2DImode:
35014 cmode = DImode;
35015 break;
35016 case V2SImode:
35017 cmode = SImode;
35018 break;
35019 case V2DFmode:
35020 cmode = DFmode;
35021 break;
35022 case V2SFmode:
35023 cmode = SFmode;
35024 break;
35025 default:
35026 gcc_unreachable ();
35027 }
35028
35029 if (!register_operand (ops[1], cmode))
35030 ops[1] = force_reg (cmode, ops[1]);
35031 if (!register_operand (ops[0], cmode))
35032 ops[0] = force_reg (cmode, ops[0]);
35033 emit_insn (gen_rtx_SET (VOIDmode, target,
35034 gen_rtx_VEC_CONCAT (mode, ops[0],
35035 ops[1])));
35036 break;
35037
35038 case 4:
35039 switch (mode)
35040 {
35041 case V4DImode:
35042 cmode = V2DImode;
35043 break;
35044 case V4DFmode:
35045 cmode = V2DFmode;
35046 break;
35047 case V4SImode:
35048 cmode = V2SImode;
35049 break;
35050 case V4SFmode:
35051 cmode = V2SFmode;
35052 break;
35053 default:
35054 gcc_unreachable ();
35055 }
35056 goto half;
35057
35058 case 8:
35059 switch (mode)
35060 {
35061 case V8SImode:
35062 cmode = V2SImode;
35063 hmode = V4SImode;
35064 break;
35065 case V8SFmode:
35066 cmode = V2SFmode;
35067 hmode = V4SFmode;
35068 break;
35069 default:
35070 gcc_unreachable ();
35071 }
35072 goto half;
35073
35074 half:
35075 /* FIXME: We process inputs backward to help RA. PR 36222. */
35076 i = n - 1;
35077 j = (n >> 1) - 1;
35078 for (; i > 0; i -= 2, j--)
35079 {
35080 first[j] = gen_reg_rtx (cmode);
35081 v = gen_rtvec (2, ops[i - 1], ops[i]);
35082 ix86_expand_vector_init (false, first[j],
35083 gen_rtx_PARALLEL (cmode, v));
35084 }
35085
35086 n >>= 1;
35087 if (n > 2)
35088 {
35089 gcc_assert (hmode != VOIDmode);
35090 for (i = j = 0; i < n; i += 2, j++)
35091 {
35092 second[j] = gen_reg_rtx (hmode);
35093 ix86_expand_vector_init_concat (hmode, second [j],
35094 &first [i], 2);
35095 }
35096 n >>= 1;
35097 ix86_expand_vector_init_concat (mode, target, second, n);
35098 }
35099 else
35100 ix86_expand_vector_init_concat (mode, target, first, n);
35101 break;
35102
35103 default:
35104 gcc_unreachable ();
35105 }
35106 }
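
/* Illustrative example for the routine above (not emitted code): for
   n == 8 in V8SFmode the scalars are first paired into four V2SFmode
   registers, those into two V4SFmode halves, and the halves are then
   concatenated:

     {a,b} {c,d} {e,f} {g,h}  ->  {a,b,c,d} {e,f,g,h}  ->  {a,...,h}  */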
35107
35108 /* A subroutine of ix86_expand_vector_init_general. Use vector
35109 interleave to handle the most general case: all values variable,
35110 and none identical. */
35111
35112 static void
35113 ix86_expand_vector_init_interleave (enum machine_mode mode,
35114 rtx target, rtx *ops, int n)
35115 {
35116 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
35117 int i, j;
35118 rtx op0, op1;
35119 rtx (*gen_load_even) (rtx, rtx, rtx);
35120 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
35121 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
35122
35123 switch (mode)
35124 {
35125 case V8HImode:
35126 gen_load_even = gen_vec_setv8hi;
35127 gen_interleave_first_low = gen_vec_interleave_lowv4si;
35128 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35129 inner_mode = HImode;
35130 first_imode = V4SImode;
35131 second_imode = V2DImode;
35132 third_imode = VOIDmode;
35133 break;
35134 case V16QImode:
35135 gen_load_even = gen_vec_setv16qi;
35136 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
35137 gen_interleave_second_low = gen_vec_interleave_lowv4si;
35138 inner_mode = QImode;
35139 first_imode = V8HImode;
35140 second_imode = V4SImode;
35141 third_imode = V2DImode;
35142 break;
35143 default:
35144 gcc_unreachable ();
35145 }
35146
35147 for (i = 0; i < n; i++)
35148 {
35149 /* Extend the odd element to SImode using a paradoxical SUBREG.  */

35150 op0 = gen_reg_rtx (SImode);
35151 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
35152
35153 /* Insert the SImode value as low element of V4SImode vector. */
35154 op1 = gen_reg_rtx (V4SImode);
35155 op0 = gen_rtx_VEC_MERGE (V4SImode,
35156 gen_rtx_VEC_DUPLICATE (V4SImode,
35157 op0),
35158 CONST0_RTX (V4SImode),
35159 const1_rtx);
35160 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
35161
35162 /* Cast the V4SImode vector back to a vector in the original mode.  */
35163 op0 = gen_reg_rtx (mode);
35164 emit_move_insn (op0, gen_lowpart (mode, op1));
35165
35166 /* Load even elements into the second position.  */
35167 emit_insn (gen_load_even (op0,
35168 force_reg (inner_mode,
35169 ops [i + i + 1]),
35170 const1_rtx));
35171
35172 /* Cast vector to FIRST_IMODE vector. */
35173 ops[i] = gen_reg_rtx (first_imode);
35174 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
35175 }
35176
35177 /* Interleave low FIRST_IMODE vectors. */
35178 for (i = j = 0; i < n; i += 2, j++)
35179 {
35180 op0 = gen_reg_rtx (first_imode);
35181 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
35182
35183 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
35184 ops[j] = gen_reg_rtx (second_imode);
35185 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
35186 }
35187
35188 /* Interleave low SECOND_IMODE vectors. */
35189 switch (second_imode)
35190 {
35191 case V4SImode:
35192 for (i = j = 0; i < n / 2; i += 2, j++)
35193 {
35194 op0 = gen_reg_rtx (second_imode);
35195 emit_insn (gen_interleave_second_low (op0, ops[i],
35196 ops[i + 1]));
35197
35198 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
35199 vector. */
35200 ops[j] = gen_reg_rtx (third_imode);
35201 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
35202 }
35203 second_imode = V2DImode;
35204 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35205 /* FALLTHRU */
35206
35207 case V2DImode:
35208 op0 = gen_reg_rtx (second_imode);
35209 emit_insn (gen_interleave_second_low (op0, ops[0],
35210 ops[1]));
35211
35212 /* Cast the SECOND_IMODE vector back to a vector in the original
35213 mode. */
35214 emit_insn (gen_rtx_SET (VOIDmode, target,
35215 gen_lowpart (mode, op0)));
35216 break;
35217
35218 default:
35219 gcc_unreachable ();
35220 }
35221 }
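
/* Illustrative walk-through for the routine above (not emitted code),
   taking V8HImode with elements e0..e7: each pair (e2i, e2i+1) is first
   placed in the two low HImode lanes of a V4SImode register, adjacent
   registers are then merged with the V4SImode interleave-low (punpckldq),
   and the two results with the V2DImode interleave-low (punpcklqdq),
   yielding e0..e7 in order.  */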
35222
35223 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
35224 all values variable, and none identical. */
35225
35226 static void
35227 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
35228 rtx target, rtx vals)
35229 {
35230 rtx ops[32], op0, op1;
35231 enum machine_mode half_mode = VOIDmode;
35232 int n, i;
35233
35234 switch (mode)
35235 {
35236 case V2SFmode:
35237 case V2SImode:
35238 if (!mmx_ok && !TARGET_SSE)
35239 break;
35240 /* FALLTHRU */
35241
35242 case V8SFmode:
35243 case V8SImode:
35244 case V4DFmode:
35245 case V4DImode:
35246 case V4SFmode:
35247 case V4SImode:
35248 case V2DFmode:
35249 case V2DImode:
35250 n = GET_MODE_NUNITS (mode);
35251 for (i = 0; i < n; i++)
35252 ops[i] = XVECEXP (vals, 0, i);
35253 ix86_expand_vector_init_concat (mode, target, ops, n);
35254 return;
35255
35256 case V32QImode:
35257 half_mode = V16QImode;
35258 goto half;
35259
35260 case V16HImode:
35261 half_mode = V8HImode;
35262 goto half;
35263
35264 half:
35265 n = GET_MODE_NUNITS (mode);
35266 for (i = 0; i < n; i++)
35267 ops[i] = XVECEXP (vals, 0, i);
35268 op0 = gen_reg_rtx (half_mode);
35269 op1 = gen_reg_rtx (half_mode);
35270 ix86_expand_vector_init_interleave (half_mode, op0, ops,
35271 n >> 2);
35272 ix86_expand_vector_init_interleave (half_mode, op1,
35273 &ops [n >> 1], n >> 2);
35274 emit_insn (gen_rtx_SET (VOIDmode, target,
35275 gen_rtx_VEC_CONCAT (mode, op0, op1)));
35276 return;
35277
35278 case V16QImode:
35279 if (!TARGET_SSE4_1)
35280 break;
35281 /* FALLTHRU */
35282
35283 case V8HImode:
35284 if (!TARGET_SSE2)
35285 break;
35286
35287 /* Don't use ix86_expand_vector_init_interleave if we can't
35288 move from GPR to SSE register directly. */
35289 if (!TARGET_INTER_UNIT_MOVES)
35290 break;
35291
35292 n = GET_MODE_NUNITS (mode);
35293 for (i = 0; i < n; i++)
35294 ops[i] = XVECEXP (vals, 0, i);
35295 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
35296 return;
35297
35298 case V4HImode:
35299 case V8QImode:
35300 break;
35301
35302 default:
35303 gcc_unreachable ();
35304 }
35305
35306 {
35307 int i, j, n_elts, n_words, n_elt_per_word;
35308 enum machine_mode inner_mode;
35309 rtx words[4], shift;
35310
35311 inner_mode = GET_MODE_INNER (mode);
35312 n_elts = GET_MODE_NUNITS (mode);
35313 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
35314 n_elt_per_word = n_elts / n_words;
35315 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
35316
35317 for (i = 0; i < n_words; ++i)
35318 {
35319 rtx word = NULL_RTX;
35320
35321 for (j = 0; j < n_elt_per_word; ++j)
35322 {
35323 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
35324 elt = convert_modes (word_mode, inner_mode, elt, true);
35325
35326 if (j == 0)
35327 word = elt;
35328 else
35329 {
35330 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
35331 word, 1, OPTAB_LIB_WIDEN);
35332 word = expand_simple_binop (word_mode, IOR, word, elt,
35333 word, 1, OPTAB_LIB_WIDEN);
35334 }
35335 }
35336
35337 words[i] = word;
35338 }
35339
35340 if (n_words == 1)
35341 emit_move_insn (target, gen_lowpart (mode, words[0]));
35342 else if (n_words == 2)
35343 {
35344 rtx tmp = gen_reg_rtx (mode);
35345 emit_clobber (tmp);
35346 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
35347 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
35348 emit_move_insn (target, tmp);
35349 }
35350 else if (n_words == 4)
35351 {
35352 rtx tmp = gen_reg_rtx (V4SImode);
35353 gcc_assert (word_mode == SImode);
35354 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
35355 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
35356 emit_move_insn (target, gen_lowpart (mode, tmp));
35357 }
35358 else
35359 gcc_unreachable ();
35360 }
35361 }
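
/* Illustrative scalar sketch of the word-building fallback at the end of
   the routine above (not emitted code), for V4HImode elements e0..e3
   with a 32-bit word_mode:

     word0 = (e1 << 16) | e0;
     word1 = (e3 << 16) | e2;

   word0 and word1 then become the low and high halves of the vector
   register.  */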
35362
35363 /* Initialize vector TARGET via VALS. Suppress the use of MMX
35364 instructions unless MMX_OK is true. */
35365
35366 void
35367 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
35368 {
35369 enum machine_mode mode = GET_MODE (target);
35370 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35371 int n_elts = GET_MODE_NUNITS (mode);
35372 int n_var = 0, one_var = -1;
35373 bool all_same = true, all_const_zero = true;
35374 int i;
35375 rtx x;
35376
35377 for (i = 0; i < n_elts; ++i)
35378 {
35379 x = XVECEXP (vals, 0, i);
35380 if (!(CONST_INT_P (x)
35381 || GET_CODE (x) == CONST_DOUBLE
35382 || GET_CODE (x) == CONST_FIXED))
35383 n_var++, one_var = i;
35384 else if (x != CONST0_RTX (inner_mode))
35385 all_const_zero = false;
35386 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
35387 all_same = false;
35388 }
35389
35390 /* Constants are best loaded from the constant pool. */
35391 if (n_var == 0)
35392 {
35393 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
35394 return;
35395 }
35396
35397 /* If all values are identical, broadcast the value. */
35398 if (all_same
35399 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
35400 XVECEXP (vals, 0, 0)))
35401 return;
35402
35403 /* Values where only one field is non-constant are best loaded from
35404 the pool and overwritten via move later. */
35405 if (n_var == 1)
35406 {
35407 if (all_const_zero
35408 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
35409 XVECEXP (vals, 0, one_var),
35410 one_var))
35411 return;
35412
35413 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
35414 return;
35415 }
35416
35417 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
35418 }
35419
35420 void
35421 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
35422 {
35423 enum machine_mode mode = GET_MODE (target);
35424 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35425 enum machine_mode half_mode;
35426 bool use_vec_merge = false;
35427 rtx tmp;
35428 static rtx (*gen_extract[6][2]) (rtx, rtx)
35429 = {
35430 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
35431 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
35432 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
35433 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
35434 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
35435 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
35436 };
35437 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
35438 = {
35439 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
35440 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
35441 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
35442 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
35443 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
35444 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
35445 };
35446 int i, j, n;
35447
35448 switch (mode)
35449 {
35450 case V2SFmode:
35451 case V2SImode:
35452 if (mmx_ok)
35453 {
35454 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35455 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
35456 if (elt == 0)
35457 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35458 else
35459 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35460 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35461 return;
35462 }
35463 break;
35464
35465 case V2DImode:
35466 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
35467 if (use_vec_merge)
35468 break;
35469
35470 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35471 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
35472 if (elt == 0)
35473 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35474 else
35475 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35476 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35477 return;
35478
35479 case V2DFmode:
35480 {
35481 rtx op0, op1;
35482
35483 /* For the two element vectors, we implement a VEC_CONCAT with
35484 the extraction of the other element. */
35485
35486 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
35487 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
35488
35489 if (elt == 0)
35490 op0 = val, op1 = tmp;
35491 else
35492 op0 = tmp, op1 = val;
35493
35494 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
35495 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35496 }
35497 return;
35498
35499 case V4SFmode:
35500 use_vec_merge = TARGET_SSE4_1;
35501 if (use_vec_merge)
35502 break;
35503
35504 switch (elt)
35505 {
35506 case 0:
35507 use_vec_merge = true;
35508 break;
35509
35510 case 1:
35511 /* tmp = target = A B C D */
35512 tmp = copy_to_reg (target);
35513 /* target = A A B B */
35514 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
35515 /* target = X A B B */
35516 ix86_expand_vector_set (false, target, val, 0);
35517 /* target = A X C D */
35518 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35519 const1_rtx, const0_rtx,
35520 GEN_INT (2+4), GEN_INT (3+4)));
35521 return;
35522
35523 case 2:
35524 /* tmp = target = A B C D */
35525 tmp = copy_to_reg (target);
35526 /* tmp = X B C D */
35527 ix86_expand_vector_set (false, tmp, val, 0);
35528 /* target = A B X D */
35529 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35530 const0_rtx, const1_rtx,
35531 GEN_INT (0+4), GEN_INT (3+4)));
35532 return;
35533
35534 case 3:
35535 /* tmp = target = A B C D */
35536 tmp = copy_to_reg (target);
35537 /* tmp = X B C D */
35538 ix86_expand_vector_set (false, tmp, val, 0);
35539 /* target = A B C X */
35540 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35541 const0_rtx, const1_rtx,
35542 GEN_INT (2+4), GEN_INT (0+4)));
35543 return;
35544
35545 default:
35546 gcc_unreachable ();
35547 }
35548 break;
35549
35550 case V4SImode:
35551 use_vec_merge = TARGET_SSE4_1;
35552 if (use_vec_merge)
35553 break;
35554
35555 /* Element 0 handled by vec_merge below. */
35556 if (elt == 0)
35557 {
35558 use_vec_merge = true;
35559 break;
35560 }
35561
35562 if (TARGET_SSE2)
35563 {
35564 /* With SSE2, use integer shuffles to swap element 0 and ELT,
35565 store into element 0, then shuffle them back. */
35566
35567 rtx order[4];
35568
35569 order[0] = GEN_INT (elt);
35570 order[1] = const1_rtx;
35571 order[2] = const2_rtx;
35572 order[3] = GEN_INT (3);
35573 order[elt] = const0_rtx;
35574
35575 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35576 order[1], order[2], order[3]));
35577
35578 ix86_expand_vector_set (false, target, val, 0);
35579
35580 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35581 order[1], order[2], order[3]));
35582 }
35583 else
35584 {
35585 /* For SSE1, we have to reuse the V4SF code. */
35586 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
35587 gen_lowpart (SFmode, val), elt);
35588 }
35589 return;
35590
35591 case V8HImode:
35592 use_vec_merge = TARGET_SSE2;
35593 break;
35594 case V4HImode:
35595 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35596 break;
35597
35598 case V16QImode:
35599 use_vec_merge = TARGET_SSE4_1;
35600 break;
35601
35602 case V8QImode:
35603 break;
35604
35605 case V32QImode:
35606 half_mode = V16QImode;
35607 j = 0;
35608 n = 16;
35609 goto half;
35610
35611 case V16HImode:
35612 half_mode = V8HImode;
35613 j = 1;
35614 n = 8;
35615 goto half;
35616
35617 case V8SImode:
35618 half_mode = V4SImode;
35619 j = 2;
35620 n = 4;
35621 goto half;
35622
35623 case V4DImode:
35624 half_mode = V2DImode;
35625 j = 3;
35626 n = 2;
35627 goto half;
35628
35629 case V8SFmode:
35630 half_mode = V4SFmode;
35631 j = 4;
35632 n = 4;
35633 goto half;
35634
35635 case V4DFmode:
35636 half_mode = V2DFmode;
35637 j = 5;
35638 n = 2;
35639 goto half;
35640
35641 half:
35642 /* Compute offset. */
35643 i = elt / n;
35644 elt %= n;
35645
35646 gcc_assert (i <= 1);
35647
35648 /* Extract the half. */
35649 tmp = gen_reg_rtx (half_mode);
35650 emit_insn (gen_extract[j][i] (tmp, target));
35651
35652 /* Put val in tmp at elt. */
35653 ix86_expand_vector_set (false, tmp, val, elt);
35654
35655 /* Put it back. */
35656 emit_insn (gen_insert[j][i] (target, target, tmp));
35657 return;
35658
35659 default:
35660 break;
35661 }
35662
35663 if (use_vec_merge)
35664 {
35665 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
35666 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
35667 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35668 }
35669 else
35670 {
35671 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35672
35673 emit_move_insn (mem, target);
35674
35675 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35676 emit_move_insn (tmp, val);
35677
35678 emit_move_insn (target, mem);
35679 }
35680 }
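
/* In the use_vec_merge path of the routine above, the RTL emitted is,
   schematically (illustrative, not a literal dump):

     (set target (vec_merge (vec_duplicate val) target (const_int 1<<elt)))

   i.e. lane ELT is taken from VAL and every other lane keeps its value
   from TARGET; the final else branch instead spills TARGET to a stack
   temporary, stores VAL at the element's byte offset and reloads.  */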
35681
35682 void
35683 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
35684 {
35685 enum machine_mode mode = GET_MODE (vec);
35686 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35687 bool use_vec_extr = false;
35688 rtx tmp;
35689
35690 switch (mode)
35691 {
35692 case V2SImode:
35693 case V2SFmode:
35694 if (!mmx_ok)
35695 break;
35696 /* FALLTHRU */
35697
35698 case V2DFmode:
35699 case V2DImode:
35700 use_vec_extr = true;
35701 break;
35702
35703 case V4SFmode:
35704 use_vec_extr = TARGET_SSE4_1;
35705 if (use_vec_extr)
35706 break;
35707
35708 switch (elt)
35709 {
35710 case 0:
35711 tmp = vec;
35712 break;
35713
35714 case 1:
35715 case 3:
35716 tmp = gen_reg_rtx (mode);
35717 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
35718 GEN_INT (elt), GEN_INT (elt),
35719 GEN_INT (elt+4), GEN_INT (elt+4)));
35720 break;
35721
35722 case 2:
35723 tmp = gen_reg_rtx (mode);
35724 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
35725 break;
35726
35727 default:
35728 gcc_unreachable ();
35729 }
35730 vec = tmp;
35731 use_vec_extr = true;
35732 elt = 0;
35733 break;
35734
35735 case V4SImode:
35736 use_vec_extr = TARGET_SSE4_1;
35737 if (use_vec_extr)
35738 break;
35739
35740 if (TARGET_SSE2)
35741 {
35742 switch (elt)
35743 {
35744 case 0:
35745 tmp = vec;
35746 break;
35747
35748 case 1:
35749 case 3:
35750 tmp = gen_reg_rtx (mode);
35751 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
35752 GEN_INT (elt), GEN_INT (elt),
35753 GEN_INT (elt), GEN_INT (elt)));
35754 break;
35755
35756 case 2:
35757 tmp = gen_reg_rtx (mode);
35758 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
35759 break;
35760
35761 default:
35762 gcc_unreachable ();
35763 }
35764 vec = tmp;
35765 use_vec_extr = true;
35766 elt = 0;
35767 }
35768 else
35769 {
35770 /* For SSE1, we have to reuse the V4SF code. */
35771 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
35772 gen_lowpart (V4SFmode, vec), elt);
35773 return;
35774 }
35775 break;
35776
35777 case V8HImode:
35778 use_vec_extr = TARGET_SSE2;
35779 break;
35780 case V4HImode:
35781 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35782 break;
35783
35784 case V16QImode:
35785 use_vec_extr = TARGET_SSE4_1;
35786 break;
35787
35788 case V8SFmode:
35789 if (TARGET_AVX)
35790 {
35791 tmp = gen_reg_rtx (V4SFmode);
35792 if (elt < 4)
35793 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
35794 else
35795 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
35796 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35797 return;
35798 }
35799 break;
35800
35801 case V4DFmode:
35802 if (TARGET_AVX)
35803 {
35804 tmp = gen_reg_rtx (V2DFmode);
35805 if (elt < 2)
35806 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
35807 else
35808 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
35809 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35810 return;
35811 }
35812 break;
35813
35814 case V32QImode:
35815 if (TARGET_AVX)
35816 {
35817 tmp = gen_reg_rtx (V16QImode);
35818 if (elt < 16)
35819 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
35820 else
35821 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
35822 ix86_expand_vector_extract (false, target, tmp, elt & 15);
35823 return;
35824 }
35825 break;
35826
35827 case V16HImode:
35828 if (TARGET_AVX)
35829 {
35830 tmp = gen_reg_rtx (V8HImode);
35831 if (elt < 8)
35832 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
35833 else
35834 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
35835 ix86_expand_vector_extract (false, target, tmp, elt & 7);
35836 return;
35837 }
35838 break;
35839
35840 case V8SImode:
35841 if (TARGET_AVX)
35842 {
35843 tmp = gen_reg_rtx (V4SImode);
35844 if (elt < 4)
35845 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
35846 else
35847 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
35848 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35849 return;
35850 }
35851 break;
35852
35853 case V4DImode:
35854 if (TARGET_AVX)
35855 {
35856 tmp = gen_reg_rtx (V2DImode);
35857 if (elt < 2)
35858 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
35859 else
35860 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
35861 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35862 return;
35863 }
35864 break;
35865
35866 case V8QImode:
35867 /* ??? Could extract the appropriate HImode element and shift. */
35868 default:
35869 break;
35870 }
35871
35872 if (use_vec_extr)
35873 {
35874 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
35875 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
35876
35877 /* Let the rtl optimizers know about the zero extension performed. */
35878 if (inner_mode == QImode || inner_mode == HImode)
35879 {
35880 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
35881 target = gen_lowpart (SImode, target);
35882 }
35883
35884 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35885 }
35886 else
35887 {
35888 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35889
35890 emit_move_insn (mem, vec);
35891
35892 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35893 emit_move_insn (target, tmp);
35894 }
35895 }
35896
35897 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
35898 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
35899 The upper bits of DEST are undefined, though they shouldn't cause
35900 exceptions (some bits from src or all zeros are ok). */
35901
35902 static void
35903 emit_reduc_half (rtx dest, rtx src, int i)
35904 {
35905 rtx tem;
35906 switch (GET_MODE (src))
35907 {
35908 case V4SFmode:
35909 if (i == 128)
35910 tem = gen_sse_movhlps (dest, src, src);
35911 else
35912 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
35913 GEN_INT (1 + 4), GEN_INT (1 + 4));
35914 break;
35915 case V2DFmode:
35916 tem = gen_vec_interleave_highv2df (dest, src, src);
35917 break;
35918 case V16QImode:
35919 case V8HImode:
35920 case V4SImode:
35921 case V2DImode:
35922 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
35923 gen_lowpart (V1TImode, src),
35924 GEN_INT (i / 2));
35925 break;
35926 case V8SFmode:
35927 if (i == 256)
35928 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
35929 else
35930 tem = gen_avx_shufps256 (dest, src, src,
35931 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
35932 break;
35933 case V4DFmode:
35934 if (i == 256)
35935 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
35936 else
35937 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
35938 break;
35939 case V32QImode:
35940 case V16HImode:
35941 case V8SImode:
35942 case V4DImode:
35943 if (i == 256)
35944 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
35945 gen_lowpart (V4DImode, src),
35946 gen_lowpart (V4DImode, src),
35947 const1_rtx);
35948 else
35949 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
35950 gen_lowpart (V2TImode, src),
35951 GEN_INT (i / 2));
35952 break;
35953 default:
35954 gcc_unreachable ();
35955 }
35956 emit_insn (tem);
35957 }
35958
35959 /* Expand a vector reduction. FN is the binary pattern to reduce;
35960 DEST is the destination; IN is the input vector. */
35961
35962 void
35963 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
35964 {
35965 rtx half, dst, vec = in;
35966 enum machine_mode mode = GET_MODE (in);
35967 int i;
35968
35969 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
35970 if (TARGET_SSE4_1
35971 && mode == V8HImode
35972 && fn == gen_uminv8hi3)
35973 {
35974 emit_insn (gen_sse4_1_phminposuw (dest, in));
35975 return;
35976 }
35977
35978 for (i = GET_MODE_BITSIZE (mode);
35979 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
35980 i >>= 1)
35981 {
35982 half = gen_reg_rtx (mode);
35983 emit_reduc_half (half, vec, i);
35984 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
35985 dst = dest;
35986 else
35987 dst = gen_reg_rtx (mode);
35988 emit_insn (fn (dst, half, vec));
35989 vec = dst;
35990 }
35991 }
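
/* Illustrative walk-through of the reduction loop above (not emitted
   code), for a V4SFmode input {a,b,c,d} with FN an elementwise max:

     i == 128: half = {c,d,...};        vec  = fn -> {max(a,c), max(b,d), ...}
     i ==  64: half = {max(b,d),...};   dest = fn -> the maximum of all four
               elements in lane 0 (the remaining lanes are don't-care).  */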
35992 \f
35993 /* Target hook for scalar_mode_supported_p. */
35994 static bool
35995 ix86_scalar_mode_supported_p (enum machine_mode mode)
35996 {
35997 if (DECIMAL_FLOAT_MODE_P (mode))
35998 return default_decimal_float_supported_p ();
35999 else if (mode == TFmode)
36000 return true;
36001 else
36002 return default_scalar_mode_supported_p (mode);
36003 }
36004
36005 /* Implements target hook vector_mode_supported_p. */
36006 static bool
36007 ix86_vector_mode_supported_p (enum machine_mode mode)
36008 {
36009 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36010 return true;
36011 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36012 return true;
36013 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36014 return true;
36015 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36016 return true;
36017 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36018 return true;
36019 return false;
36020 }
36021
36022 /* Target hook for c_mode_for_suffix. */
36023 static enum machine_mode
36024 ix86_c_mode_for_suffix (char suffix)
36025 {
36026 if (suffix == 'q')
36027 return TFmode;
36028 if (suffix == 'w')
36029 return XFmode;
36030
36031 return VOIDmode;
36032 }
36033
36034 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36035
36036 We do this in the new i386 backend to maintain source compatibility
36037 with the old cc0-based compiler. */
36038
36039 static tree
36040 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36041 tree inputs ATTRIBUTE_UNUSED,
36042 tree clobbers)
36043 {
36044 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36045 clobbers);
36046 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36047 clobbers);
36048 return clobbers;
36049 }
36050
36051 /* Implements target vector targetm.asm.encode_section_info. */
36052
36053 static void ATTRIBUTE_UNUSED
36054 ix86_encode_section_info (tree decl, rtx rtl, int first)
36055 {
36056 default_encode_section_info (decl, rtl, first);
36057
36058 if (TREE_CODE (decl) == VAR_DECL
36059 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
36060 && ix86_in_large_data_p (decl))
36061 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
36062 }
36063
36064 /* Worker function for REVERSE_CONDITION. */
36065
36066 enum rtx_code
36067 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
36068 {
36069 return (mode != CCFPmode && mode != CCFPUmode
36070 ? reverse_condition (code)
36071 : reverse_condition_maybe_unordered (code));
36072 }
36073
36074 /* Output code to perform an x87 FP register move, from OPERANDS[1]
36075 to OPERANDS[0]. */
36076
36077 const char *
36078 output_387_reg_move (rtx insn, rtx *operands)
36079 {
36080 if (REG_P (operands[0]))
36081 {
36082 if (REG_P (operands[1])
36083 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36084 {
36085 if (REGNO (operands[0]) == FIRST_STACK_REG)
36086 return output_387_ffreep (operands, 0);
36087 return "fstp\t%y0";
36088 }
36089 if (STACK_TOP_P (operands[0]))
36090 return "fld%Z1\t%y1";
36091 return "fst\t%y0";
36092 }
36093 else if (MEM_P (operands[0]))
36094 {
36095 gcc_assert (REG_P (operands[1]));
36096 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36097 return "fstp%Z0\t%y0";
36098 else
36099 {
36100 /* There is no non-popping store to memory for XFmode.
36101 So if we need one, follow the store with a load. */
36102 if (GET_MODE (operands[0]) == XFmode)
36103 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
36104 else
36105 return "fst%Z0\t%y0";
36106 }
36107 }
36108 else
36109 gcc_unreachable ();
36110 }
36111
36112 /* Output code to perform a conditional jump to LABEL, if C2 flag in
36113 FP status register is set. */
36114
36115 void
36116 ix86_emit_fp_unordered_jump (rtx label)
36117 {
36118 rtx reg = gen_reg_rtx (HImode);
36119 rtx temp;
36120
36121 emit_insn (gen_x86_fnstsw_1 (reg));
36122
36123 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
36124 {
36125 emit_insn (gen_x86_sahf_1 (reg));
36126
36127 temp = gen_rtx_REG (CCmode, FLAGS_REG);
36128 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
36129 }
36130 else
36131 {
36132 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
36133
36134 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
36135 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
36136 }
36137
36138 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
36139 gen_rtx_LABEL_REF (VOIDmode, label),
36140 pc_rtx);
36141 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
36142
36143 emit_jump_insn (temp);
36144 predict_jump (REG_BR_PROB_BASE * 10 / 100);
36145 }
36146
36147 /* Output code to perform a log1p XFmode calculation. */
36148
36149 void ix86_emit_i387_log1p (rtx op0, rtx op1)
36150 {
36151 rtx label1 = gen_label_rtx ();
36152 rtx label2 = gen_label_rtx ();
36153
36154 rtx tmp = gen_reg_rtx (XFmode);
36155 rtx tmp2 = gen_reg_rtx (XFmode);
36156 rtx test;
36157
36158 emit_insn (gen_absxf2 (tmp, op1));
36159 test = gen_rtx_GE (VOIDmode, tmp,
36160 CONST_DOUBLE_FROM_REAL_VALUE (
36161 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
36162 XFmode));
36163 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
36164
36165 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36166 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
36167 emit_jump (label2);
36168
36169 emit_label (label1);
36170 emit_move_insn (tmp, CONST1_RTX (XFmode));
36171 emit_insn (gen_addxf3 (tmp, op1, tmp));
36172 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36173 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
36174
36175 emit_label (label2);
36176 }
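
/* Illustrative scalar sketch of the sequence above (not part of the
   compiler; fyl2xp1 (x, y) and fyl2x (x, y) stand for the x87
   instructions computing y * log2 (x + 1) and y * log2 (x), and ln2 is
   the fldln2 constant):

     double log1p_sketch (double x)
     {
       if (fabs (x) < 0.29289321881345247561810596348408353)
         return fyl2xp1 (x, ln2);
       return fyl2x (1.0 + x, ln2);
     }

   The threshold is 1 - sqrt(2)/2, below which fyl2xp1 keeps full
   precision.  */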
36177
36178 /* Emit code for round calculation. */
36179 void ix86_emit_i387_round (rtx op0, rtx op1)
36180 {
36181 enum machine_mode inmode = GET_MODE (op1);
36182 enum machine_mode outmode = GET_MODE (op0);
36183 rtx e1, e2, res, tmp, tmp1, half;
36184 rtx scratch = gen_reg_rtx (HImode);
36185 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
36186 rtx jump_label = gen_label_rtx ();
36187 rtx insn;
36188 rtx (*gen_abs) (rtx, rtx);
36189 rtx (*gen_neg) (rtx, rtx);
36190
36191 switch (inmode)
36192 {
36193 case SFmode:
36194 gen_abs = gen_abssf2;
36195 break;
36196 case DFmode:
36197 gen_abs = gen_absdf2;
36198 break;
36199 case XFmode:
36200 gen_abs = gen_absxf2;
36201 break;
36202 default:
36203 gcc_unreachable ();
36204 }
36205
36206 switch (outmode)
36207 {
36208 case SFmode:
36209 gen_neg = gen_negsf2;
36210 break;
36211 case DFmode:
36212 gen_neg = gen_negdf2;
36213 break;
36214 case XFmode:
36215 gen_neg = gen_negxf2;
36216 break;
36217 case HImode:
36218 gen_neg = gen_neghi2;
36219 break;
36220 case SImode:
36221 gen_neg = gen_negsi2;
36222 break;
36223 case DImode:
36224 gen_neg = gen_negdi2;
36225 break;
36226 default:
36227 gcc_unreachable ();
36228 }
36229
36230 e1 = gen_reg_rtx (inmode);
36231 e2 = gen_reg_rtx (inmode);
36232 res = gen_reg_rtx (outmode);
36233
36234 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
36235
36236 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
36237
36238 /* scratch = fxam(op1) */
36239 emit_insn (gen_rtx_SET (VOIDmode, scratch,
36240 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
36241 UNSPEC_FXAM)));
36242 /* e1 = fabs(op1) */
36243 emit_insn (gen_abs (e1, op1));
36244
36245 /* e2 = e1 + 0.5 */
36246 half = force_reg (inmode, half);
36247 emit_insn (gen_rtx_SET (VOIDmode, e2,
36248 gen_rtx_PLUS (inmode, e1, half)));
36249
36250 /* res = floor(e2) */
36251 if (inmode != XFmode)
36252 {
36253 tmp1 = gen_reg_rtx (XFmode);
36254
36255 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
36256 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
36257 }
36258 else
36259 tmp1 = e2;
36260
36261 switch (outmode)
36262 {
36263 case SFmode:
36264 case DFmode:
36265 {
36266 rtx tmp0 = gen_reg_rtx (XFmode);
36267
36268 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
36269
36270 emit_insn (gen_rtx_SET (VOIDmode, res,
36271 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
36272 UNSPEC_TRUNC_NOOP)));
36273 }
36274 break;
36275 case XFmode:
36276 emit_insn (gen_frndintxf2_floor (res, tmp1));
36277 break;
36278 case HImode:
36279 emit_insn (gen_lfloorxfhi2 (res, tmp1));
36280 break;
36281 case SImode:
36282 emit_insn (gen_lfloorxfsi2 (res, tmp1));
36283 break;
36284 case DImode:
36285 emit_insn (gen_lfloorxfdi2 (res, tmp1));
36286 break;
36287 default:
36288 gcc_unreachable ();
36289 }
36290
36291 /* flags = signbit(a) */
36292 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
36293
36294 /* if (flags) then res = -res */
36295 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
36296 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
36297 gen_rtx_LABEL_REF (VOIDmode, jump_label),
36298 pc_rtx);
36299 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36300 predict_jump (REG_BR_PROB_BASE * 50 / 100);
36301 JUMP_LABEL (insn) = jump_label;
36302
36303 emit_insn (gen_neg (res, res));
36304
36305 emit_label (jump_label);
36306 LABEL_NUSES (jump_label) = 1;
36307
36308 emit_move_insn (op0, res);
36309 }
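
/* Illustrative scalar equivalent of the sequence above (not emitted
   code): the fxam result supplies the sign of OP1, and

     round (a) = copysign (floor (fabs (a) + 0.5), a)

   is realized by flooring the unsigned sum and conditionally negating
   the result when the saved sign bit was set.  */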
36310
36311 /* Output code to perform a Newton-Raphson approximation of a single precision
36312 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
36313
36314 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
36315 {
36316 rtx x0, x1, e0, e1;
36317
36318 x0 = gen_reg_rtx (mode);
36319 e0 = gen_reg_rtx (mode);
36320 e1 = gen_reg_rtx (mode);
36321 x1 = gen_reg_rtx (mode);
36322
36323 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
36324
36325 b = force_reg (mode, b);
36326
36327 /* x0 = rcp(b) estimate */
36328 emit_insn (gen_rtx_SET (VOIDmode, x0,
36329 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
36330 UNSPEC_RCP)));
36331 /* e0 = x0 * b */
36332 emit_insn (gen_rtx_SET (VOIDmode, e0,
36333 gen_rtx_MULT (mode, x0, b)));
36334
36335 /* e0 = x0 * e0 */
36336 emit_insn (gen_rtx_SET (VOIDmode, e0,
36337 gen_rtx_MULT (mode, x0, e0)));
36338
36339 /* e1 = x0 + x0 */
36340 emit_insn (gen_rtx_SET (VOIDmode, e1,
36341 gen_rtx_PLUS (mode, x0, x0)));
36342
36343 /* x1 = e1 - e0 */
36344 emit_insn (gen_rtx_SET (VOIDmode, x1,
36345 gen_rtx_MINUS (mode, e1, e0)));
36346
36347 /* res = a * x1 */
36348 emit_insn (gen_rtx_SET (VOIDmode, res,
36349 gen_rtx_MULT (mode, a, x1)));
36350 }
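
/* Illustrative scalar sketch of the Newton-Raphson step above (not part
   of the compiler; rcp_estimate stands for the ~12-bit rcpss/rcpps
   estimate, and x1 = 2*x0 - b*x0*x0 is one refinement step):

     float swdiv_sketch (float a, float b)
     {
       float x0 = rcp_estimate (b);
       float e0 = (x0 * b) * x0;
       float x1 = (x0 + x0) - e0;
       return a * x1;
     }  */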
36351
36352 /* Output code to perform a Newton-Raphson approximation of a
36353 single precision floating point [reciprocal] square root. */
36354
36355 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
36356 bool recip)
36357 {
36358 rtx x0, e0, e1, e2, e3, mthree, mhalf;
36359 REAL_VALUE_TYPE r;
36360
36361 x0 = gen_reg_rtx (mode);
36362 e0 = gen_reg_rtx (mode);
36363 e1 = gen_reg_rtx (mode);
36364 e2 = gen_reg_rtx (mode);
36365 e3 = gen_reg_rtx (mode);
36366
36367 real_from_integer (&r, VOIDmode, -3, -1, 0);
36368 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36369
36370 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
36371 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36372
36373 if (VECTOR_MODE_P (mode))
36374 {
36375 mthree = ix86_build_const_vector (mode, true, mthree);
36376 mhalf = ix86_build_const_vector (mode, true, mhalf);
36377 }
36378
36379 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
36380 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
36381
36382 a = force_reg (mode, a);
36383
36384 /* x0 = rsqrt(a) estimate */
36385 emit_insn (gen_rtx_SET (VOIDmode, x0,
36386 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
36387 UNSPEC_RSQRT)));
36388
36389 /* If a == 0.0, mask out the infinite rsqrt (a) estimate to prevent a NaN result for sqrt (0.0). */
36390 if (!recip)
36391 {
36392 rtx zero, mask;
36393
36394 zero = gen_reg_rtx (mode);
36395 mask = gen_reg_rtx (mode);
36396
36397 zero = force_reg (mode, CONST0_RTX(mode));
36398 emit_insn (gen_rtx_SET (VOIDmode, mask,
36399 gen_rtx_NE (mode, zero, a)));
36400
36401 emit_insn (gen_rtx_SET (VOIDmode, x0,
36402 gen_rtx_AND (mode, x0, mask)));
36403 }
36404
36405 /* e0 = x0 * a */
36406 emit_insn (gen_rtx_SET (VOIDmode, e0,
36407 gen_rtx_MULT (mode, x0, a)));
36408 /* e1 = e0 * x0 */
36409 emit_insn (gen_rtx_SET (VOIDmode, e1,
36410 gen_rtx_MULT (mode, e0, x0)));
36411
36412 /* e2 = e1 - 3. */
36413 mthree = force_reg (mode, mthree);
36414 emit_insn (gen_rtx_SET (VOIDmode, e2,
36415 gen_rtx_PLUS (mode, e1, mthree)));
36416
36417 mhalf = force_reg (mode, mhalf);
36418 if (recip)
36419 /* e3 = -.5 * x0 */
36420 emit_insn (gen_rtx_SET (VOIDmode, e3,
36421 gen_rtx_MULT (mode, x0, mhalf)));
36422 else
36423 /* e3 = -.5 * e0 */
36424 emit_insn (gen_rtx_SET (VOIDmode, e3,
36425 gen_rtx_MULT (mode, e0, mhalf)));
36426 /* ret = e2 * e3 */
36427 emit_insn (gen_rtx_SET (VOIDmode, res,
36428 gen_rtx_MULT (mode, e2, e3)));
36429 }
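
/* Illustrative scalar sketch of the sequence above (not part of the
   compiler; rsqrt_estimate stands for the rsqrtss/rsqrtps estimate and
   the a == 0.0 masking is omitted):

     float swsqrt_sketch (float a, bool recip)
     {
       float x0 = rsqrt_estimate (a);
       float e0 = x0 * a;
       float e2 = e0 * x0 - 3.0f;
       float e3 = -0.5f * (recip ? x0 : e0);
       return e2 * e3;
     }

   which is the Newton-Raphson step x1 = -0.5 * x0 * (a*x0*x0 - 3),
   optionally multiplied by A to turn the reciprocal root into sqrt.  */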
36430
36431 #ifdef TARGET_SOLARIS
36432 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
36433
36434 static void
36435 i386_solaris_elf_named_section (const char *name, unsigned int flags,
36436 tree decl)
36437 {
36438 /* With Binutils 2.15, the "@unwind" marker must be specified on
36439 every occurrence of the ".eh_frame" section, not just the first
36440 one. */
36441 if (TARGET_64BIT
36442 && strcmp (name, ".eh_frame") == 0)
36443 {
36444 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
36445 flags & SECTION_WRITE ? "aw" : "a");
36446 return;
36447 }
36448
36449 #ifndef USE_GAS
36450 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
36451 {
36452 solaris_elf_asm_comdat_section (name, flags, decl);
36453 return;
36454 }
36455 #endif
36456
36457 default_elf_asm_named_section (name, flags, decl);
36458 }
36459 #endif /* TARGET_SOLARIS */
36460
36461 /* Return the mangling of TYPE if it is an extended fundamental type. */
36462
36463 static const char *
36464 ix86_mangle_type (const_tree type)
36465 {
36466 type = TYPE_MAIN_VARIANT (type);
36467
36468 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
36469 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
36470 return NULL;
36471
36472 switch (TYPE_MODE (type))
36473 {
36474 case TFmode:
36475 /* __float128 is "g". */
36476 return "g";
36477 case XFmode:
36478 /* "long double" or __float80 is "e". */
36479 return "e";
36480 default:
36481 return NULL;
36482 }
36483 }
36484
36485 /* For 32-bit code we can save PIC register setup by using
36486 __stack_chk_fail_local hidden function instead of calling
36487 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
36488 register, so it is better to call __stack_chk_fail directly. */
36489
36490 static tree ATTRIBUTE_UNUSED
36491 ix86_stack_protect_fail (void)
36492 {
36493 return TARGET_64BIT
36494 ? default_external_stack_protect_fail ()
36495 : default_hidden_stack_protect_fail ();
36496 }
36497
36498 /* Select a format to encode pointers in exception handling data. CODE
36499 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
36500 true if the symbol may be affected by dynamic relocations.
36501
36502 ??? All x86 object file formats are capable of representing this.
36503 After all, the relocation needed is the same as for the call insn.
36504 Whether or not a particular assembler allows us to enter such, I
36505 guess we'll have to see. */
36506 int
36507 asm_preferred_eh_data_format (int code, int global)
36508 {
36509 if (flag_pic)
36510 {
36511 int type = DW_EH_PE_sdata8;
36512 if (!TARGET_64BIT
36513 || ix86_cmodel == CM_SMALL_PIC
36514 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
36515 type = DW_EH_PE_sdata4;
36516 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
36517 }
36518 if (ix86_cmodel == CM_SMALL
36519 || (ix86_cmodel == CM_MEDIUM && code))
36520 return DW_EH_PE_udata4;
36521 return DW_EH_PE_absptr;
36522 }
36523 \f
36524 /* Expand copysign from SIGN to the positive value ABS_VALUE
36525 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
36526 the sign-bit. */
36527 static void
36528 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
36529 {
36530 enum machine_mode mode = GET_MODE (sign);
36531 rtx sgn = gen_reg_rtx (mode);
36532 if (mask == NULL_RTX)
36533 {
36534 enum machine_mode vmode;
36535
36536 if (mode == SFmode)
36537 vmode = V4SFmode;
36538 else if (mode == DFmode)
36539 vmode = V2DFmode;
36540 else
36541 vmode = mode;
36542
36543 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
36544 if (!VECTOR_MODE_P (mode))
36545 {
36546 /* We need to generate a scalar mode mask in this case. */
36547 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36548 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36549 mask = gen_reg_rtx (mode);
36550 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36551 }
36552 }
36553 else
36554 mask = gen_rtx_NOT (mode, mask);
36555 emit_insn (gen_rtx_SET (VOIDmode, sgn,
36556 gen_rtx_AND (mode, mask, sign)));
36557 emit_insn (gen_rtx_SET (VOIDmode, result,
36558 gen_rtx_IOR (mode, abs_value, sgn)));
36559 }
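
/* Illustrative bit-level view of the routine above (not emitted code):
   with M the mask holding only the sign bit,

     result = abs_value | (sign & M);

   when the caller supplies MASK it holds the complement ~M (as built by
   ix86_expand_sse_fabs), hence the NOT before the AND.  */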
36560
36561 /* Expand fabs (OP0) and return a new rtx that holds the result. The
36562 mask for masking out the sign-bit is stored in *SMASK, if that is
36563 non-null. */
36564 static rtx
36565 ix86_expand_sse_fabs (rtx op0, rtx *smask)
36566 {
36567 enum machine_mode vmode, mode = GET_MODE (op0);
36568 rtx xa, mask;
36569
36570 xa = gen_reg_rtx (mode);
36571 if (mode == SFmode)
36572 vmode = V4SFmode;
36573 else if (mode == DFmode)
36574 vmode = V2DFmode;
36575 else
36576 vmode = mode;
36577 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
36578 if (!VECTOR_MODE_P (mode))
36579 {
36580 /* We need to generate a scalar mode mask in this case. */
36581 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36582 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36583 mask = gen_reg_rtx (mode);
36584 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36585 }
36586 emit_insn (gen_rtx_SET (VOIDmode, xa,
36587 gen_rtx_AND (mode, op0, mask)));
36588
36589 if (smask)
36590 *smask = mask;
36591
36592 return xa;
36593 }
36594
36595 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
36596 swapping the operands if SWAP_OPERANDS is true. The expanded
36597 code is a forward jump to a newly created label in case the
36598 comparison is true. The generated label rtx is returned. */
36599 static rtx
36600 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
36601 bool swap_operands)
36602 {
36603 rtx label, tmp;
36604
36605 if (swap_operands)
36606 {
36607 tmp = op0;
36608 op0 = op1;
36609 op1 = tmp;
36610 }
36611
36612 label = gen_label_rtx ();
36613 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
36614 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36615 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
36616 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
36617 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
36618 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
36619 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36620 JUMP_LABEL (tmp) = label;
36621
36622 return label;
36623 }
36624
36625 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
36626 using comparison code CODE. Operands are swapped for the comparison if
36627 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
36628 static rtx
36629 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
36630 bool swap_operands)
36631 {
36632 rtx (*insn)(rtx, rtx, rtx, rtx);
36633 enum machine_mode mode = GET_MODE (op0);
36634 rtx mask = gen_reg_rtx (mode);
36635
36636 if (swap_operands)
36637 {
36638 rtx tmp = op0;
36639 op0 = op1;
36640 op1 = tmp;
36641 }
36642
36643 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
36644
36645 emit_insn (insn (mask, op0, op1,
36646 gen_rtx_fmt_ee (code, mode, op0, op1)));
36647 return mask;
36648 }
36649
36650 /* Generate and return a rtx of mode MODE for 2**n where n is the number
36651 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
36652 static rtx
36653 ix86_gen_TWO52 (enum machine_mode mode)
36654 {
36655 REAL_VALUE_TYPE TWO52r;
36656 rtx TWO52;
36657
36658 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
36659 TWO52 = const_double_from_real_value (TWO52r, mode);
36660 TWO52 = force_reg (mode, TWO52);
36661
36662 return TWO52;
36663 }
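
/* The constant built above drives the rounding idiom used by the
   expanders below: once a nonnegative value smaller than 2**52 (2**23
   for SFmode) has 2**52 added to it, the mantissa can no longer hold a
   fractional part, so adding and then subtracting the constant rounds
   to an integer in the current rounding mode.  Illustrative sketch (not
   emitted code; assumes 0 <= x < 2**52 and no unsafe FP reassociation):

     double rint_sketch (double x)
     {
       const double two52 = 4503599627370496.0;
       return (x + two52) - two52;
     }  */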
36664
36665 /* Expand SSE sequence for computing lround from OP1 storing
36666 into OP0. */
36667 void
36668 ix86_expand_lround (rtx op0, rtx op1)
36669 {
36670 /* C code for the stuff we're doing below:
36671 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
36672 return (long)tmp;
36673 */
36674 enum machine_mode mode = GET_MODE (op1);
36675 const struct real_format *fmt;
36676 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36677 rtx adj;
36678
36679 /* load nextafter (0.5, 0.0) */
36680 fmt = REAL_MODE_FORMAT (mode);
36681 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36682 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36683
36684 /* adj = copysign (0.5, op1) */
36685 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
36686 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
36687
36688 /* adj = op1 + adj */
36689 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
36690
36691 /* op0 = (imode)adj */
36692 expand_fix (op0, adj, 0);
36693 }
36694
36695 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
36696 into OP0.  */
36697 void
36698 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
36699 {
36700 /* C code for the stuff we're doing below (for do_floor):
36701 xi = (long)op1;
36702 xi -= (double)xi > op1 ? 1 : 0;
36703 return xi;
36704 */
36705 enum machine_mode fmode = GET_MODE (op1);
36706 enum machine_mode imode = GET_MODE (op0);
36707 rtx ireg, freg, label, tmp;
36708
36709 /* reg = (long)op1 */
36710 ireg = gen_reg_rtx (imode);
36711 expand_fix (ireg, op1, 0);
36712
36713 /* freg = (double)reg */
36714 freg = gen_reg_rtx (fmode);
36715 expand_float (freg, ireg, 0);
36716
36717 /* ireg = (freg > op1) ? ireg - 1 : ireg */
36718 label = ix86_expand_sse_compare_and_jump (UNLE,
36719 freg, op1, !do_floor);
36720 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
36721 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
36722 emit_move_insn (ireg, tmp);
36723
36724 emit_label (label);
36725 LABEL_NUSES (label) = 1;
36726
36727 emit_move_insn (op0, ireg);
36728 }
36729
36730 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
36731 result in OPERAND0. */
36732 void
36733 ix86_expand_rint (rtx operand0, rtx operand1)
36734 {
36735 /* C code for the stuff we're doing below:
36736 xa = fabs (operand1);
36737 if (!isless (xa, 2**52))
36738 return operand1;
36739 xa = xa + 2**52 - 2**52;
36740 return copysign (xa, operand1);
36741 */
36742 enum machine_mode mode = GET_MODE (operand0);
36743 rtx res, xa, label, TWO52, mask;
36744
36745 res = gen_reg_rtx (mode);
36746 emit_move_insn (res, operand1);
36747
36748 /* xa = abs (operand1) */
36749 xa = ix86_expand_sse_fabs (res, &mask);
36750
36751 /* if (!isless (xa, TWO52)) goto label; */
36752 TWO52 = ix86_gen_TWO52 (mode);
36753 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36754
36755 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36756 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36757
36758 ix86_sse_copysign_to_positive (res, xa, res, mask);
36759
36760 emit_label (label);
36761 LABEL_NUSES (label) = 1;
36762
36763 emit_move_insn (operand0, res);
36764 }
36765
36766 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36767 into OPERAND0. */
36768 void
36769 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
36770 {
36771 /* C code for the stuff we expand below.
36772 double xa = fabs (x), x2;
36773 if (!isless (xa, TWO52))
36774 return x;
36775 xa = xa + TWO52 - TWO52;
36776 x2 = copysign (xa, x);
36777 Compensate. Floor:
36778 if (x2 > x)
36779 x2 -= 1;
36780 Compensate. Ceil:
36781 if (x2 < x)
36782 x2 -= -1;
36783 return x2;
36784 */
36785 enum machine_mode mode = GET_MODE (operand0);
36786 rtx xa, TWO52, tmp, label, one, res, mask;
36787
36788 TWO52 = ix86_gen_TWO52 (mode);
36789
36790 /* Temporary for holding the result, initialized to the input
36791 operand to ease control flow. */
36792 res = gen_reg_rtx (mode);
36793 emit_move_insn (res, operand1);
36794
36795 /* xa = abs (operand1) */
36796 xa = ix86_expand_sse_fabs (res, &mask);
36797
36798 /* if (!isless (xa, TWO52)) goto label; */
36799 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36800
36801 /* xa = xa + TWO52 - TWO52; */
36802 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36803 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36804
36805 /* xa = copysign (xa, operand1) */
36806 ix86_sse_copysign_to_positive (xa, xa, res, mask);
36807
36808 /* generate 1.0 or -1.0 */
36809 one = force_reg (mode,
36810 const_double_from_real_value (do_floor
36811 ? dconst1 : dconstm1, mode));
36812
36813 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36814 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36815 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36816 gen_rtx_AND (mode, one, tmp)));
36817 /* We always need to subtract here to preserve signed zero. */
36818 tmp = expand_simple_binop (mode, MINUS,
36819 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36820 emit_move_insn (res, tmp);
36821
36822 emit_label (label);
36823 LABEL_NUSES (label) = 1;
36824
36825 emit_move_insn (operand0, res);
36826 }
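/* Editor's note on why the unconditional subtraction above preserves
   signed zeros: for operand1 = -0.0 the copysign makes xa = -0.0 and the
   compare mask is all zeros, so res = -0.0 - 0.0, which is -0.0 under
   round-to-nearest; adding 0.0 instead would have produced +0.0.  */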
36827
36828 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36829 into OPERAND0. */
36830 void
36831 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
36832 {
36833 /* C code for the stuff we expand below.
36834 double xa = fabs (x), x2;
36835 if (!isless (xa, TWO52))
36836 return x;
36837 x2 = (double)(long)x;
36838 Compensate. Floor:
36839 if (x2 > x)
36840 x2 -= 1;
36841 Compensate. Ceil:
36842 if (x2 < x)
36843 x2 += 1;
36844 if (HONOR_SIGNED_ZEROS (mode))
36845 return copysign (x2, x);
36846 return x2;
36847 */
36848 enum machine_mode mode = GET_MODE (operand0);
36849 rtx xa, xi, TWO52, tmp, label, one, res, mask;
36850
36851 TWO52 = ix86_gen_TWO52 (mode);
36852
36853 /* Temporary for holding the result, initialized to the input
36854 operand to ease control flow. */
36855 res = gen_reg_rtx (mode);
36856 emit_move_insn (res, operand1);
36857
36858 /* xa = abs (operand1) */
36859 xa = ix86_expand_sse_fabs (res, &mask);
36860
36861 /* if (!isless (xa, TWO52)) goto label; */
36862 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36863
36864 /* xa = (double)(long)x */
36865 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36866 expand_fix (xi, res, 0);
36867 expand_float (xa, xi, 0);
36868
36869 /* generate 1.0 */
36870 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36871
36872 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36873 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36874 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36875 gen_rtx_AND (mode, one, tmp)));
36876 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
36877 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36878 emit_move_insn (res, tmp);
36879
36880 if (HONOR_SIGNED_ZEROS (mode))
36881 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36882
36883 emit_label (label);
36884 LABEL_NUSES (label) = 1;
36885
36886 emit_move_insn (operand0, res);
36887 }
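/* Editor's note on the HONOR_SIGNED_ZEROS copysign above: the integer
   round trip loses the sign of zero.  E.g. for ceil of -0.25 the
   conversion gives (long) -0.25 = 0, no compensation applies, and the
   result would be +0.0 even though ceil (-0.25) is -0.0; copying the sign
   back from operand1 restores it.  */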
36888
36889 /* Expand SSE sequence for computing round from OPERAND1 storing
36890 into OPERAND0. Sequence that works without relying on DImode truncation
36891 via cvttsd2siq that is only available on 64bit targets. */
36892 void
36893 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
36894 {
36895 /* C code for the stuff we expand below.
36896 double xa = fabs (x), xa2, x2;
36897 if (!isless (xa, TWO52))
36898 return x;
36899 Using the absolute value and copying back sign makes
36900 -0.0 -> -0.0 correct.
36901 xa2 = xa + TWO52 - TWO52;
36902 Compensate.
36903 dxa = xa2 - xa;
36904 if (dxa <= -0.5)
36905 xa2 += 1;
36906 else if (dxa > 0.5)
36907 xa2 -= 1;
36908 x2 = copysign (xa2, x);
36909 return x2;
36910 */
36911 enum machine_mode mode = GET_MODE (operand0);
36912 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
36913
36914 TWO52 = ix86_gen_TWO52 (mode);
36915
36916 /* Temporary for holding the result, initialized to the input
36917 operand to ease control flow. */
36918 res = gen_reg_rtx (mode);
36919 emit_move_insn (res, operand1);
36920
36921 /* xa = abs (operand1) */
36922 xa = ix86_expand_sse_fabs (res, &mask);
36923
36924 /* if (!isless (xa, TWO52)) goto label; */
36925 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36926
36927 /* xa2 = xa + TWO52 - TWO52; */
36928 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36929 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
36930
36931 /* dxa = xa2 - xa; */
36932 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
36933
36934 /* generate 0.5, 1.0 and -0.5 */
36935 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
36936 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
36937 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
36938 0, OPTAB_DIRECT);
36939
36940 /* Compensate. */
36941 tmp = gen_reg_rtx (mode);
36942 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
36943 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
36944 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36945 gen_rtx_AND (mode, one, tmp)));
36946 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36947 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
36948 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
36949 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36950 gen_rtx_AND (mode, one, tmp)));
36951 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36952
36953 /* res = copysign (xa2, operand1) */
36954 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
36955
36956 emit_label (label);
36957 LABEL_NUSES (label) = 1;
36958
36959 emit_move_insn (operand0, res);
36960 }
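/* Worked example of the compensation above (editor's illustration): for
   xa = 2.5, xa + TWO52 rounds to TWO52 + 2 under nearest-even, so
   xa2 = 2.0 and dxa = -0.5; the dxa <= -0.5 test fires and xa2 becomes
   3.0 = round (2.5).  For xa = 1.75, xa2 = 2.0 and dxa = 0.25, neither
   test fires, and 2.0 is already the correctly rounded result.  */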
36961
36962 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36963 into OPERAND0. */
36964 void
36965 ix86_expand_trunc (rtx operand0, rtx operand1)
36966 {
36967 /* C code for SSE variant we expand below.
36968 double xa = fabs (x), x2;
36969 if (!isless (xa, TWO52))
36970 return x;
36971 x2 = (double)(long)x;
36972 if (HONOR_SIGNED_ZEROS (mode))
36973 return copysign (x2, x);
36974 return x2;
36975 */
36976 enum machine_mode mode = GET_MODE (operand0);
36977 rtx xa, xi, TWO52, label, res, mask;
36978
36979 TWO52 = ix86_gen_TWO52 (mode);
36980
36981 /* Temporary for holding the result, initialized to the input
36982 operand to ease control flow. */
36983 res = gen_reg_rtx (mode);
36984 emit_move_insn (res, operand1);
36985
36986 /* xa = abs (operand1) */
36987 xa = ix86_expand_sse_fabs (res, &mask);
36988
36989 /* if (!isless (xa, TWO52)) goto label; */
36990 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36991
36992 /* x = (double)(long)x */
36993 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36994 expand_fix (xi, res, 0);
36995 expand_float (res, xi, 0);
36996
36997 if (HONOR_SIGNED_ZEROS (mode))
36998 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36999
37000 emit_label (label);
37001 LABEL_NUSES (label) = 1;
37002
37003 emit_move_insn (operand0, res);
37004 }
37005
37006 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37007 into OPERAND0. */
37008 void
37009 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37010 {
37011 enum machine_mode mode = GET_MODE (operand0);
37012 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37013
37014 /* C code for SSE variant we expand below.
37015 double xa = fabs (x), xa2, x2;
37016 if (!isless (xa, TWO52))
37017 return x;
37018 xa2 = xa + TWO52 - TWO52;
37019 Compensate:
37020 if (xa2 > xa)
37021 xa2 -= 1.0;
37022 x2 = copysign (xa2, x);
37023 return x2;
37024 */
37025
37026 TWO52 = ix86_gen_TWO52 (mode);
37027
37028 /* Temporary for holding the result, initialized to the input
37029 operand to ease control flow. */
37030 res = gen_reg_rtx (mode);
37031 emit_move_insn (res, operand1);
37032
37033 /* xa = abs (operand1) */
37034 xa = ix86_expand_sse_fabs (res, &smask);
37035
37036 /* if (!isless (xa, TWO52)) goto label; */
37037 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37038
37039 /* res = xa + TWO52 - TWO52; */
37040 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37041 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37042 emit_move_insn (res, tmp);
37043
37044 /* generate 1.0 */
37045 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37046
37047 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37048 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37049 emit_insn (gen_rtx_SET (VOIDmode, mask,
37050 gen_rtx_AND (mode, mask, one)));
37051 tmp = expand_simple_binop (mode, MINUS,
37052 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
37053 emit_move_insn (res, tmp);
37054
37055 /* res = copysign (res, operand1) */
37056 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
37057
37058 emit_label (label);
37059 LABEL_NUSES (label) = 1;
37060
37061 emit_move_insn (operand0, res);
37062 }
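/* Editor's note: this is the variant of ix86_expand_trunc for targets
   where the DImode cvttsd2siq conversion is not available (see the
   analogous ix86_expand_rounddf_32 above).  |x| is rounded with the TWO52
   trick, and since that rounding is to nearest it may round the magnitude
   up, e.g. 2.7 -> 3.0; the (res > xa) compensation then subtracts 1.0,
   giving the truncated magnitude 2.0.  */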
37063
37064 /* Expand SSE sequence for computing round from OPERAND1 storing
37065 into OPERAND0. */
37066 void
37067 ix86_expand_round (rtx operand0, rtx operand1)
37068 {
37069 /* C code for the stuff we're doing below:
37070 double xa = fabs (x);
37071 if (!isless (xa, TWO52))
37072 return x;
37073 xa = (double)(long)(xa + nextafter (0.5, 0.0));
37074 return copysign (xa, x);
37075 */
37076 enum machine_mode mode = GET_MODE (operand0);
37077 rtx res, TWO52, xa, label, xi, half, mask;
37078 const struct real_format *fmt;
37079 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37080
37081 /* Temporary for holding the result, initialized to the input
37082 operand to ease control flow. */
37083 res = gen_reg_rtx (mode);
37084 emit_move_insn (res, operand1);
37085
37086 TWO52 = ix86_gen_TWO52 (mode);
37087 xa = ix86_expand_sse_fabs (res, &mask);
37088 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37089
37090 /* load nextafter (0.5, 0.0) */
37091 fmt = REAL_MODE_FORMAT (mode);
37092 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37093 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
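/* Editor's note: pred_half is the largest representable value below 0.5,
   i.e. 0.5 - 2**(-p-1) where p is the precision of MODE (0.5 - 2**-54 for
   DFmode).  Using it instead of an exact 0.5 keeps inputs just below 0.5
   from being rounded up: for xa = nextafter (0.5, 0.0) the sum xa + 0.5
   would round up to 1.0 and the truncation would return 1 instead of 0,
   whereas xa + pred_half stays below 1.0.  */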
37094
37095 /* xa = xa + 0.5 */
37096 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
37097 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
37098
37099 /* xa = (double)(int64_t)xa */
37100 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37101 expand_fix (xi, xa, 0);
37102 expand_float (xa, xi, 0);
37103
37104 /* res = copysign (xa, operand1) */
37105 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
37106
37107 emit_label (label);
37108 LABEL_NUSES (label) = 1;
37109
37110 emit_move_insn (operand0, res);
37111 }
37112
37113 /* Expand SSE sequence for computing round
37114 from OP1 storing into OP0 using sse4 round insn. */
37115 void
37116 ix86_expand_round_sse4 (rtx op0, rtx op1)
37117 {
37118 enum machine_mode mode = GET_MODE (op0);
37119 rtx e1, e2, res, half;
37120 const struct real_format *fmt;
37121 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37122 rtx (*gen_copysign) (rtx, rtx, rtx);
37123 rtx (*gen_round) (rtx, rtx, rtx);
37124
37125 switch (mode)
37126 {
37127 case SFmode:
37128 gen_copysign = gen_copysignsf3;
37129 gen_round = gen_sse4_1_roundsf2;
37130 break;
37131 case DFmode:
37132 gen_copysign = gen_copysigndf3;
37133 gen_round = gen_sse4_1_rounddf2;
37134 break;
37135 default:
37136 gcc_unreachable ();
37137 }
37138
37139 /* round (a) = trunc (a + copysign (0.5, a)) */
37140
37141 /* load nextafter (0.5, 0.0) */
37142 fmt = REAL_MODE_FORMAT (mode);
37143 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37144 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37145 half = const_double_from_real_value (pred_half, mode);
37146
37147 /* e1 = copysign (0.5, op1) */
37148 e1 = gen_reg_rtx (mode);
37149 emit_insn (gen_copysign (e1, half, op1));
37150
37151 /* e2 = op1 + e1 */
37152 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
37153
37154 /* res = trunc (e2) */
37155 res = gen_reg_rtx (mode);
37156 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
37157
37158 emit_move_insn (op0, res);
37159 }
37160 \f
37161
37162 /* Table of valid machine attributes. */
37163 static const struct attribute_spec ix86_attribute_table[] =
37164 {
37165 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
37166 affects_type_identity } */
37167 /* Stdcall attribute says callee is responsible for popping arguments
37168 if they are not variable. */
37169 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37170 true },
37171 /* Fastcall attribute says callee is responsible for popping arguments
37172 if they are not variable. */
37173 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37174 true },
37175 /* Thiscall attribute says callee is responsible for popping arguments
37176 if they are not variable. */
37177 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37178 true },
37179 /* Cdecl attribute says the callee is a normal C declaration. */
37180 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37181 true },
37182 /* Regparm attribute specifies how many integer arguments are to be
37183 passed in registers. */
37184 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
37185 true },
37186 /* Sseregparm attribute says we are using x86_64 calling conventions
37187 for FP arguments. */
37188 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37189 true },
37190 /* The transactional memory builtins are implicitly regparm or fastcall
37191 depending on the ABI. Override the generic do-nothing attribute that
37192 these builtins were declared with. */
37193 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
37194 true },
37195 /* force_align_arg_pointer says this function realigns the stack at entry. */
37196 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
37197 false, true, true, ix86_handle_cconv_attribute, false },
37198 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37199 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
37200 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
37201 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
37202 false },
37203 #endif
37204 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37205 false },
37206 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37207 false },
37208 #ifdef SUBTARGET_ATTRIBUTE_TABLE
37209 SUBTARGET_ATTRIBUTE_TABLE,
37210 #endif
37211 /* ms_abi and sysv_abi calling convention function attributes. */
37212 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37213 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37214 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
37215 false },
37216 { "callee_pop_aggregate_return", 1, 1, false, true, true,
37217 ix86_handle_callee_pop_aggregate_return, true },
37218 /* End element. */
37219 { NULL, 0, 0, false, false, false, NULL, false }
37220 };
37221
37222 /* Implement targetm.vectorize.builtin_vectorization_cost. */
37223 static int
37224 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
37225 tree vectype,
37226 int misalign ATTRIBUTE_UNUSED)
37227 {
37228 unsigned elements;
37229
37230 switch (type_of_cost)
37231 {
37232 case scalar_stmt:
37233 return ix86_cost->scalar_stmt_cost;
37234
37235 case scalar_load:
37236 return ix86_cost->scalar_load_cost;
37237
37238 case scalar_store:
37239 return ix86_cost->scalar_store_cost;
37240
37241 case vector_stmt:
37242 return ix86_cost->vec_stmt_cost;
37243
37244 case vector_load:
37245 return ix86_cost->vec_align_load_cost;
37246
37247 case vector_store:
37248 return ix86_cost->vec_store_cost;
37249
37250 case vec_to_scalar:
37251 return ix86_cost->vec_to_scalar_cost;
37252
37253 case scalar_to_vec:
37254 return ix86_cost->scalar_to_vec_cost;
37255
37256 case unaligned_load:
37257 case unaligned_store:
37258 return ix86_cost->vec_unalign_load_cost;
37259
37260 case cond_branch_taken:
37261 return ix86_cost->cond_taken_branch_cost;
37262
37263 case cond_branch_not_taken:
37264 return ix86_cost->cond_not_taken_branch_cost;
37265
37266 case vec_perm:
37267 case vec_promote_demote:
37268 return ix86_cost->vec_stmt_cost;
37269
37270 case vec_construct:
37271 elements = TYPE_VECTOR_SUBPARTS (vectype);
37272 return elements / 2 + 1;
37273
37274 default:
37275 gcc_unreachable ();
37276 }
37277 }
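/* Editor's note on the vec_construct case above: the cost of building a
   vector from scalar elements is approximated as elements / 2 + 1,
   presumably reflecting that N scalars can be combined pairwise with about
   N/2 interleave/insert operations plus a final insn; it is a heuristic
   rather than a per-CPU tuned value.  */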
37278
37279 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
37280 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
37281 insn every time. */
37282
37283 static GTY(()) rtx vselect_insn;
37284
37285 /* Initialize vselect_insn. */
37286
37287 static void
37288 init_vselect_insn (void)
37289 {
37290 unsigned i;
37291 rtx x;
37292
37293 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
37294 for (i = 0; i < MAX_VECT_LEN; ++i)
37295 XVECEXP (x, 0, i) = const0_rtx;
37296 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
37297 const0_rtx), x);
37298 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
37299 start_sequence ();
37300 vselect_insn = emit_insn (x);
37301 end_sequence ();
37302 }
37303
37304 /* Construct (set target (vec_select op0 (parallel perm))) and
37305 return true if that's a valid instruction in the active ISA. */
37306
37307 static bool
37308 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
37309 unsigned nelt, bool testing_p)
37310 {
37311 unsigned int i;
37312 rtx x, save_vconcat;
37313 int icode;
37314
37315 if (vselect_insn == NULL_RTX)
37316 init_vselect_insn ();
37317
37318 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
37319 PUT_NUM_ELEM (XVEC (x, 0), nelt);
37320 for (i = 0; i < nelt; ++i)
37321 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
37322 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37323 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
37324 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
37325 SET_DEST (PATTERN (vselect_insn)) = target;
37326 icode = recog_memoized (vselect_insn);
37327
37328 if (icode >= 0 && !testing_p)
37329 emit_insn (copy_rtx (PATTERN (vselect_insn)));
37330
37331 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
37332 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
37333 INSN_CODE (vselect_insn) = -1;
37334
37335 return icode >= 0;
37336 }
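/* Editor's note: expand_vselect above mutates the single cached
   vselect_insn in place (destination, operand, parallel contents and
   mode), asks recog_memoized whether the result matches a real pattern,
   and then restores the neutral const0_rtx placeholders and clears
   INSN_CODE so the cached insn can be reused for the next query without
   allocating fresh RTL.  */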
37337
37338 /* Similar, but generate a vec_concat from op0 and op1 as well. */
37339
37340 static bool
37341 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
37342 const unsigned char *perm, unsigned nelt,
37343 bool testing_p)
37344 {
37345 enum machine_mode v2mode;
37346 rtx x;
37347 bool ok;
37348
37349 if (vselect_insn == NULL_RTX)
37350 init_vselect_insn ();
37351
37352 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
37353 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37354 PUT_MODE (x, v2mode);
37355 XEXP (x, 0) = op0;
37356 XEXP (x, 1) = op1;
37357 ok = expand_vselect (target, x, perm, nelt, testing_p);
37358 XEXP (x, 0) = const0_rtx;
37359 XEXP (x, 1) = const0_rtx;
37360 return ok;
37361 }
37362
37363 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37364 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
37365
37366 static bool
37367 expand_vec_perm_blend (struct expand_vec_perm_d *d)
37368 {
37369 enum machine_mode vmode = d->vmode;
37370 unsigned i, mask, nelt = d->nelt;
37371 rtx target, op0, op1, x;
37372 rtx rperm[32], vperm;
37373
37374 if (d->one_operand_p)
37375 return false;
37376 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
37377 ;
37378 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
37379 ;
37380 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
37381 ;
37382 else
37383 return false;
37384
37385 /* This is a blend, not a permute. Elements must stay in their
37386 respective lanes. */
37387 for (i = 0; i < nelt; ++i)
37388 {
37389 unsigned e = d->perm[i];
37390 if (!(e == i || e == i + nelt))
37391 return false;
37392 }
37393
37394 if (d->testing_p)
37395 return true;
37396
37397 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
37398 decision should be extracted elsewhere, so that we only try that
37399 sequence once all budget==3 options have been tried. */
37400 target = d->target;
37401 op0 = d->op0;
37402 op1 = d->op1;
37403 mask = 0;
37404
37405 switch (vmode)
37406 {
37407 case V4DFmode:
37408 case V8SFmode:
37409 case V2DFmode:
37410 case V4SFmode:
37411 case V8HImode:
37412 case V8SImode:
37413 for (i = 0; i < nelt; ++i)
37414 mask |= (d->perm[i] >= nelt) << i;
37415 break;
37416
37417 case V2DImode:
37418 for (i = 0; i < 2; ++i)
37419 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
37420 vmode = V8HImode;
37421 goto do_subreg;
37422
37423 case V4SImode:
37424 for (i = 0; i < 4; ++i)
37425 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37426 vmode = V8HImode;
37427 goto do_subreg;
37428
37429 case V16QImode:
37430 /* See if bytes move in pairs so we can use pblendw with
37431 an immediate argument, rather than pblendvb with a vector
37432 argument. */
37433 for (i = 0; i < 16; i += 2)
37434 if (d->perm[i] + 1 != d->perm[i + 1])
37435 {
37436 use_pblendvb:
37437 for (i = 0; i < nelt; ++i)
37438 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
37439
37440 finish_pblendvb:
37441 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
37442 vperm = force_reg (vmode, vperm);
37443
37444 if (GET_MODE_SIZE (vmode) == 16)
37445 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
37446 else
37447 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
37448 return true;
37449 }
37450
37451 for (i = 0; i < 8; ++i)
37452 mask |= (d->perm[i * 2] >= 16) << i;
37453 vmode = V8HImode;
37454 /* FALLTHRU */
37455
37456 do_subreg:
37457 target = gen_lowpart (vmode, target);
37458 op0 = gen_lowpart (vmode, op0);
37459 op1 = gen_lowpart (vmode, op1);
37460 break;
37461
37462 case V32QImode:
37463 /* See if bytes move in pairs. If not, vpblendvb must be used. */
37464 for (i = 0; i < 32; i += 2)
37465 if (d->perm[i] + 1 != d->perm[i + 1])
37466 goto use_pblendvb;
37467 /* See if bytes move in quadruplets. If yes, vpblendd
37468 with immediate can be used. */
37469 for (i = 0; i < 32; i += 4)
37470 if (d->perm[i] + 2 != d->perm[i + 2])
37471 break;
37472 if (i < 32)
37473 {
37474 /* See if bytes move the same in both lanes. If yes,
37475 vpblendw with immediate can be used. */
37476 for (i = 0; i < 16; i += 2)
37477 if (d->perm[i] + 16 != d->perm[i + 16])
37478 goto use_pblendvb;
37479
37480 /* Use vpblendw. */
37481 for (i = 0; i < 16; ++i)
37482 mask |= (d->perm[i * 2] >= 32) << i;
37483 vmode = V16HImode;
37484 goto do_subreg;
37485 }
37486
37487 /* Use vpblendd. */
37488 for (i = 0; i < 8; ++i)
37489 mask |= (d->perm[i * 4] >= 32) << i;
37490 vmode = V8SImode;
37491 goto do_subreg;
37492
37493 case V16HImode:
37494 /* See if words move in pairs. If yes, vpblendd can be used. */
37495 for (i = 0; i < 16; i += 2)
37496 if (d->perm[i] + 1 != d->perm[i + 1])
37497 break;
37498 if (i < 16)
37499 {
37500 /* See if words move the same in both lanes. If not,
37501 vpblendvb must be used. */
37502 for (i = 0; i < 8; i++)
37503 if (d->perm[i] + 8 != d->perm[i + 8])
37504 {
37505 /* Use vpblendvb. */
37506 for (i = 0; i < 32; ++i)
37507 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
37508
37509 vmode = V32QImode;
37510 nelt = 32;
37511 target = gen_lowpart (vmode, target);
37512 op0 = gen_lowpart (vmode, op0);
37513 op1 = gen_lowpart (vmode, op1);
37514 goto finish_pblendvb;
37515 }
37516
37517 /* Use vpblendw. */
37518 for (i = 0; i < 16; ++i)
37519 mask |= (d->perm[i] >= 16) << i;
37520 break;
37521 }
37522
37523 /* Use vpblendd. */
37524 for (i = 0; i < 8; ++i)
37525 mask |= (d->perm[i * 2] >= 16) << i;
37526 vmode = V8SImode;
37527 goto do_subreg;
37528
37529 case V4DImode:
37530 /* Use vpblendd. */
37531 for (i = 0; i < 4; ++i)
37532 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37533 vmode = V8SImode;
37534 goto do_subreg;
37535
37536 default:
37537 gcc_unreachable ();
37538 }
37539
37540 /* This matches five different patterns with the different modes. */
37541 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
37542 x = gen_rtx_SET (VOIDmode, target, x);
37543 emit_insn (x);
37544
37545 return true;
37546 }
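/* Editor's illustration of the mask construction above: for a V4SImode
   blend with perm = { 0, 5, 6, 3 }, elements 1 and 2 come from op1, so the
   V8HImode pblendw path sets two bits per dword and produces
   mask = (3 << 2) | (3 << 4) = 0x3c.  */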
37547
37548 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37549 in terms of the variable form of vpermilps.
37550
37551 Note that we will have already failed the immediate input vpermilps,
37552 which requires that the high and low part shuffle be identical; the
37553 variable form doesn't require that. */
37554
37555 static bool
37556 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
37557 {
37558 rtx rperm[8], vperm;
37559 unsigned i;
37560
37561 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
37562 return false;
37563
37564 /* We can only permute within each 128-bit lane. */
37565 for (i = 0; i < 8; ++i)
37566 {
37567 unsigned e = d->perm[i];
37568 if (i < 4 ? e >= 4 : e < 4)
37569 return false;
37570 }
37571
37572 if (d->testing_p)
37573 return true;
37574
37575 for (i = 0; i < 8; ++i)
37576 {
37577 unsigned e = d->perm[i];
37578
37579 /* Within each 128-bit lane, the elements of op0 are numbered
37580 from 0 and the elements of op1 are numbered from 4. */
37581 if (e >= 8 + 4)
37582 e -= 8;
37583 else if (e >= 4)
37584 e -= 4;
37585
37586 rperm[i] = GEN_INT (e);
37587 }
37588
37589 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
37590 vperm = force_reg (V8SImode, vperm);
37591 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
37592
37593 return true;
37594 }
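/* Editor's note: the variable form of vpermilps selects, for each result
   element, one of the four elements of the *same* 128-bit lane of the
   source, using the low two bits of the corresponding dword of the control
   vector; that is why the indices are reduced to 0..3 above and why
   cross-lane permutations were rejected earlier.  */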
37595
37596 /* Return true if permutation D can be performed as VMODE permutation
37597 instead. */
37598
37599 static bool
37600 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
37601 {
37602 unsigned int i, j, chunk;
37603
37604 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
37605 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
37606 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
37607 return false;
37608
37609 if (GET_MODE_NUNITS (vmode) >= d->nelt)
37610 return true;
37611
37612 chunk = d->nelt / GET_MODE_NUNITS (vmode);
37613 for (i = 0; i < d->nelt; i += chunk)
37614 if (d->perm[i] & (chunk - 1))
37615 return false;
37616 else
37617 for (j = 1; j < chunk; ++j)
37618 if (d->perm[i] + j != d->perm[i + j])
37619 return false;
37620
37621 return true;
37622 }
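/* Editor's example for the check above: a V16QImode permutation such as
   { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 } moves aligned groups of
   chunk = 4 consecutive bytes, so it can equally be performed as the
   V4SImode permutation { 1, 0, 3, 2 }; a permutation that splits such a
   group is rejected.  */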
37623
37624 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37625 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
37626
37627 static bool
37628 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
37629 {
37630 unsigned i, nelt, eltsz, mask;
37631 unsigned char perm[32];
37632 enum machine_mode vmode = V16QImode;
37633 rtx rperm[32], vperm, target, op0, op1;
37634
37635 nelt = d->nelt;
37636
37637 if (!d->one_operand_p)
37638 {
37639 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
37640 {
37641 if (TARGET_AVX2
37642 && valid_perm_using_mode_p (V2TImode, d))
37643 {
37644 if (d->testing_p)
37645 return true;
37646
37647 /* Use vperm2i128 insn. The pattern uses
37648 V4DImode instead of V2TImode. */
37649 target = gen_lowpart (V4DImode, d->target);
37650 op0 = gen_lowpart (V4DImode, d->op0);
37651 op1 = gen_lowpart (V4DImode, d->op1);
37652 rperm[0]
37653 = GEN_INT ((d->perm[0] / (nelt / 2))
37654 | ((d->perm[nelt / 2] / (nelt / 2)) << 4));
37655 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
37656 return true;
37657 }
37658 return false;
37659 }
37660 }
37661 else
37662 {
37663 if (GET_MODE_SIZE (d->vmode) == 16)
37664 {
37665 if (!TARGET_SSSE3)
37666 return false;
37667 }
37668 else if (GET_MODE_SIZE (d->vmode) == 32)
37669 {
37670 if (!TARGET_AVX2)
37671 return false;
37672
37673 /* V4DImode should be already handled through
37674 expand_vselect by vpermq instruction. */
37675 gcc_assert (d->vmode != V4DImode);
37676
37677 vmode = V32QImode;
37678 if (d->vmode == V8SImode
37679 || d->vmode == V16HImode
37680 || d->vmode == V32QImode)
37681 {
37682 /* First see if vpermq can be used for
37683 V8SImode/V16HImode/V32QImode. */
37684 if (valid_perm_using_mode_p (V4DImode, d))
37685 {
37686 for (i = 0; i < 4; i++)
37687 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
37688 if (d->testing_p)
37689 return true;
37690 return expand_vselect (gen_lowpart (V4DImode, d->target),
37691 gen_lowpart (V4DImode, d->op0),
37692 perm, 4, false);
37693 }
37694
37695 /* Next see if vpermd can be used. */
37696 if (valid_perm_using_mode_p (V8SImode, d))
37697 vmode = V8SImode;
37698 }
37699 /* Or if vpermps can be used. */
37700 else if (d->vmode == V8SFmode)
37701 vmode = V8SImode;
37702
37703 if (vmode == V32QImode)
37704 {
37705 /* vpshufb only works within each 128-bit lane; it is not
37706 possible to shuffle bytes across lanes. */
37707 for (i = 0; i < nelt; ++i)
37708 if ((d->perm[i] ^ i) & (nelt / 2))
37709 return false;
37710 }
37711 }
37712 else
37713 return false;
37714 }
37715
37716 if (d->testing_p)
37717 return true;
37718
37719 if (vmode == V8SImode)
37720 for (i = 0; i < 8; ++i)
37721 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
37722 else
37723 {
37724 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37725 if (!d->one_operand_p)
37726 mask = 2 * nelt - 1;
37727 else if (vmode == V16QImode)
37728 mask = nelt - 1;
37729 else
37730 mask = nelt / 2 - 1;
37731
37732 for (i = 0; i < nelt; ++i)
37733 {
37734 unsigned j, e = d->perm[i] & mask;
37735 for (j = 0; j < eltsz; ++j)
37736 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
37737 }
37738 }
37739
37740 vperm = gen_rtx_CONST_VECTOR (vmode,
37741 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
37742 vperm = force_reg (vmode, vperm);
37743
37744 target = gen_lowpart (vmode, d->target);
37745 op0 = gen_lowpart (vmode, d->op0);
37746 if (d->one_operand_p)
37747 {
37748 if (vmode == V16QImode)
37749 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
37750 else if (vmode == V32QImode)
37751 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
37752 else if (vmode == V8SFmode)
37753 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
37754 else
37755 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
37756 }
37757 else
37758 {
37759 op1 = gen_lowpart (vmode, d->op1);
37760 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
37761 }
37762
37763 return true;
37764 }
37765
37766 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
37767 in a single instruction. */
37768
37769 static bool
37770 expand_vec_perm_1 (struct expand_vec_perm_d *d)
37771 {
37772 unsigned i, nelt = d->nelt;
37773 unsigned char perm2[MAX_VECT_LEN];
37774
37775 /* Check plain VEC_SELECT first, because AVX has instructions that could
37776 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
37777 input where SEL+CONCAT may not. */
37778 if (d->one_operand_p)
37779 {
37780 int mask = nelt - 1;
37781 bool identity_perm = true;
37782 bool broadcast_perm = true;
37783
37784 for (i = 0; i < nelt; i++)
37785 {
37786 perm2[i] = d->perm[i] & mask;
37787 if (perm2[i] != i)
37788 identity_perm = false;
37789 if (perm2[i])
37790 broadcast_perm = false;
37791 }
37792
37793 if (identity_perm)
37794 {
37795 if (!d->testing_p)
37796 emit_move_insn (d->target, d->op0);
37797 return true;
37798 }
37799 else if (broadcast_perm && TARGET_AVX2)
37800 {
37801 /* Use vpbroadcast{b,w,d}. */
37802 rtx (*gen) (rtx, rtx) = NULL;
37803 switch (d->vmode)
37804 {
37805 case V32QImode:
37806 gen = gen_avx2_pbroadcastv32qi_1;
37807 break;
37808 case V16HImode:
37809 gen = gen_avx2_pbroadcastv16hi_1;
37810 break;
37811 case V8SImode:
37812 gen = gen_avx2_pbroadcastv8si_1;
37813 break;
37814 case V16QImode:
37815 gen = gen_avx2_pbroadcastv16qi;
37816 break;
37817 case V8HImode:
37818 gen = gen_avx2_pbroadcastv8hi;
37819 break;
37820 case V8SFmode:
37821 gen = gen_avx2_vec_dupv8sf_1;
37822 break;
37823 /* For other modes prefer other shuffles this function creates. */
37824 default: break;
37825 }
37826 if (gen != NULL)
37827 {
37828 if (!d->testing_p)
37829 emit_insn (gen (d->target, d->op0));
37830 return true;
37831 }
37832 }
37833
37834 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
37835 return true;
37836
37837 /* There are plenty of patterns in sse.md that are written for
37838 SEL+CONCAT and are not replicated for a single op. Perhaps
37839 that should be changed, to avoid the nastiness here. */
37840
37841 /* Recognize interleave style patterns, which means incrementing
37842 every other permutation operand. */
37843 for (i = 0; i < nelt; i += 2)
37844 {
37845 perm2[i] = d->perm[i] & mask;
37846 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
37847 }
37848 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37849 d->testing_p))
37850 return true;
37851
37852 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
37853 if (nelt >= 4)
37854 {
37855 for (i = 0; i < nelt; i += 4)
37856 {
37857 perm2[i + 0] = d->perm[i + 0] & mask;
37858 perm2[i + 1] = d->perm[i + 1] & mask;
37859 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
37860 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
37861 }
37862
37863 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37864 d->testing_p))
37865 return true;
37866 }
37867 }
37868
37869 /* Finally, try the fully general two operand permute. */
37870 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
37871 d->testing_p))
37872 return true;
37873
37874 /* Recognize interleave style patterns with reversed operands. */
37875 if (!d->one_operand_p)
37876 {
37877 for (i = 0; i < nelt; ++i)
37878 {
37879 unsigned e = d->perm[i];
37880 if (e >= nelt)
37881 e -= nelt;
37882 else
37883 e += nelt;
37884 perm2[i] = e;
37885 }
37886
37887 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
37888 d->testing_p))
37889 return true;
37890 }
37891
37892 /* Try the SSE4.1 blend variable merge instructions. */
37893 if (expand_vec_perm_blend (d))
37894 return true;
37895
37896 /* Try one of the AVX vpermil variable permutations. */
37897 if (expand_vec_perm_vpermil (d))
37898 return true;
37899
37900 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
37901 vpshufb, vpermd, vpermps or vpermq variable permutation. */
37902 if (expand_vec_perm_pshufb (d))
37903 return true;
37904
37905 return false;
37906 }
37907
37908 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37909 in terms of a pair of pshuflw + pshufhw instructions. */
37910
37911 static bool
37912 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
37913 {
37914 unsigned char perm2[MAX_VECT_LEN];
37915 unsigned i;
37916 bool ok;
37917
37918 if (d->vmode != V8HImode || !d->one_operand_p)
37919 return false;
37920
37921 /* The two permutations only operate in 64-bit lanes. */
37922 for (i = 0; i < 4; ++i)
37923 if (d->perm[i] >= 4)
37924 return false;
37925 for (i = 4; i < 8; ++i)
37926 if (d->perm[i] < 4)
37927 return false;
37928
37929 if (d->testing_p)
37930 return true;
37931
37932 /* Emit the pshuflw. */
37933 memcpy (perm2, d->perm, 4);
37934 for (i = 4; i < 8; ++i)
37935 perm2[i] = i;
37936 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
37937 gcc_assert (ok);
37938
37939 /* Emit the pshufhw. */
37940 memcpy (perm2 + 4, d->perm + 4, 4);
37941 for (i = 0; i < 4; ++i)
37942 perm2[i] = i;
37943 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
37944 gcc_assert (ok);
37945
37946 return true;
37947 }
37948
37949 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37950 the permutation using the SSSE3 palignr instruction. This succeeds
37951 when all of the elements in PERM fit within one vector and we merely
37952 need to shift them down so that a single vector permutation has a
37953 chance to succeed. */
37954
37955 static bool
37956 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
37957 {
37958 unsigned i, nelt = d->nelt;
37959 unsigned min, max;
37960 bool in_order, ok;
37961 rtx shift;
37962
37963 /* Even with AVX, palignr only operates on 128-bit vectors. */
37964 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37965 return false;
37966
37967 min = nelt, max = 0;
37968 for (i = 0; i < nelt; ++i)
37969 {
37970 unsigned e = d->perm[i];
37971 if (e < min)
37972 min = e;
37973 if (e > max)
37974 max = e;
37975 }
37976 if (min == 0 || max - min >= nelt)
37977 return false;
37978
37979 /* Given that we have SSSE3, we know we'll be able to implement the
37980 single operand permutation after the palignr with pshufb. */
37981 if (d->testing_p)
37982 return true;
37983
37984 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
37985 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
37986 gen_lowpart (TImode, d->op1),
37987 gen_lowpart (TImode, d->op0), shift));
37988
37989 d->op0 = d->op1 = d->target;
37990 d->one_operand_p = true;
37991
37992 in_order = true;
37993 for (i = 0; i < nelt; ++i)
37994 {
37995 unsigned e = d->perm[i] - min;
37996 if (e != i)
37997 in_order = false;
37998 d->perm[i] = e;
37999 }
38000
38001 /* Test for the degenerate case where the alignment by itself
38002 produces the desired permutation. */
38003 if (in_order)
38004 return true;
38005
38006 ok = expand_vec_perm_1 (d);
38007 gcc_assert (ok);
38008
38009 return ok;
38010 }
38011
38012 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38013
38014 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38015 a two vector permutation into a single vector permutation by using
38016 an interleave operation to merge the vectors. */
38017
38018 static bool
38019 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38020 {
38021 struct expand_vec_perm_d dremap, dfinal;
38022 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38023 unsigned HOST_WIDE_INT contents;
38024 unsigned char remap[2 * MAX_VECT_LEN];
38025 rtx seq;
38026 bool ok, same_halves = false;
38027
38028 if (GET_MODE_SIZE (d->vmode) == 16)
38029 {
38030 if (d->one_operand_p)
38031 return false;
38032 }
38033 else if (GET_MODE_SIZE (d->vmode) == 32)
38034 {
38035 if (!TARGET_AVX)
38036 return false;
38037 /* For 32-byte modes allow even d->one_operand_p.
38038 The lack of cross-lane shuffling in some instructions
38039 might prevent a single insn shuffle. */
38040 dfinal = *d;
38041 dfinal.testing_p = true;
38042 /* If expand_vec_perm_interleave3 can expand this into
38043 a 3 insn sequence, give up and let it be expanded as
38044 3 insn sequence. While that is one insn longer,
38045 it doesn't need a memory operand and in the common
38046 case that both interleave low and high permutations
38047 with the same operands are adjacent needs 4 insns
38048 for both after CSE. */
38049 if (expand_vec_perm_interleave3 (&dfinal))
38050 return false;
38051 }
38052 else
38053 return false;
38054
38055 /* Examine from whence the elements come. */
38056 contents = 0;
38057 for (i = 0; i < nelt; ++i)
38058 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
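/* Editor's note: bit E of CONTENTS is now set iff some result element is
   taken from element E of the double-width concatenation of op0 and op1;
   the half/quarter masks built below test which halves of the two inputs
   are actually referenced.  */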
38059
38060 memset (remap, 0xff, sizeof (remap));
38061 dremap = *d;
38062
38063 if (GET_MODE_SIZE (d->vmode) == 16)
38064 {
38065 unsigned HOST_WIDE_INT h1, h2, h3, h4;
38066
38067 /* Split the two input vectors into 4 halves. */
38068 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
38069 h2 = h1 << nelt2;
38070 h3 = h2 << nelt2;
38071 h4 = h3 << nelt2;
38072
38073 /* If all the elements come from the low halves, use interleave low; similarly
38074 use interleave high for the high halves. If the elements are from
38075 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
38076 if ((contents & (h1 | h3)) == contents)
38077 {
38078 /* punpckl* */
38079 for (i = 0; i < nelt2; ++i)
38080 {
38081 remap[i] = i * 2;
38082 remap[i + nelt] = i * 2 + 1;
38083 dremap.perm[i * 2] = i;
38084 dremap.perm[i * 2 + 1] = i + nelt;
38085 }
38086 if (!TARGET_SSE2 && d->vmode == V4SImode)
38087 dremap.vmode = V4SFmode;
38088 }
38089 else if ((contents & (h2 | h4)) == contents)
38090 {
38091 /* punpckh* */
38092 for (i = 0; i < nelt2; ++i)
38093 {
38094 remap[i + nelt2] = i * 2;
38095 remap[i + nelt + nelt2] = i * 2 + 1;
38096 dremap.perm[i * 2] = i + nelt2;
38097 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
38098 }
38099 if (!TARGET_SSE2 && d->vmode == V4SImode)
38100 dremap.vmode = V4SFmode;
38101 }
38102 else if ((contents & (h1 | h4)) == contents)
38103 {
38104 /* shufps */
38105 for (i = 0; i < nelt2; ++i)
38106 {
38107 remap[i] = i;
38108 remap[i + nelt + nelt2] = i + nelt2;
38109 dremap.perm[i] = i;
38110 dremap.perm[i + nelt2] = i + nelt + nelt2;
38111 }
38112 if (nelt != 4)
38113 {
38114 /* shufpd */
38115 dremap.vmode = V2DImode;
38116 dremap.nelt = 2;
38117 dremap.perm[0] = 0;
38118 dremap.perm[1] = 3;
38119 }
38120 }
38121 else if ((contents & (h2 | h3)) == contents)
38122 {
38123 /* shufps */
38124 for (i = 0; i < nelt2; ++i)
38125 {
38126 remap[i + nelt2] = i;
38127 remap[i + nelt] = i + nelt2;
38128 dremap.perm[i] = i + nelt2;
38129 dremap.perm[i + nelt2] = i + nelt;
38130 }
38131 if (nelt != 4)
38132 {
38133 /* shufpd */
38134 dremap.vmode = V2DImode;
38135 dremap.nelt = 2;
38136 dremap.perm[0] = 1;
38137 dremap.perm[1] = 2;
38138 }
38139 }
38140 else
38141 return false;
38142 }
38143 else
38144 {
38145 unsigned int nelt4 = nelt / 4, nzcnt = 0;
38146 unsigned HOST_WIDE_INT q[8];
38147 unsigned int nonzero_halves[4];
38148
38149 /* Split the two input vectors into 8 quarters. */
38150 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
38151 for (i = 1; i < 8; ++i)
38152 q[i] = q[0] << (nelt4 * i);
38153 for (i = 0; i < 4; ++i)
38154 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
38155 {
38156 nonzero_halves[nzcnt] = i;
38157 ++nzcnt;
38158 }
38159
38160 if (nzcnt == 1)
38161 {
38162 gcc_assert (d->one_operand_p);
38163 nonzero_halves[1] = nonzero_halves[0];
38164 same_halves = true;
38165 }
38166 else if (d->one_operand_p)
38167 {
38168 gcc_assert (nonzero_halves[0] == 0);
38169 gcc_assert (nonzero_halves[1] == 1);
38170 }
38171
38172 if (nzcnt <= 2)
38173 {
38174 if (d->perm[0] / nelt2 == nonzero_halves[1])
38175 {
38176 /* Attempt to increase the likelihood that dfinal
38177 shuffle will be intra-lane. */
38178 char tmph = nonzero_halves[0];
38179 nonzero_halves[0] = nonzero_halves[1];
38180 nonzero_halves[1] = tmph;
38181 }
38182
38183 /* vperm2f128 or vperm2i128. */
38184 for (i = 0; i < nelt2; ++i)
38185 {
38186 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
38187 remap[i + nonzero_halves[0] * nelt2] = i;
38188 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
38189 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
38190 }
38191
38192 if (d->vmode != V8SFmode
38193 && d->vmode != V4DFmode
38194 && d->vmode != V8SImode)
38195 {
38196 dremap.vmode = V8SImode;
38197 dremap.nelt = 8;
38198 for (i = 0; i < 4; ++i)
38199 {
38200 dremap.perm[i] = i + nonzero_halves[0] * 4;
38201 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
38202 }
38203 }
38204 }
38205 else if (d->one_operand_p)
38206 return false;
38207 else if (TARGET_AVX2
38208 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
38209 {
38210 /* vpunpckl* */
38211 for (i = 0; i < nelt4; ++i)
38212 {
38213 remap[i] = i * 2;
38214 remap[i + nelt] = i * 2 + 1;
38215 remap[i + nelt2] = i * 2 + nelt2;
38216 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
38217 dremap.perm[i * 2] = i;
38218 dremap.perm[i * 2 + 1] = i + nelt;
38219 dremap.perm[i * 2 + nelt2] = i + nelt2;
38220 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
38221 }
38222 }
38223 else if (TARGET_AVX2
38224 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
38225 {
38226 /* vpunpckh* */
38227 for (i = 0; i < nelt4; ++i)
38228 {
38229 remap[i + nelt4] = i * 2;
38230 remap[i + nelt + nelt4] = i * 2 + 1;
38231 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
38232 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
38233 dremap.perm[i * 2] = i + nelt4;
38234 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
38235 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
38236 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
38237 }
38238 }
38239 else
38240 return false;
38241 }
38242
38243 /* Use the remapping array set up above to move the elements from their
38244 swizzled locations into their final destinations. */
38245 dfinal = *d;
38246 for (i = 0; i < nelt; ++i)
38247 {
38248 unsigned e = remap[d->perm[i]];
38249 gcc_assert (e < nelt);
38250 /* If same_halves is true, both halves of the remapped vector are the
38251 same. Avoid cross-lane accesses if possible. */
38252 if (same_halves && i >= nelt2)
38253 {
38254 gcc_assert (e < nelt2);
38255 dfinal.perm[i] = e + nelt2;
38256 }
38257 else
38258 dfinal.perm[i] = e;
38259 }
38260 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
38261 dfinal.op1 = dfinal.op0;
38262 dfinal.one_operand_p = true;
38263 dremap.target = dfinal.op0;
38264
38265 /* Test if the final remap can be done with a single insn. For V4SFmode or
38266 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
38267 start_sequence ();
38268 ok = expand_vec_perm_1 (&dfinal);
38269 seq = get_insns ();
38270 end_sequence ();
38271
38272 if (!ok)
38273 return false;
38274
38275 if (d->testing_p)
38276 return true;
38277
38278 if (dremap.vmode != dfinal.vmode)
38279 {
38280 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
38281 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
38282 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
38283 }
38284
38285 ok = expand_vec_perm_1 (&dremap);
38286 gcc_assert (ok);
38287
38288 emit_insn (seq);
38289 return true;
38290 }
38291
38292 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38293 a single vector cross-lane permutation into vpermq followed
38294 by any of the single insn permutations. */
38295
38296 static bool
38297 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
38298 {
38299 struct expand_vec_perm_d dremap, dfinal;
38300 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
38301 unsigned contents[2];
38302 bool ok;
38303
38304 if (!(TARGET_AVX2
38305 && (d->vmode == V32QImode || d->vmode == V16HImode)
38306 && d->one_operand_p))
38307 return false;
38308
38309 contents[0] = 0;
38310 contents[1] = 0;
38311 for (i = 0; i < nelt2; ++i)
38312 {
38313 contents[0] |= 1u << (d->perm[i] / nelt4);
38314 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
38315 }
38316
38317 for (i = 0; i < 2; ++i)
38318 {
38319 unsigned int cnt = 0;
38320 for (j = 0; j < 4; ++j)
38321 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
38322 return false;
38323 }
38324
38325 if (d->testing_p)
38326 return true;
38327
38328 dremap = *d;
38329 dremap.vmode = V4DImode;
38330 dremap.nelt = 4;
38331 dremap.target = gen_reg_rtx (V4DImode);
38332 dremap.op0 = gen_lowpart (V4DImode, d->op0);
38333 dremap.op1 = dremap.op0;
38334 dremap.one_operand_p = true;
38335 for (i = 0; i < 2; ++i)
38336 {
38337 unsigned int cnt = 0;
38338 for (j = 0; j < 4; ++j)
38339 if ((contents[i] & (1u << j)) != 0)
38340 dremap.perm[2 * i + cnt++] = j;
38341 for (; cnt < 2; ++cnt)
38342 dremap.perm[2 * i + cnt] = 0;
38343 }
38344
38345 dfinal = *d;
38346 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
38347 dfinal.op1 = dfinal.op0;
38348 dfinal.one_operand_p = true;
38349 for (i = 0, j = 0; i < nelt; ++i)
38350 {
38351 if (i == nelt2)
38352 j = 2;
38353 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
38354 if ((d->perm[i] / nelt4) == dremap.perm[j])
38355 ;
38356 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
38357 dfinal.perm[i] |= nelt4;
38358 else
38359 gcc_unreachable ();
38360 }
38361
38362 ok = expand_vec_perm_1 (&dremap);
38363 gcc_assert (ok);
38364
38365 ok = expand_vec_perm_1 (&dfinal);
38366 gcc_assert (ok);
38367
38368 return true;
38369 }
38370
38371 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
38372 a vector permutation using two instructions, vperm2f128 resp.
38373 vperm2i128 followed by any single in-lane permutation. */
38374
38375 static bool
38376 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
38377 {
38378 struct expand_vec_perm_d dfirst, dsecond;
38379 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
38380 bool ok;
38381
38382 if (!TARGET_AVX
38383 || GET_MODE_SIZE (d->vmode) != 32
38384 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
38385 return false;
38386
38387 dsecond = *d;
38388 dsecond.one_operand_p = false;
38389 dsecond.testing_p = true;
38390
38391 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
38392 immediate. For perm < 16 the second permutation uses
38393 d->op0 as first operand, for perm >= 16 it uses d->op1
38394 as first operand. The second operand is the result of
38395 vperm2[fi]128. */
38396 for (perm = 0; perm < 32; perm++)
38397 {
38398 /* Ignore permutations which do not move anything cross-lane. */
38399 if (perm < 16)
38400 {
38401 /* The second shuffle for e.g. V4DFmode has
38402 0123 and ABCD operands.
38403 Ignore AB23, as 23 is already in the second lane
38404 of the first operand. */
38405 if ((perm & 0xc) == (1 << 2)) continue;
38406 /* And 01CD, as 01 is in the first lane of the first
38407 operand. */
38408 if ((perm & 3) == 0) continue;
38409 /* And 4567, as then the vperm2[fi]128 doesn't change
38410 anything on the original 4567 second operand. */
38411 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
38412 }
38413 else
38414 {
38415 /* The second shuffle for e.g. V4DFmode has
38416 4567 and ABCD operands.
38417 Ignore AB67, as 67 is already in the second lane
38418 of the first operand. */
38419 if ((perm & 0xc) == (3 << 2)) continue;
38420 /* And 45CD, as 45 is in the first lane of the first
38421 operand. */
38422 if ((perm & 3) == 2) continue;
38423 /* And 0123, as then the vperm2[fi]128 doesn't change
38424 anything on the original 0123 first operand. */
38425 if ((perm & 0xf) == (1 << 2)) continue;
38426 }
38427
38428 for (i = 0; i < nelt; i++)
38429 {
38430 j = d->perm[i] / nelt2;
38431 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
38432 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
38433 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
38434 dsecond.perm[i] = d->perm[i] & (nelt - 1);
38435 else
38436 break;
38437 }
38438
38439 if (i == nelt)
38440 {
38441 start_sequence ();
38442 ok = expand_vec_perm_1 (&dsecond);
38443 end_sequence ();
38444 }
38445 else
38446 ok = false;
38447
38448 if (ok)
38449 {
38450 if (d->testing_p)
38451 return true;
38452
38453 /* Found a usable second shuffle. dfirst will be
38454 vperm2f128 on d->op0 and d->op1. */
38455 dsecond.testing_p = false;
38456 dfirst = *d;
38457 dfirst.target = gen_reg_rtx (d->vmode);
38458 for (i = 0; i < nelt; i++)
38459 dfirst.perm[i] = (i & (nelt2 - 1))
38460 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
38461
38462 ok = expand_vec_perm_1 (&dfirst);
38463 gcc_assert (ok);
38464
38465 /* And dsecond is some single insn shuffle, taking
38466 d->op0 and result of vperm2f128 (if perm < 16) or
38467 d->op1 and result of vperm2f128 (otherwise). */
38468 dsecond.op1 = dfirst.target;
38469 if (perm >= 16)
38470 dsecond.op0 = dfirst.op1;
38471
38472 ok = expand_vec_perm_1 (&dsecond);
38473 gcc_assert (ok);
38474
38475 return true;
38476 }
38477
38478 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
38479 if (d->one_operand_p)
38480 return false;
38481 }
38482
38483 return false;
38484 }
38485
38486 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38487 a two vector permutation using 2 intra-lane interleave insns
38488 and cross-lane shuffle for 32-byte vectors. */
38489
38490 static bool
38491 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
38492 {
38493 unsigned i, nelt;
38494 rtx (*gen) (rtx, rtx, rtx);
38495
38496 if (d->one_operand_p)
38497 return false;
38498 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
38499 ;
38500 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
38501 ;
38502 else
38503 return false;
38504
38505 nelt = d->nelt;
38506 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
38507 return false;
38508 for (i = 0; i < nelt; i += 2)
38509 if (d->perm[i] != d->perm[0] + i / 2
38510 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
38511 return false;
38512
38513 if (d->testing_p)
38514 return true;
38515
38516 switch (d->vmode)
38517 {
38518 case V32QImode:
38519 if (d->perm[0])
38520 gen = gen_vec_interleave_highv32qi;
38521 else
38522 gen = gen_vec_interleave_lowv32qi;
38523 break;
38524 case V16HImode:
38525 if (d->perm[0])
38526 gen = gen_vec_interleave_highv16hi;
38527 else
38528 gen = gen_vec_interleave_lowv16hi;
38529 break;
38530 case V8SImode:
38531 if (d->perm[0])
38532 gen = gen_vec_interleave_highv8si;
38533 else
38534 gen = gen_vec_interleave_lowv8si;
38535 break;
38536 case V4DImode:
38537 if (d->perm[0])
38538 gen = gen_vec_interleave_highv4di;
38539 else
38540 gen = gen_vec_interleave_lowv4di;
38541 break;
38542 case V8SFmode:
38543 if (d->perm[0])
38544 gen = gen_vec_interleave_highv8sf;
38545 else
38546 gen = gen_vec_interleave_lowv8sf;
38547 break;
38548 case V4DFmode:
38549 if (d->perm[0])
38550 gen = gen_vec_interleave_highv4df;
38551 else
38552 gen = gen_vec_interleave_lowv4df;
38553 break;
38554 default:
38555 gcc_unreachable ();
38556 }
38557
38558 emit_insn (gen (d->target, d->op0, d->op1));
38559 return true;
38560 }
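/* Editor's note: the checks above accept exactly the permutations
   { k, k + nelt, k + 1, k + 1 + nelt, ... } with k = 0 or k = nelt / 2,
   i.e. a full-width interleave-low or interleave-high of the two operands;
   for 32-byte modes the named expanders used here are what provide the
   cross-lane handling mentioned in the function comment.  */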
38561
38562 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
38563 a single vector permutation using a single intra-lane vector
38564 permutation, vperm2f128 swapping the lanes and vblend* insn blending
38565 the non-swapped and swapped vectors together. */
38566
38567 static bool
38568 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
38569 {
38570 struct expand_vec_perm_d dfirst, dsecond;
38571 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
38572 rtx seq;
38573 bool ok;
38574 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
38575
38576 if (!TARGET_AVX
38577 || TARGET_AVX2
38578 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
38579 || !d->one_operand_p)
38580 return false;
38581
38582 dfirst = *d;
38583 for (i = 0; i < nelt; i++)
38584 dfirst.perm[i] = 0xff;
38585 for (i = 0, msk = 0; i < nelt; i++)
38586 {
38587 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
38588 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
38589 return false;
38590 dfirst.perm[j] = d->perm[i];
38591 if (j != i)
38592 msk |= (1 << i);
38593 }
38594 for (i = 0; i < nelt; i++)
38595 if (dfirst.perm[i] == 0xff)
38596 dfirst.perm[i] = i;
38597
38598 if (!d->testing_p)
38599 dfirst.target = gen_reg_rtx (dfirst.vmode);
38600
38601 start_sequence ();
38602 ok = expand_vec_perm_1 (&dfirst);
38603 seq = get_insns ();
38604 end_sequence ();
38605
38606 if (!ok)
38607 return false;
38608
38609 if (d->testing_p)
38610 return true;
38611
38612 emit_insn (seq);
38613
38614 dsecond = *d;
38615 dsecond.op0 = dfirst.target;
38616 dsecond.op1 = dfirst.target;
38617 dsecond.one_operand_p = true;
38618 dsecond.target = gen_reg_rtx (dsecond.vmode);
38619 for (i = 0; i < nelt; i++)
38620 dsecond.perm[i] = i ^ nelt2;
38621
38622 ok = expand_vec_perm_1 (&dsecond);
38623 gcc_assert (ok);
38624
38625 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
38626 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
38627 return true;
38628 }
38629
38630 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
38631 permutation using two vperm2f128, followed by a vshufpd insn blending
38632 the two vectors together. */
38633
38634 static bool
38635 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
38636 {
38637 struct expand_vec_perm_d dfirst, dsecond, dthird;
38638 bool ok;
38639
38640 if (!TARGET_AVX || (d->vmode != V4DFmode))
38641 return false;
38642
38643 if (d->testing_p)
38644 return true;
38645
38646 dfirst = *d;
38647 dsecond = *d;
38648 dthird = *d;
38649
38650 dfirst.perm[0] = (d->perm[0] & ~1);
38651 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
38652 dfirst.perm[2] = (d->perm[2] & ~1);
38653 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
38654 dsecond.perm[0] = (d->perm[1] & ~1);
38655 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
38656 dsecond.perm[2] = (d->perm[3] & ~1);
38657 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
38658 dthird.perm[0] = (d->perm[0] % 2);
38659 dthird.perm[1] = (d->perm[1] % 2) + 4;
38660 dthird.perm[2] = (d->perm[2] % 2) + 2;
38661 dthird.perm[3] = (d->perm[3] % 2) + 6;
38662
38663 dfirst.target = gen_reg_rtx (dfirst.vmode);
38664 dsecond.target = gen_reg_rtx (dsecond.vmode);
38665 dthird.op0 = dfirst.target;
38666 dthird.op1 = dsecond.target;
38667 dthird.one_operand_p = false;
38668
38669 canonicalize_perm (&dfirst);
38670 canonicalize_perm (&dsecond);
38671
38672 ok = expand_vec_perm_1 (&dfirst)
38673 && expand_vec_perm_1 (&dsecond)
38674 && expand_vec_perm_1 (&dthird);
38675
38676 gcc_assert (ok);
38677
38678 return true;
38679 }
38680
38681 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
38682 permutation with two pshufb insns and an ior. We should have already
38683 failed all two instruction sequences. */
38684
38685 static bool
38686 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
38687 {
38688 rtx rperm[2][16], vperm, l, h, op, m128;
38689 unsigned int i, nelt, eltsz;
38690
38691 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38692 return false;
38693 gcc_assert (!d->one_operand_p);
38694
38695 nelt = d->nelt;
38696 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38697
38698 /* Generate two permutation masks. If the required element is within
38699 the given vector it is shuffled into the proper lane. If the required
38700 element is in the other vector, force a zero into the lane by setting
38701 bit 7 in the permutation mask. */
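  /* E.g. for a V16QImode extract-even permutation of two operands, the mask
     for op0 is { 0, 2, 4, ..., 14, -128, ..., -128 } and the mask for op1 is
     { -128, ..., -128, 0, 2, ..., 14 }; or-ing the two pshufb results then
     yields the final vector.  */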
38702 m128 = GEN_INT (-128);
38703 for (i = 0; i < nelt; ++i)
38704 {
38705 unsigned j, e = d->perm[i];
38706 unsigned which = (e >= nelt);
38707 if (e >= nelt)
38708 e -= nelt;
38709
38710 for (j = 0; j < eltsz; ++j)
38711 {
38712 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
38713 rperm[1-which][i*eltsz + j] = m128;
38714 }
38715 }
38716
38717 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
38718 vperm = force_reg (V16QImode, vperm);
38719
38720 l = gen_reg_rtx (V16QImode);
38721 op = gen_lowpart (V16QImode, d->op0);
38722 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
38723
38724 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
38725 vperm = force_reg (V16QImode, vperm);
38726
38727 h = gen_reg_rtx (V16QImode);
38728 op = gen_lowpart (V16QImode, d->op1);
38729 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
38730
38731 op = gen_lowpart (V16QImode, d->target);
38732 emit_insn (gen_iorv16qi3 (op, l, h));
38733
38734 return true;
38735 }
38736
38737 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
38738 with two vpshufb insns, vpermq and vpor. We should have already failed
38739 all two or three instruction sequences. */
38740
38741 static bool
38742 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
38743 {
38744 rtx rperm[2][32], vperm, l, h, hp, op, m128;
38745 unsigned int i, nelt, eltsz;
38746
38747 if (!TARGET_AVX2
38748 || !d->one_operand_p
38749 || (d->vmode != V32QImode && d->vmode != V16HImode))
38750 return false;
38751
38752 if (d->testing_p)
38753 return true;
38754
38755 nelt = d->nelt;
38756 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38757
38758 /* Generate two permutation masks. If the required element is within
38759 the same lane, it is shuffled in.  If the required element is from the
38760 other lane, force a zero by setting bit 7 in the permutation mask.
38761 The other mask has non-negative elements wherever an element is
38762 requested from the other lane, but those entries are moved to the
38763 other lane, so that the result of that vpshufb can simply have its
38764 two V2TImode halves swapped. */
38765 m128 = GEN_INT (-128);
38766 for (i = 0; i < nelt; ++i)
38767 {
38768 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38769 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38770
38771 for (j = 0; j < eltsz; ++j)
38772 {
38773 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
38774 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
38775 }
38776 }
38777
38778 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38779 vperm = force_reg (V32QImode, vperm);
38780
38781 h = gen_reg_rtx (V32QImode);
38782 op = gen_lowpart (V32QImode, d->op0);
38783 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38784
38785 /* Swap the 128-bit lanes of h into hp. */
38786 hp = gen_reg_rtx (V4DImode);
38787 op = gen_lowpart (V4DImode, h);
38788 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
38789 const1_rtx));
38790
38791 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38792 vperm = force_reg (V32QImode, vperm);
38793
38794 l = gen_reg_rtx (V32QImode);
38795 op = gen_lowpart (V32QImode, d->op0);
38796 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38797
38798 op = gen_lowpart (V32QImode, d->target);
38799 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
38800
38801 return true;
38802 }
38803
38804 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
38805 and extract-odd permutations of two V32QImode or V16HImode operands
38806 with two vpshufb insns, vpor and vpermq. We should have already
38807 failed all two or three instruction sequences. */
38808
38809 static bool
38810 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
38811 {
38812 rtx rperm[2][32], vperm, l, h, ior, op, m128;
38813 unsigned int i, nelt, eltsz;
38814
38815 if (!TARGET_AVX2
38816 || d->one_operand_p
38817 || (d->vmode != V32QImode && d->vmode != V16HImode))
38818 return false;
38819
38820 for (i = 0; i < d->nelt; ++i)
38821 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
38822 return false;
38823
38824 if (d->testing_p)
38825 return true;
38826
38827 nelt = d->nelt;
38828 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38829
38830 /* Generate two permutation masks. In the first permutation mask
38831 the first quarter will contain indexes for the first half
38832 of the op0, the second quarter will contain bit 7 set, third quarter
38833 will contain indexes for the second half of the op0 and the
38834 last quarter bit 7 set. In the second permutation mask
38835 the first quarter will contain bit 7 set, the second quarter
38836 indexes for the first half of the op1, the third quarter bit 7 set
38837 and last quarter indexes for the second half of the op1.
38838 I.e. the first mask e.g. for V32QImode extract even will be:
38839 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
38840 (all values masked with 0xf except for -128) and second mask
38841 for extract even will be
38842 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
38843 m128 = GEN_INT (-128);
38844 for (i = 0; i < nelt; ++i)
38845 {
38846 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38847 unsigned which = d->perm[i] >= nelt;
38848 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
38849
38850 for (j = 0; j < eltsz; ++j)
38851 {
38852 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
38853 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
38854 }
38855 }
38856
38857 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38858 vperm = force_reg (V32QImode, vperm);
38859
38860 l = gen_reg_rtx (V32QImode);
38861 op = gen_lowpart (V32QImode, d->op0);
38862 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38863
38864 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38865 vperm = force_reg (V32QImode, vperm);
38866
38867 h = gen_reg_rtx (V32QImode);
38868 op = gen_lowpart (V32QImode, d->op1);
38869 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38870
38871 ior = gen_reg_rtx (V32QImode);
38872 emit_insn (gen_iorv32qi3 (ior, l, h));
38873
38874 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
38875 op = gen_lowpart (V4DImode, d->target);
38876 ior = gen_lowpart (V4DImode, ior);
38877 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
38878 const1_rtx, GEN_INT (3)));
38879
38880 return true;
38881 }
38882
38883 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
38884 and extract-odd permutations. */
38885
38886 static bool
38887 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
38888 {
38889 rtx t1, t2, t3;
38890
38891 switch (d->vmode)
38892 {
38893 case V4DFmode:
38894 t1 = gen_reg_rtx (V4DFmode);
38895 t2 = gen_reg_rtx (V4DFmode);
38896
38897 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38898 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
38899 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
38900
38901 /* Now an unpck[lh]pd will produce the result required. */
38902 if (odd)
38903 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
38904 else
38905 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
38906 emit_insn (t3);
38907 break;
38908
38909 case V8SFmode:
38910 {
38911 int mask = odd ? 0xdd : 0x88;
38912
38913 t1 = gen_reg_rtx (V8SFmode);
38914 t2 = gen_reg_rtx (V8SFmode);
38915 t3 = gen_reg_rtx (V8SFmode);
38916
38917 /* Shuffle within the 128-bit lanes to produce:
38918 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
38919 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
38920 GEN_INT (mask)));
38921
38922 /* Shuffle the lanes around to produce:
38923 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
38924 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
38925 GEN_INT (0x3)));
38926
38927 /* Shuffle within the 128-bit lanes to produce:
38928 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
38929 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
38930
38931 /* Shuffle within the 128-bit lanes to produce:
38932 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
38933 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
38934
38935 /* Shuffle the lanes around to produce:
38936 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
38937 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
38938 GEN_INT (0x20)));
38939 }
38940 break;
38941
38942 case V2DFmode:
38943 case V4SFmode:
38944 case V2DImode:
38945 case V4SImode:
38946 /* These are always directly implementable by expand_vec_perm_1. */
38947 gcc_unreachable ();
38948
38949 case V8HImode:
38950 if (TARGET_SSSE3)
38951 return expand_vec_perm_pshufb2 (d);
38952 else
38953 {
38954 /* We need 2*log2(N)-1 operations to achieve odd/even
38955 with interleave. */
38956 t1 = gen_reg_rtx (V8HImode);
38957 t2 = gen_reg_rtx (V8HImode);
38958 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
38959 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
38960 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
38961 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
38962 if (odd)
38963 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
38964 else
38965 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
38966 emit_insn (t3);
38967 }
38968 break;
38969
38970 case V16QImode:
38971 if (TARGET_SSSE3)
38972 return expand_vec_perm_pshufb2 (d);
38973 else
38974 {
38975 t1 = gen_reg_rtx (V16QImode);
38976 t2 = gen_reg_rtx (V16QImode);
38977 t3 = gen_reg_rtx (V16QImode);
38978 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
38979 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
38980 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
38981 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
38982 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
38983 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
38984 if (odd)
38985 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
38986 else
38987 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
38988 emit_insn (t3);
38989 }
38990 break;
38991
38992 case V16HImode:
38993 case V32QImode:
38994 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
38995
38996 case V4DImode:
38997 if (!TARGET_AVX2)
38998 {
38999 struct expand_vec_perm_d d_copy = *d;
39000 d_copy.vmode = V4DFmode;
39001 d_copy.target = gen_lowpart (V4DFmode, d->target);
39002 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
39003 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39004 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39005 }
39006
39007 t1 = gen_reg_rtx (V4DImode);
39008 t2 = gen_reg_rtx (V4DImode);
39009
39010 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39011 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39012 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39013
39014 /* Now a vpunpck[lh]qdq will produce the result required. */
39015 if (odd)
39016 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39017 else
39018 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39019 emit_insn (t3);
39020 break;
39021
39022 case V8SImode:
39023 if (!TARGET_AVX2)
39024 {
39025 struct expand_vec_perm_d d_copy = *d;
39026 d_copy.vmode = V8SFmode;
39027 d_copy.target = gen_lowpart (V8SFmode, d->target);
39028 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39029 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39030 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39031 }
39032
39033 t1 = gen_reg_rtx (V8SImode);
39034 t2 = gen_reg_rtx (V8SImode);
39035
39036 /* Shuffle the lanes around into
39037 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39038 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39039 gen_lowpart (V4DImode, d->op0),
39040 gen_lowpart (V4DImode, d->op1),
39041 GEN_INT (0x20)));
39042 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39043 gen_lowpart (V4DImode, d->op0),
39044 gen_lowpart (V4DImode, d->op1),
39045 GEN_INT (0x31)));
39046
39047 /* Swap the 2nd and 3rd position in each lane into
39048 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39049 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39050 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39051 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39052 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39053
39054 /* Now a vpunpck[lh]qdq will produce
39055 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
39056 if (odd)
39057 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
39058 gen_lowpart (V4DImode, t1),
39059 gen_lowpart (V4DImode, t2));
39060 else
39061 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
39062 gen_lowpart (V4DImode, t1),
39063 gen_lowpart (V4DImode, t2));
39064 emit_insn (t3);
39065 break;
39066
39067 default:
39068 gcc_unreachable ();
39069 }
39070
39071 return true;
39072 }
39073
39074 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39075 extract-even and extract-odd permutations. */
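/* E.g. for V8SImode the extract-even selector is { 0, 2, 4, 6, 8, 10, 12, 14 }
   and the extract-odd selector is { 1, 3, 5, 7, 9, 11, 13, 15 }.  */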
39076
39077 static bool
39078 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
39079 {
39080 unsigned i, odd, nelt = d->nelt;
39081
39082 odd = d->perm[0];
39083 if (odd != 0 && odd != 1)
39084 return false;
39085
39086 for (i = 1; i < nelt; ++i)
39087 if (d->perm[i] != 2 * i + odd)
39088 return false;
39089
39090 return expand_vec_perm_even_odd_1 (d, odd);
39091 }
39092
39093 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
39094 permutations. We assume that expand_vec_perm_1 has already failed. */
39095
39096 static bool
39097 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
39098 {
39099 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
39100 enum machine_mode vmode = d->vmode;
39101 unsigned char perm2[4];
39102 rtx op0 = d->op0;
39103 bool ok;
39104
39105 switch (vmode)
39106 {
39107 case V4DFmode:
39108 case V8SFmode:
39109 /* These are special-cased in sse.md so that we can optionally
39110 use the vbroadcast instruction. They expand to two insns
39111 if the input happens to be in a register. */
39112 gcc_unreachable ();
39113
39114 case V2DFmode:
39115 case V2DImode:
39116 case V4SFmode:
39117 case V4SImode:
39118 /* These are always implementable using standard shuffle patterns. */
39119 gcc_unreachable ();
39120
39121 case V8HImode:
39122 case V16QImode:
39123 /* These can be implemented via interleave. We save one insn by
39124 stopping once we have promoted to V4SImode and then use pshufd. */
39125 do
39126 {
39127 rtx dest;
39128 rtx (*gen) (rtx, rtx, rtx)
39129 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
39130 : gen_vec_interleave_lowv8hi;
39131
39132 if (elt >= nelt2)
39133 {
39134 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
39135 : gen_vec_interleave_highv8hi;
39136 elt -= nelt2;
39137 }
39138 nelt2 /= 2;
39139
39140 dest = gen_reg_rtx (vmode);
39141 emit_insn (gen (dest, op0, op0));
39142 vmode = get_mode_wider_vector (vmode);
39143 op0 = gen_lowpart (vmode, dest);
39144 }
39145 while (vmode != V4SImode);
39146
39147 memset (perm2, elt, 4);
39148 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
39149 d->testing_p);
39150 gcc_assert (ok);
39151 return true;
39152
39153 case V32QImode:
39154 case V16HImode:
39155 case V8SImode:
39156 case V4DImode:
39157 /* For AVX2 broadcasts of the first element vpbroadcast* or
39158 vpermq should be used by expand_vec_perm_1. */
39159 gcc_assert (!TARGET_AVX2 || d->perm[0]);
39160 return false;
39161
39162 default:
39163 gcc_unreachable ();
39164 }
39165 }
39166
39167 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39168 broadcast permutations. */
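/* E.g. a V4SImode selector of { 2, 2, 2, 2 } replicates element 2 of the
   single input operand into every element of the result.  */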
39169
39170 static bool
39171 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
39172 {
39173 unsigned i, elt, nelt = d->nelt;
39174
39175 if (!d->one_operand_p)
39176 return false;
39177
39178 elt = d->perm[0];
39179 for (i = 1; i < nelt; ++i)
39180 if (d->perm[i] != elt)
39181 return false;
39182
39183 return expand_vec_perm_broadcast_1 (d);
39184 }
39185
39186 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
39187 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
39188 all the shorter instruction sequences. */
39189
39190 static bool
39191 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
39192 {
39193 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
39194 unsigned int i, nelt, eltsz;
39195 bool used[4];
39196
39197 if (!TARGET_AVX2
39198 || d->one_operand_p
39199 || (d->vmode != V32QImode && d->vmode != V16HImode))
39200 return false;
39201
39202 if (d->testing_p)
39203 return true;
39204
39205 nelt = d->nelt;
39206 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39207
39208 /* Generate 4 permutation masks, a same-lane and a cross-lane mask for
39209 each of the two operands.  If the required element is within the same
39210 lane, it is shuffled in.  If the required element is from the other
39211 lane, force a zero by setting bit 7 in the permutation mask.  The
39212 cross-lane masks have their non-negative elements moved to the other
39213 lane, so that the result of the corresponding vpshufb can simply have
39214 its two V2TImode halves swapped. */
39215 m128 = GEN_INT (-128);
39216 for (i = 0; i < 32; ++i)
39217 {
39218 rperm[0][i] = m128;
39219 rperm[1][i] = m128;
39220 rperm[2][i] = m128;
39221 rperm[3][i] = m128;
39222 }
39223 used[0] = false;
39224 used[1] = false;
39225 used[2] = false;
39226 used[3] = false;
39227 for (i = 0; i < nelt; ++i)
39228 {
39229 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39230 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39231 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
39232
39233 for (j = 0; j < eltsz; ++j)
39234 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
39235 used[which] = true;
39236 }
39237
39238 for (i = 0; i < 2; ++i)
39239 {
39240 if (!used[2 * i + 1])
39241 {
39242 h[i] = NULL_RTX;
39243 continue;
39244 }
39245 vperm = gen_rtx_CONST_VECTOR (V32QImode,
39246 gen_rtvec_v (32, rperm[2 * i + 1]));
39247 vperm = force_reg (V32QImode, vperm);
39248 h[i] = gen_reg_rtx (V32QImode);
39249 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39250 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
39251 }
39252
39253 /* Swap the 128-bit lanes of h[X]. */
39254 for (i = 0; i < 2; ++i)
39255 {
39256 if (h[i] == NULL_RTX)
39257 continue;
39258 op = gen_reg_rtx (V4DImode);
39259 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
39260 const2_rtx, GEN_INT (3), const0_rtx,
39261 const1_rtx));
39262 h[i] = gen_lowpart (V32QImode, op);
39263 }
39264
39265 for (i = 0; i < 2; ++i)
39266 {
39267 if (!used[2 * i])
39268 {
39269 l[i] = NULL_RTX;
39270 continue;
39271 }
39272 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
39273 vperm = force_reg (V32QImode, vperm);
39274 l[i] = gen_reg_rtx (V32QImode);
39275 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39276 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
39277 }
39278
39279 for (i = 0; i < 2; ++i)
39280 {
39281 if (h[i] && l[i])
39282 {
39283 op = gen_reg_rtx (V32QImode);
39284 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
39285 l[i] = op;
39286 }
39287 else if (h[i])
39288 l[i] = h[i];
39289 }
39290
39291 gcc_assert (l[0] && l[1]);
39292 op = gen_lowpart (V32QImode, d->target);
39293 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
39294 return true;
39295 }
39296
39297 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
39298 With all of the interface bits taken care of, perform the expansion
39299 in D and return true on success. */
39300
39301 static bool
39302 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
39303 {
39304 /* Try a single instruction expansion. */
39305 if (expand_vec_perm_1 (d))
39306 return true;
39307
39308 /* Try sequences of two instructions. */
39309
39310 if (expand_vec_perm_pshuflw_pshufhw (d))
39311 return true;
39312
39313 if (expand_vec_perm_palignr (d))
39314 return true;
39315
39316 if (expand_vec_perm_interleave2 (d))
39317 return true;
39318
39319 if (expand_vec_perm_broadcast (d))
39320 return true;
39321
39322 if (expand_vec_perm_vpermq_perm_1 (d))
39323 return true;
39324
39325 if (expand_vec_perm_vperm2f128 (d))
39326 return true;
39327
39328 /* Try sequences of three instructions. */
39329
39330 if (expand_vec_perm_2vperm2f128_vshuf (d))
39331 return true;
39332
39333 if (expand_vec_perm_pshufb2 (d))
39334 return true;
39335
39336 if (expand_vec_perm_interleave3 (d))
39337 return true;
39338
39339 if (expand_vec_perm_vperm2f128_vblend (d))
39340 return true;
39341
39342 /* Try sequences of four instructions. */
39343
39344 if (expand_vec_perm_vpshufb2_vpermq (d))
39345 return true;
39346
39347 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
39348 return true;
39349
39350 /* ??? Look for narrow permutations whose element orderings would
39351 allow the promotion to a wider mode. */
39352
39353 /* ??? Look for sequences of interleave or a wider permute that place
39354 the data into the correct lanes for a half-vector shuffle like
39355 pshuf[lh]w or vpermilps. */
39356
39357 /* ??? Look for sequences of interleave that produce the desired results.
39358 The combinatorics of punpck[lh] get pretty ugly... */
39359
39360 if (expand_vec_perm_even_odd (d))
39361 return true;
39362
39363 /* Even longer sequences. */
39364 if (expand_vec_perm_vpshufb4_vpermq2 (d))
39365 return true;
39366
39367 return false;
39368 }
39369
39370 /* If a permutation only uses one operand, make it clear. Returns true
39371 if the permutation references both operands. */
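/* E.g. with nelt == 4, the selector { 4, 5, 6, 7 } references only the
   second operand; it is folded to { 0, 1, 2, 3 } with op0 replaced by op1,
   leaving one_operand_p set and returning false.  */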
39372
39373 static bool
39374 canonicalize_perm (struct expand_vec_perm_d *d)
39375 {
39376 int i, which, nelt = d->nelt;
39377
39378 for (i = which = 0; i < nelt; ++i)
39379 which |= (d->perm[i] < nelt ? 1 : 2);
39380
39381 d->one_operand_p = true;
39382 switch (which)
39383 {
39384 default:
39385 gcc_unreachable();
39386
39387 case 3:
39388 if (!rtx_equal_p (d->op0, d->op1))
39389 {
39390 d->one_operand_p = false;
39391 break;
39392 }
39393 /* The elements of PERM do not suggest that only the first operand
39394 is used, but both operands are identical. Allow easier matching
39395 of the permutation by folding the permutation into the single
39396 input vector. */
39397 /* FALLTHRU */
39398
39399 case 2:
39400 for (i = 0; i < nelt; ++i)
39401 d->perm[i] &= nelt - 1;
39402 d->op0 = d->op1;
39403 break;
39404
39405 case 1:
39406 d->op1 = d->op0;
39407 break;
39408 }
39409
39410 return (which == 3);
39411 }
39412
39413 bool
39414 ix86_expand_vec_perm_const (rtx operands[4])
39415 {
39416 struct expand_vec_perm_d d;
39417 unsigned char perm[MAX_VECT_LEN];
39418 int i, nelt;
39419 bool two_args;
39420 rtx sel;
39421
39422 d.target = operands[0];
39423 d.op0 = operands[1];
39424 d.op1 = operands[2];
39425 sel = operands[3];
39426
39427 d.vmode = GET_MODE (d.target);
39428 gcc_assert (VECTOR_MODE_P (d.vmode));
39429 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39430 d.testing_p = false;
39431
39432 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
39433 gcc_assert (XVECLEN (sel, 0) == nelt);
39434 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
39435
39436 for (i = 0; i < nelt; ++i)
39437 {
39438 rtx e = XVECEXP (sel, 0, i);
39439 int ei = INTVAL (e) & (2 * nelt - 1);
39440 d.perm[i] = ei;
39441 perm[i] = ei;
39442 }
39443
39444 two_args = canonicalize_perm (&d);
39445
39446 if (ix86_expand_vec_perm_const_1 (&d))
39447 return true;
39448
39449 /* If the selector says both arguments are needed, but the operands are the
39450 same, the above tried to expand with one_operand_p and flattened selector.
39451 If that didn't work, retry without one_operand_p; we succeeded with that
39452 during testing. */
39453 if (two_args && d.one_operand_p)
39454 {
39455 d.one_operand_p = false;
39456 memcpy (d.perm, perm, sizeof (perm));
39457 return ix86_expand_vec_perm_const_1 (&d);
39458 }
39459
39460 return false;
39461 }
39462
39463 /* Implement targetm.vectorize.vec_perm_const_ok. */
39464
39465 static bool
39466 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
39467 const unsigned char *sel)
39468 {
39469 struct expand_vec_perm_d d;
39470 unsigned int i, nelt, which;
39471 bool ret;
39472
39473 d.vmode = vmode;
39474 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39475 d.testing_p = true;
39476
39477 /* Given sufficient ISA support we can just return true here
39478 for selected vector modes. */
39479 if (GET_MODE_SIZE (d.vmode) == 16)
39480 {
39481 /* All implementable with a single vpperm insn. */
39482 if (TARGET_XOP)
39483 return true;
39484 /* All implementable with 2 pshufb + 1 ior. */
39485 if (TARGET_SSSE3)
39486 return true;
39487 /* All implementable with shufpd or unpck[lh]pd. */
39488 if (d.nelt == 2)
39489 return true;
39490 }
39491
39492 /* Extract the values from the vector CST into the permutation
39493 array in D. */
39494 memcpy (d.perm, sel, nelt);
39495 for (i = which = 0; i < nelt; ++i)
39496 {
39497 unsigned char e = d.perm[i];
39498 gcc_assert (e < 2 * nelt);
39499 which |= (e < nelt ? 1 : 2);
39500 }
39501
39502 /* For all elements from second vector, fold the elements to first. */
39503 if (which == 2)
39504 for (i = 0; i < nelt; ++i)
39505 d.perm[i] -= nelt;
39506
39507 /* Check whether the mask can be applied to the vector type. */
39508 d.one_operand_p = (which != 3);
39509
39510 /* Implementable with shufps or pshufd. */
39511 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
39512 return true;
39513
39514 /* Otherwise we have to go through the motions and see if we can
39515 figure out how to generate the requested permutation. */
39516 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
39517 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
39518 if (!d.one_operand_p)
39519 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
39520
39521 start_sequence ();
39522 ret = ix86_expand_vec_perm_const_1 (&d);
39523 end_sequence ();
39524
39525 return ret;
39526 }
39527
39528 void
39529 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
39530 {
39531 struct expand_vec_perm_d d;
39532 unsigned i, nelt;
39533
39534 d.target = targ;
39535 d.op0 = op0;
39536 d.op1 = op1;
39537 d.vmode = GET_MODE (targ);
39538 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39539 d.one_operand_p = false;
39540 d.testing_p = false;
39541
39542 for (i = 0; i < nelt; ++i)
39543 d.perm[i] = i * 2 + odd;
39544
39545 /* We'll either be able to implement the permutation directly... */
39546 if (expand_vec_perm_1 (&d))
39547 return;
39548
39549 /* ... or we use the special-case patterns. */
39550 expand_vec_perm_even_odd_1 (&d, odd);
39551 }
39552
39553 static void
39554 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
39555 {
39556 struct expand_vec_perm_d d;
39557 unsigned i, nelt, base;
39558 bool ok;
39559
39560 d.target = targ;
39561 d.op0 = op0;
39562 d.op1 = op1;
39563 d.vmode = GET_MODE (targ);
39564 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39565 d.one_operand_p = false;
39566 d.testing_p = false;
39567
39568 base = high_p ? nelt / 2 : 0;
39569 for (i = 0; i < nelt / 2; ++i)
39570 {
39571 d.perm[i * 2] = i + base;
39572 d.perm[i * 2 + 1] = i + base + nelt;
39573 }
39574
39575 /* Note that for AVX this isn't one instruction. */
39576 ok = ix86_expand_vec_perm_const_1 (&d);
39577 gcc_assert (ok);
39578 }
39579
39580
39581 /* Expand a vector operation CODE for a V*QImode in terms of the
39582 same operation on V*HImode. */
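/* For instance, a V16QImode left shift is expanded by zero-extending the
   bytes into two V8HImode vectors, shifting each half in HImode, and then
   using a constant permutation to gather the low byte of every HImode
   result back into a single V16QImode vector.  */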
39583
39584 void
39585 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
39586 {
39587 enum machine_mode qimode = GET_MODE (dest);
39588 enum machine_mode himode;
39589 rtx (*gen_il) (rtx, rtx, rtx);
39590 rtx (*gen_ih) (rtx, rtx, rtx);
39591 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
39592 struct expand_vec_perm_d d;
39593 bool ok, full_interleave;
39594 bool uns_p = false;
39595 int i;
39596
39597 switch (qimode)
39598 {
39599 case V16QImode:
39600 himode = V8HImode;
39601 gen_il = gen_vec_interleave_lowv16qi;
39602 gen_ih = gen_vec_interleave_highv16qi;
39603 break;
39604 case V32QImode:
39605 himode = V16HImode;
39606 gen_il = gen_avx2_interleave_lowv32qi;
39607 gen_ih = gen_avx2_interleave_highv32qi;
39608 break;
39609 default:
39610 gcc_unreachable ();
39611 }
39612
39613 op2_l = op2_h = op2;
39614 switch (code)
39615 {
39616 case MULT:
39617 /* Unpack data such that we've got a source byte in each low byte of
39618 each word. We don't care what goes into the high byte of each word.
39619 Rather than trying to get zero in there, most convenient is to let
39620 it be a copy of the low byte. */
39621 op2_l = gen_reg_rtx (qimode);
39622 op2_h = gen_reg_rtx (qimode);
39623 emit_insn (gen_il (op2_l, op2, op2));
39624 emit_insn (gen_ih (op2_h, op2, op2));
39625 /* FALLTHRU */
39626
39627 op1_l = gen_reg_rtx (qimode);
39628 op1_h = gen_reg_rtx (qimode);
39629 emit_insn (gen_il (op1_l, op1, op1));
39630 emit_insn (gen_ih (op1_h, op1, op1));
39631 full_interleave = qimode == V16QImode;
39632 break;
39633
39634 case ASHIFT:
39635 case LSHIFTRT:
39636 uns_p = true;
39637 /* FALLTHRU */
39638 case ASHIFTRT:
39639 op1_l = gen_reg_rtx (himode);
39640 op1_h = gen_reg_rtx (himode);
39641 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
39642 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
39643 full_interleave = true;
39644 break;
39645 default:
39646 gcc_unreachable ();
39647 }
39648
39649 /* Perform the operation. */
39650 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
39651 1, OPTAB_DIRECT);
39652 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
39653 1, OPTAB_DIRECT);
39654 gcc_assert (res_l && res_h);
39655
39656 /* Merge the data back into the right place. */
39657 d.target = dest;
39658 d.op0 = gen_lowpart (qimode, res_l);
39659 d.op1 = gen_lowpart (qimode, res_h);
39660 d.vmode = qimode;
39661 d.nelt = GET_MODE_NUNITS (qimode);
39662 d.one_operand_p = false;
39663 d.testing_p = false;
39664
39665 if (full_interleave)
39666 {
39667 /* For SSE2, we used a full interleave, so the desired
39668 results are in the even elements. */
39669 for (i = 0; i < 32; ++i)
39670 d.perm[i] = i * 2;
39671 }
39672 else
39673 {
39674 /* For AVX, the interleave used above was not cross-lane. So the
39675 extraction we want is the even elements, but with the second and third
39676 quarters swapped.  Happily, that is even one insn shorter than even extraction. */
39677 for (i = 0; i < 32; ++i)
39678 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
39679 }
39680
39681 ok = ix86_expand_vec_perm_const_1 (&d);
39682 gcc_assert (ok);
39683
39684 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39685 gen_rtx_fmt_ee (code, qimode, op1, op2));
39686 }
39687
39688 void
39689 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
39690 bool uns_p, bool odd_p)
39691 {
39692 enum machine_mode mode = GET_MODE (op1);
39693 enum machine_mode wmode = GET_MODE (dest);
39694 rtx x;
39695
39696 /* We only play even/odd games with vectors of SImode. */
39697 gcc_assert (mode == V4SImode || mode == V8SImode);
39698
39699 /* If we're looking for the odd results, shift those members down to
39700 the even slots. For some cpus this is faster than a PSHUFD. */
39701 if (odd_p)
39702 {
39703 if (TARGET_XOP && mode == V4SImode)
39704 {
39705 x = force_reg (wmode, CONST0_RTX (wmode));
39706 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
39707 return;
39708 }
39709
39710 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
39711 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
39712 x, NULL, 1, OPTAB_DIRECT);
39713 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
39714 x, NULL, 1, OPTAB_DIRECT);
39715 op1 = gen_lowpart (mode, op1);
39716 op2 = gen_lowpart (mode, op2);
39717 }
39718
39719 if (mode == V8SImode)
39720 {
39721 if (uns_p)
39722 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
39723 else
39724 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
39725 }
39726 else if (uns_p)
39727 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
39728 else if (TARGET_SSE4_1)
39729 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
39730 else
39731 {
39732 rtx s1, s2, t0, t1, t2;
39733
39734 /* The easiest way to implement this without PMULDQ is to go through
39735 the motions as if we are performing a full 64-bit multiply, except
39736 that we need to do less shuffling of the elements. */
39737
39738 /* Compute the sign-extension, aka highparts, of the two operands. */
39739 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39740 op1, pc_rtx, pc_rtx);
39741 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39742 op2, pc_rtx, pc_rtx);
39743
39744 /* Multiply LO(A) * HI(B), and vice-versa. */
39745 t1 = gen_reg_rtx (wmode);
39746 t2 = gen_reg_rtx (wmode);
39747 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
39748 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
39749
39750 /* Multiply LO(A) * LO(B). */
39751 t0 = gen_reg_rtx (wmode);
39752 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
39753
39754 /* Combine and shift the highparts into place. */
39755 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
39756 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
39757 1, OPTAB_DIRECT);
39758
39759 /* Combine high and low parts. */
39760 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
39761 return;
39762 }
39763 emit_insn (x);
39764 }
39765
39766 void
39767 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
39768 bool uns_p, bool high_p)
39769 {
39770 enum machine_mode wmode = GET_MODE (dest);
39771 enum machine_mode mode = GET_MODE (op1);
39772 rtx t1, t2, t3, t4, mask;
39773
39774 switch (mode)
39775 {
39776 case V4SImode:
39777 t1 = gen_reg_rtx (mode);
39778 t2 = gen_reg_rtx (mode);
39779 if (TARGET_XOP && !uns_p)
39780 {
39781 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
39782 shuffle the elements once so that all elements are in the right
39783 place for immediate use: { A C B D }. */
39784 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
39785 const1_rtx, GEN_INT (3)));
39786 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
39787 const1_rtx, GEN_INT (3)));
39788 }
39789 else
39790 {
39791 /* Put the elements into place for the multiply. */
39792 ix86_expand_vec_interleave (t1, op1, op1, high_p);
39793 ix86_expand_vec_interleave (t2, op2, op2, high_p);
39794 high_p = false;
39795 }
39796 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
39797 break;
39798
39799 case V8SImode:
39800 /* Shuffle the elements between the lanes. After this we
39801 have { A B E F | C D G H } for each operand. */
39802 t1 = gen_reg_rtx (V4DImode);
39803 t2 = gen_reg_rtx (V4DImode);
39804 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
39805 const0_rtx, const2_rtx,
39806 const1_rtx, GEN_INT (3)));
39807 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
39808 const0_rtx, const2_rtx,
39809 const1_rtx, GEN_INT (3)));
39810
39811 /* Shuffle the elements within the lanes. After this we
39812 have { A A B B | C C D D } or { E E F F | G G H H }. */
39813 t3 = gen_reg_rtx (V8SImode);
39814 t4 = gen_reg_rtx (V8SImode);
39815 mask = GEN_INT (high_p
39816 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
39817 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
39818 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
39819 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
39820
39821 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
39822 break;
39823
39824 case V8HImode:
39825 case V16HImode:
39826 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
39827 uns_p, OPTAB_DIRECT);
39828 t2 = expand_binop (mode,
39829 uns_p ? umul_highpart_optab : smul_highpart_optab,
39830 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
39831 gcc_assert (t1 && t2);
39832
39833 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
39834 break;
39835
39836 case V16QImode:
39837 case V32QImode:
39838 t1 = gen_reg_rtx (wmode);
39839 t2 = gen_reg_rtx (wmode);
39840 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
39841 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
39842
39843 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
39844 break;
39845
39846 default:
39847 gcc_unreachable ();
39848 }
39849 }
39850
39851 void
39852 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
39853 {
39854 rtx res_1, res_2;
39855
39856 res_1 = gen_reg_rtx (V4SImode);
39857 res_2 = gen_reg_rtx (V4SImode);
39858 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
39859 op1, op2, true, false);
39860 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
39861 op1, op2, true, true);
39862
39863 /* Move the results in element 2 down to element 1; we don't care
39864 what goes in elements 2 and 3. Then we can merge the parts
39865 back together with an interleave.
39866
39867 Note that two other sequences were tried:
39868 (1) Use interleaves at the start instead of psrldq, which allows
39869 us to use a single shufps to merge things back at the end.
39870 (2) Use shufps here to combine the two vectors, then pshufd to
39871 put the elements in the correct order.
39872 In both cases the cost of the reformatting stall was too high
39873 and the overall sequence slower. */
39874
39875 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
39876 const0_rtx, const0_rtx));
39877 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
39878 const0_rtx, const0_rtx));
39879 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
39880
39881 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
39882 }
39883
39884 void
39885 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
39886 {
39887 enum machine_mode mode = GET_MODE (op0);
39888 rtx t1, t2, t3, t4, t5, t6;
39889
39890 if (TARGET_XOP && mode == V2DImode)
39891 {
39892 /* op1: A,B,C,D, op2: E,F,G,H */
39893 op1 = gen_lowpart (V4SImode, op1);
39894 op2 = gen_lowpart (V4SImode, op2);
39895
39896 t1 = gen_reg_rtx (V4SImode);
39897 t2 = gen_reg_rtx (V4SImode);
39898 t3 = gen_reg_rtx (V2DImode);
39899 t4 = gen_reg_rtx (V2DImode);
39900
39901 /* t1: B,A,D,C */
39902 emit_insn (gen_sse2_pshufd_1 (t1, op1,
39903 GEN_INT (1),
39904 GEN_INT (0),
39905 GEN_INT (3),
39906 GEN_INT (2)));
39907
39908 /* t2: (B*E),(A*F),(D*G),(C*H) */
39909 emit_insn (gen_mulv4si3 (t2, t1, op2));
39910
39911 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
39912 emit_insn (gen_xop_phadddq (t3, t2));
39913
39914 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
39915 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
39916
39917 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
39918 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
39919 }
39920 else
39921 {
39922 enum machine_mode nmode;
39923 rtx (*umul) (rtx, rtx, rtx);
39924
39925 if (mode == V2DImode)
39926 {
39927 umul = gen_vec_widen_umult_even_v4si;
39928 nmode = V4SImode;
39929 }
39930 else if (mode == V4DImode)
39931 {
39932 umul = gen_vec_widen_umult_even_v8si;
39933 nmode = V8SImode;
39934 }
39935 else
39936 gcc_unreachable ();
39937
39938
39939 /* Multiply low parts. */
39940 t1 = gen_reg_rtx (mode);
39941 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
39942
39943 /* Shift input vectors right 32 bits so we can multiply high parts. */
39944 t6 = GEN_INT (32);
39945 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
39946 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
39947
39948 /* Multiply high parts by low parts. */
39949 t4 = gen_reg_rtx (mode);
39950 t5 = gen_reg_rtx (mode);
39951 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
39952 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
39953
39954 /* Combine and shift the highparts back. */
39955 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
39956 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
39957
39958 /* Combine high and low parts. */
39959 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
39960 }
39961
39962 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39963 gen_rtx_MULT (mode, op1, op2));
39964 }
39965
39966 /* Expand an insert into a vector register through pinsr insn.
39967 Return true if successful. */
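/* E.g. inserting an HImode value at bit position 16 of a V8HImode register
   maps to sse2_pinsrw on element 1 (pos /= size), with the element selected
   by the single-bit immediate GEN_INT (1 << pos).  */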
39968
39969 bool
39970 ix86_expand_pinsr (rtx *operands)
39971 {
39972 rtx dst = operands[0];
39973 rtx src = operands[3];
39974
39975 unsigned int size = INTVAL (operands[1]);
39976 unsigned int pos = INTVAL (operands[2]);
39977
39978 if (GET_CODE (dst) == SUBREG)
39979 {
39980 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
39981 dst = SUBREG_REG (dst);
39982 }
39983
39984 if (GET_CODE (src) == SUBREG)
39985 src = SUBREG_REG (src);
39986
39987 switch (GET_MODE (dst))
39988 {
39989 case V16QImode:
39990 case V8HImode:
39991 case V4SImode:
39992 case V2DImode:
39993 {
39994 enum machine_mode srcmode, dstmode;
39995 rtx (*pinsr)(rtx, rtx, rtx, rtx);
39996
39997 srcmode = mode_for_size (size, MODE_INT, 0);
39998
39999 switch (srcmode)
40000 {
40001 case QImode:
40002 if (!TARGET_SSE4_1)
40003 return false;
40004 dstmode = V16QImode;
40005 pinsr = gen_sse4_1_pinsrb;
40006 break;
40007
40008 case HImode:
40009 if (!TARGET_SSE2)
40010 return false;
40011 dstmode = V8HImode;
40012 pinsr = gen_sse2_pinsrw;
40013 break;
40014
40015 case SImode:
40016 if (!TARGET_SSE4_1)
40017 return false;
40018 dstmode = V4SImode;
40019 pinsr = gen_sse4_1_pinsrd;
40020 break;
40021
40022 case DImode:
40023 gcc_assert (TARGET_64BIT);
40024 if (!TARGET_SSE4_1)
40025 return false;
40026 dstmode = V2DImode;
40027 pinsr = gen_sse4_1_pinsrq;
40028 break;
40029
40030 default:
40031 return false;
40032 }
40033
40034 dst = gen_lowpart (dstmode, dst);
40035 src = gen_lowpart (srcmode, src);
40036
40037 pos /= size;
40038
40039 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40040 return true;
40041 }
40042
40043 default:
40044 return false;
40045 }
40046 }
40047 \f
40048 /* This function returns the calling abi specific va_list type node.
40049 It returns the FNDECL specific va_list type. */
40050
40051 static tree
40052 ix86_fn_abi_va_list (tree fndecl)
40053 {
40054 if (!TARGET_64BIT)
40055 return va_list_type_node;
40056 gcc_assert (fndecl != NULL_TREE);
40057
40058 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
40059 return ms_va_list_type_node;
40060 else
40061 return sysv_va_list_type_node;
40062 }
40063
40064 /* Returns the canonical va_list type specified by TYPE. If there
40065 is no valid TYPE provided, it returns NULL_TREE. */
40066
40067 static tree
40068 ix86_canonical_va_list_type (tree type)
40069 {
40070 tree wtype, htype;
40071
40072 /* Resolve references and pointers to va_list type. */
40073 if (TREE_CODE (type) == MEM_REF)
40074 type = TREE_TYPE (type);
40075 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
40076 type = TREE_TYPE (type);
40077 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
40078 type = TREE_TYPE (type);
40079
40080 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
40081 {
40082 wtype = va_list_type_node;
40083 gcc_assert (wtype != NULL_TREE);
40084 htype = type;
40085 if (TREE_CODE (wtype) == ARRAY_TYPE)
40086 {
40087 /* If va_list is an array type, the argument may have decayed
40088 to a pointer type, e.g. by being passed to another function.
40089 In that case, unwrap both types so that we can compare the
40090 underlying records. */
40091 if (TREE_CODE (htype) == ARRAY_TYPE
40092 || POINTER_TYPE_P (htype))
40093 {
40094 wtype = TREE_TYPE (wtype);
40095 htype = TREE_TYPE (htype);
40096 }
40097 }
40098 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40099 return va_list_type_node;
40100 wtype = sysv_va_list_type_node;
40101 gcc_assert (wtype != NULL_TREE);
40102 htype = type;
40103 if (TREE_CODE (wtype) == ARRAY_TYPE)
40104 {
40105 /* If va_list is an array type, the argument may have decayed
40106 to a pointer type, e.g. by being passed to another function.
40107 In that case, unwrap both types so that we can compare the
40108 underlying records. */
40109 if (TREE_CODE (htype) == ARRAY_TYPE
40110 || POINTER_TYPE_P (htype))
40111 {
40112 wtype = TREE_TYPE (wtype);
40113 htype = TREE_TYPE (htype);
40114 }
40115 }
40116 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40117 return sysv_va_list_type_node;
40118 wtype = ms_va_list_type_node;
40119 gcc_assert (wtype != NULL_TREE);
40120 htype = type;
40121 if (TREE_CODE (wtype) == ARRAY_TYPE)
40122 {
40123 /* If va_list is an array type, the argument may have decayed
40124 to a pointer type, e.g. by being passed to another function.
40125 In that case, unwrap both types so that we can compare the
40126 underlying records. */
40127 if (TREE_CODE (htype) == ARRAY_TYPE
40128 || POINTER_TYPE_P (htype))
40129 {
40130 wtype = TREE_TYPE (wtype);
40131 htype = TREE_TYPE (htype);
40132 }
40133 }
40134 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40135 return ms_va_list_type_node;
40136 return NULL_TREE;
40137 }
40138 return std_canonical_va_list_type (type);
40139 }
40140
40141 /* Iterate through the target-specific builtin types for va_list.
40142 IDX denotes the iterator, *PTREE is set to the result type of
40143 the va_list builtin, and *PNAME to its internal type.
40144 Returns zero if there is no element for this index, otherwise
40145 IDX should be increased upon the next call.
40146 Note, do not iterate a base builtin's name like __builtin_va_list.
40147 Used from c_common_nodes_and_builtins. */
40148
40149 static int
40150 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
40151 {
40152 if (TARGET_64BIT)
40153 {
40154 switch (idx)
40155 {
40156 default:
40157 break;
40158
40159 case 0:
40160 *ptree = ms_va_list_type_node;
40161 *pname = "__builtin_ms_va_list";
40162 return 1;
40163
40164 case 1:
40165 *ptree = sysv_va_list_type_node;
40166 *pname = "__builtin_sysv_va_list";
40167 return 1;
40168 }
40169 }
40170
40171 return 0;
40172 }
40173
40174 #undef TARGET_SCHED_DISPATCH
40175 #define TARGET_SCHED_DISPATCH has_dispatch
40176 #undef TARGET_SCHED_DISPATCH_DO
40177 #define TARGET_SCHED_DISPATCH_DO do_dispatch
40178 #undef TARGET_SCHED_REASSOCIATION_WIDTH
40179 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
40180 #undef TARGET_SCHED_REORDER
40181 #define TARGET_SCHED_REORDER ix86_sched_reorder
40182 #undef TARGET_SCHED_ADJUST_PRIORITY
40183 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
40184 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
40185 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
40186
40187 /* The size of the dispatch window is the total number of bytes of
40188 object code allowed in a window. */
40189 #define DISPATCH_WINDOW_SIZE 16
40190
40191 /* Number of dispatch windows considered for scheduling. */
40192 #define MAX_DISPATCH_WINDOWS 3
40193
40194 /* Maximum number of instructions in a window. */
40195 #define MAX_INSN 4
40196
40197 /* Maximum number of immediate operands in a window. */
40198 #define MAX_IMM 4
40199
40200 /* Maximum number of immediate bits allowed in a window. */
40201 #define MAX_IMM_SIZE 128
40202
40203 /* Maximum number of 32 bit immediates allowed in a window. */
40204 #define MAX_IMM_32 4
40205
40206 /* Maximum number of 64 bit immediates allowed in a window. */
40207 #define MAX_IMM_64 2
40208
40209 /* Maximum total of loads or prefetches allowed in a window. */
40210 #define MAX_LOAD 2
40211
40212 /* Maximum total of stores allowed in a window. */
40213 #define MAX_STORE 1
40214
40215 #undef BIG
40216 #define BIG 100
40217
40218
40219 /* Dispatch groups.  Instructions that affect the mix in a dispatch window. */
40220 enum dispatch_group {
40221 disp_no_group = 0,
40222 disp_load,
40223 disp_store,
40224 disp_load_store,
40225 disp_prefetch,
40226 disp_imm,
40227 disp_imm_32,
40228 disp_imm_64,
40229 disp_branch,
40230 disp_cmp,
40231 disp_jcc,
40232 disp_last
40233 };
40234
40235 /* Number of allowable groups in a dispatch window. It is an array
40236 indexed by dispatch_group enum. 100 is used as a big number,
40237 because the number of these kinds of operations does not have any
40238 effect on a dispatch window, but we need them for other reasons in
40239 the table. */
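/* I.e. a window allows at most MAX_LOAD (2) loads, MAX_STORE (1) store,
   one combined load+store, two prefetches, MAX_IMM (4) immediates of which
   at most MAX_IMM_32 (4) are 32-bit and MAX_IMM_64 (2) are 64-bit, and a
   single branch; compares and conditional jumps get the BIG value and so
   never limit a window by themselves.  */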
40240 static unsigned int num_allowable_groups[disp_last] = {
40241 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
40242 };
40243
40244 char group_name[disp_last + 1][16] = {
40245 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
40246 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
40247 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
40248 };
40249
40250 /* Instruction path. */
40251 enum insn_path {
40252 no_path = 0,
40253 path_single, /* Single micro op. */
40254 path_double, /* Double micro op. */
40255 path_multi, /* Instructions with more than 2 micro ops. */
40256 last_path
40257 };
40258
40259 /* sched_insn_info defines a window to the instructions scheduled in
40260 the basic block. It contains a pointer to the insn_info table and
40261 the instruction scheduled.
40262
40263 Windows are allocated for each basic block and are linked
40264 together. */
40265 typedef struct sched_insn_info_s {
40266 rtx insn;
40267 enum dispatch_group group;
40268 enum insn_path path;
40269 int byte_len;
40270 int imm_bytes;
40271 } sched_insn_info;
40272
40273 /* Linked list of dispatch windows. This is a two way list of
40274 dispatch windows of a basic block. It contains information about
40275 the number of uops in the window and the total number of
40276 instructions and of bytes in the object code for this dispatch
40277 window. */
40278 typedef struct dispatch_windows_s {
40279 int num_insn; /* Number of insn in the window. */
40280 int num_uops; /* Number of uops in the window. */
40281 int window_size; /* Number of bytes in the window. */
40282 int window_num; /* Window number, either 0 or 1. */
40283 int num_imm; /* Number of immediates in an insn. */
40284 int num_imm_32; /* Number of 32 bit immediates in an insn. */
40285 int num_imm_64; /* Number of 64 bit immediates in an insn. */
40286 int imm_size; /* Total immediates in the window. */
40287 int num_loads; /* Total memory loads in the window. */
40288 int num_stores; /* Total memory stores in the window. */
40289 int violation; /* Violation exists in window. */
40290 sched_insn_info *window; /* Pointer to the window. */
40291 struct dispatch_windows_s *next;
40292 struct dispatch_windows_s *prev;
40293 } dispatch_windows;
40294
40295 /* Immediate values used in an insn. */
40296 typedef struct imm_info_s
40297 {
40298 int imm;
40299 int imm32;
40300 int imm64;
40301 } imm_info;
40302
40303 static dispatch_windows *dispatch_window_list;
40304 static dispatch_windows *dispatch_window_list1;
40305
40306 /* Get dispatch group of insn. */
40307
40308 static enum dispatch_group
40309 get_mem_group (rtx insn)
40310 {
40311 enum attr_memory memory;
40312
40313 if (INSN_CODE (insn) < 0)
40314 return disp_no_group;
40315 memory = get_attr_memory (insn);
40316 if (memory == MEMORY_STORE)
40317 return disp_store;
40318
40319 if (memory == MEMORY_LOAD)
40320 return disp_load;
40321
40322 if (memory == MEMORY_BOTH)
40323 return disp_load_store;
40324
40325 return disp_no_group;
40326 }
40327
40328 /* Return true if insn is a compare instruction. */
40329
40330 static bool
40331 is_cmp (rtx insn)
40332 {
40333 enum attr_type type;
40334
40335 type = get_attr_type (insn);
40336 return (type == TYPE_TEST
40337 || type == TYPE_ICMP
40338 || type == TYPE_FCMP
40339 || GET_CODE (PATTERN (insn)) == COMPARE);
40340 }
40341
40342 /* Return true if a dispatch violation was encountered. */
40343
40344 static bool
40345 dispatch_violation (void)
40346 {
40347 if (dispatch_window_list->next)
40348 return dispatch_window_list->next->violation;
40349 return dispatch_window_list->violation;
40350 }
40351
40352 /* Return true if insn is a branch instruction. */
40353
40354 static bool
40355 is_branch (rtx insn)
40356 {
40357 return (CALL_P (insn) || JUMP_P (insn));
40358 }
40359
40360 /* Return true if insn is a prefetch instruction. */
40361
40362 static bool
40363 is_prefetch (rtx insn)
40364 {
40365 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
40366 }
40367
40368 /* This function initializes a dispatch window and the list container holding a
40369 pointer to the window. */
40370
40371 static void
40372 init_window (int window_num)
40373 {
40374 int i;
40375 dispatch_windows *new_list;
40376
40377 if (window_num == 0)
40378 new_list = dispatch_window_list;
40379 else
40380 new_list = dispatch_window_list1;
40381
40382 new_list->num_insn = 0;
40383 new_list->num_uops = 0;
40384 new_list->window_size = 0;
40385 new_list->next = NULL;
40386 new_list->prev = NULL;
40387 new_list->window_num = window_num;
40388 new_list->num_imm = 0;
40389 new_list->num_imm_32 = 0;
40390 new_list->num_imm_64 = 0;
40391 new_list->imm_size = 0;
40392 new_list->num_loads = 0;
40393 new_list->num_stores = 0;
40394 new_list->violation = false;
40395
40396 for (i = 0; i < MAX_INSN; i++)
40397 {
40398 new_list->window[i].insn = NULL;
40399 new_list->window[i].group = disp_no_group;
40400 new_list->window[i].path = no_path;
40401 new_list->window[i].byte_len = 0;
40402 new_list->window[i].imm_bytes = 0;
40403 }
40404 return;
40405 }
40406
40407 /* This function allocates and initializes a dispatch window and the
40408 list container holding a pointer to the window. */
40409
40410 static dispatch_windows *
40411 allocate_window (void)
40412 {
40413 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
40414 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
40415
40416 return new_list;
40417 }
40418
40419 /* This routine initializes the dispatch scheduling information. It
40420 initiates building dispatch scheduler tables and constructs the
40421 first dispatch window. */
40422
40423 static void
40424 init_dispatch_sched (void)
40425 {
40426 /* Allocate a dispatch list and a window. */
40427 dispatch_window_list = allocate_window ();
40428 dispatch_window_list1 = allocate_window ();
40429 init_window (0);
40430 init_window (1);
40431 }
40432
40433 /* This function returns true if a branch is detected. End of a basic block
40434 does not have to be a branch, but here we assume only branches end a
40435 window. */
40436
40437 static bool
40438 is_end_basic_block (enum dispatch_group group)
40439 {
40440 return group == disp_branch;
40441 }
40442
40443 /* This function is called when the end of a window processing is reached. */
40444
40445 static void
40446 process_end_window (void)
40447 {
40448 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
40449 if (dispatch_window_list->next)
40450 {
40451 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
40452 gcc_assert (dispatch_window_list->window_size
40453 + dispatch_window_list1->window_size <= 48);
40454 init_window (1);
40455 }
40456 init_window (0);
40457 }
40458
40459 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
40460 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
40461 for 48 bytes of instructions. Note that these windows are not dispatch
40462 windows whose size is DISPATCH_WINDOW_SIZE. */
40463
40464 static dispatch_windows *
40465 allocate_next_window (int window_num)
40466 {
40467 if (window_num == 0)
40468 {
40469 if (dispatch_window_list->next)
40470 init_window (1);
40471 init_window (0);
40472 return dispatch_window_list;
40473 }
40474
40475 dispatch_window_list->next = dispatch_window_list1;
40476 dispatch_window_list1->prev = dispatch_window_list;
40477
40478 return dispatch_window_list1;
40479 }
40480
40481 /* Helper for find_constant; count and classify one immediate operand. */
40482
40483 static int
40484 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
40485 {
40486 if (*in_rtx == 0)
40487 return 0;
40488
40489 switch (GET_CODE (*in_rtx))
40490 {
40491 case CONST:
40492 case SYMBOL_REF:
40493 case CONST_INT:
40494 (imm_values->imm)++;
40495 if (x86_64_immediate_operand (*in_rtx, SImode))
40496 (imm_values->imm32)++;
40497 else
40498 (imm_values->imm64)++;
40499 break;
40500
40501 case CONST_DOUBLE:
40502 (imm_values->imm)++;
40503 (imm_values->imm64)++;
40504 break;
40505
40506 case CODE_LABEL:
40507 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
40508 {
40509 (imm_values->imm)++;
40510 (imm_values->imm32)++;
40511 }
40512 break;
40513
40514 default:
40515 break;
40516 }
40517
40518 return 0;
40519 }
40520
40521 /* Compute number of immediate operands of an instruction. */
40522
40523 static void
40524 find_constant (rtx in_rtx, imm_info *imm_values)
40525 {
40526 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
40527 (rtx_function) find_constant_1, (void *) imm_values);
40528 }
40529
40530 /* Return total size of immediate operands of an instruction along with number
40531 of corresponding immediate operands. It initializes its parameters to zero
40532 before calling FIND_CONSTANT.
40533 INSN is the input instruction. IMM is the total of immediates.
40534 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
40535 bit immediates. */
40536
40537 static int
40538 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
40539 {
40540 imm_info imm_values = {0, 0, 0};
40541
40542 find_constant (insn, &imm_values);
40543 *imm = imm_values.imm;
40544 *imm32 = imm_values.imm32;
40545 *imm64 = imm_values.imm64;
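/* Each 32-bit immediate contributes 4 bytes to the encoding, each
   64-bit immediate 8 bytes. */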
40546 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
40547 }
40548
40549 /* This function indicates whether an instruction has at least one
40550 immediate operand. */
40551
40552 static bool
40553 has_immediate (rtx insn)
40554 {
40555 int num_imm_operand;
40556 int num_imm32_operand;
40557 int num_imm64_operand;
40558
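/* get_num_immediates returns the combined size of the immediates in
   bytes, so a nonzero result means INSN has at least one immediate
   operand. */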
40559 if (insn)
40560 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40561 &num_imm64_operand);
40562 return false;
40563 }
40564
40565 /* Return single or double path for instructions. */
40566
40567 static enum insn_path
40568 get_insn_path (rtx insn)
40569 {
40570 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
40571
40572 if ((int)path == 0)
40573 return path_single;
40574
40575 if ((int)path == 1)
40576 return path_double;
40577
40578 return path_multi;
40579 }
40580
40581 /* Return insn dispatch group. */
40582
40583 static enum dispatch_group
40584 get_insn_group (rtx insn)
40585 {
40586 enum dispatch_group group = get_mem_group (insn);
40587 if (group)
40588 return group;
40589
40590 if (is_branch (insn))
40591 return disp_branch;
40592
40593 if (is_cmp (insn))
40594 return disp_cmp;
40595
40596 if (has_immediate (insn))
40597 return disp_imm;
40598
40599 if (is_prefetch (insn))
40600 return disp_prefetch;
40601
40602 return disp_no_group;
40603 }
40604
40605 /* Count number of GROUP restricted instructions in a dispatch
40606 window WINDOW_LIST. */
40607
40608 static int
40609 count_num_restricted (rtx insn, dispatch_windows *window_list)
40610 {
40611 enum dispatch_group group = get_insn_group (insn);
40612 int imm_size;
40613 int num_imm_operand;
40614 int num_imm32_operand;
40615 int num_imm64_operand;
40616
40617 if (group == disp_no_group)
40618 return 0;
40619
40620 if (group == disp_imm)
40621 {
40622 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40623 &num_imm64_operand);
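/* Return BIG if adding INSN's immediates would exceed any per-window
   limit: MAX_IMM immediate operands, MAX_IMM_SIZE bytes in total,
   MAX_IMM_32 32-bit slots (a 64-bit immediate occupies two of them),
   or MAX_IMM_64 64-bit immediates.  The final clause also rejects a
   64-bit immediate that would exactly fill MAX_IMM_SIZE when the
   window already holds several instructions. */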
40624 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
40625 || num_imm_operand + window_list->num_imm > MAX_IMM
40626 || (num_imm32_operand > 0
40627 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
40628 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
40629 || (num_imm64_operand > 0
40630 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
40631 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
40632 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
40633 && num_imm64_operand > 0
40634 && ((window_list->num_imm_64 > 0
40635 && window_list->num_insn >= 2)
40636 || window_list->num_insn >= 3)))
40637 return BIG;
40638
40639 return 1;
40640 }
40641
40642 if ((group == disp_load_store
40643 && (window_list->num_loads >= MAX_LOAD
40644 || window_list->num_stores >= MAX_STORE))
40645 || ((group == disp_load
40646 || group == disp_prefetch)
40647 && window_list->num_loads >= MAX_LOAD)
40648 || (group == disp_store
40649 && window_list->num_stores >= MAX_STORE))
40650 return BIG;
40651
40652 return 1;
40653 }
40654
40655 /* This function returns true if insn satisfies dispatch rules on the
40656 last window scheduled. */
40657
40658 static bool
40659 fits_dispatch_window (rtx insn)
40660 {
40661 dispatch_windows *window_list = dispatch_window_list;
40662 dispatch_windows *window_list_next = dispatch_window_list->next;
40663 unsigned int num_restrict;
40664 enum dispatch_group group = get_insn_group (insn);
40665 enum insn_path path = get_insn_path (insn);
40666 int sum;
40667
40668 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
40669 instructions should be given the lowest priority in the
40670 scheduling process in the Haifa scheduler to make sure they will be
40671 scheduled in the same dispatch window as the reference to them. */
40672 if (group == disp_jcc || group == disp_cmp)
40673 return false;
40674
40675 /* Check nonrestricted. */
40676 if (group == disp_no_group || group == disp_branch)
40677 return true;
40678
40679 /* Get last dispatch window. */
40680 if (window_list_next)
40681 window_list = window_list_next;
40682
40683 if (window_list->window_num == 1)
40684 {
40685 sum = window_list->prev->window_size + window_list->window_size;
40686
40687 if (sum == 32
40688 || (min_insn_size (insn) + sum) >= 48)
40689 /* Window 1 is full. Go for next window. */
40690 return true;
40691 }
40692
40693 num_restrict = count_num_restricted (insn, window_list);
40694
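/* num_allowable_groups (defined earlier in this file) gives the number
   of insns of each restricted dispatch group that a window may accept;
   count_num_restricted returns BIG when INSN by itself would overflow
   the window's limits. */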
40695 if (num_restrict > num_allowable_groups[group])
40696 return false;
40697
40698 /* See if it fits in the first window. */
40699 if (window_list->window_num == 0)
40700 {
40701 /* The first window should have only single and double path
40702 uops. */
40703 if (path == path_double
40704 && (window_list->num_uops + 2) > MAX_INSN)
40705 return false;
40706 else if (path != path_single)
40707 return false;
40708 }
40709 return true;
40710 }
40711
40712 /* Add an instruction INSN with NUM_UOPS micro-operations to the
40713 dispatch window WINDOW_LIST. */
40714
40715 static void
40716 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
40717 {
40718 int byte_len = min_insn_size (insn);
40719 int num_insn = window_list->num_insn;
40720 int imm_size;
40721 sched_insn_info *window = window_list->window;
40722 enum dispatch_group group = get_insn_group (insn);
40723 enum insn_path path = get_insn_path (insn);
40724 int num_imm_operand;
40725 int num_imm32_operand;
40726 int num_imm64_operand;
40727
40728 if (!window_list->violation && group != disp_cmp
40729 && !fits_dispatch_window (insn))
40730 window_list->violation = true;
40731
40732 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40733 &num_imm64_operand);
40734
40735 /* Initialize window with new instruction. */
40736 window[num_insn].insn = insn;
40737 window[num_insn].byte_len = byte_len;
40738 window[num_insn].group = group;
40739 window[num_insn].path = path;
40740 window[num_insn].imm_bytes = imm_size;
40741
40742 window_list->window_size += byte_len;
40743 window_list->num_insn = num_insn + 1;
40744 window_list->num_uops = window_list->num_uops + num_uops;
40745 window_list->imm_size += imm_size;
40746 window_list->num_imm += num_imm_operand;
40747 window_list->num_imm_32 += num_imm32_operand;
40748 window_list->num_imm_64 += num_imm64_operand;
40749
40750 if (group == disp_store)
40751 window_list->num_stores += 1;
40752 else if (group == disp_load
40753 || group == disp_prefetch)
40754 window_list->num_loads += 1;
40755 else if (group == disp_load_store)
40756 {
40757 window_list->num_stores += 1;
40758 window_list->num_loads += 1;
40759 }
40760 }
40761
40762 /* Adds a scheduled instruction, INSN, to the current dispatch window.
40763 If the total bytes of instructions or the number of instructions in
40764 the window exceeds the allowed maximum, a new window is allocated. */
40765
40766 static void
40767 add_to_dispatch_window (rtx insn)
40768 {
40769 int byte_len;
40770 dispatch_windows *window_list;
40771 dispatch_windows *next_list;
40772 dispatch_windows *window0_list;
40773 enum insn_path path;
40774 enum dispatch_group insn_group;
40775 bool insn_fits;
40776 int num_insn;
40777 int num_uops;
40778 int window_num;
40779 int insn_num_uops;
40780 int sum;
40781
40782 if (INSN_CODE (insn) < 0)
40783 return;
40784
40785 byte_len = min_insn_size (insn);
40786 window_list = dispatch_window_list;
40787 next_list = window_list->next;
40788 path = get_insn_path (insn);
40789 insn_group = get_insn_group (insn);
40790
40791 /* Get the last dispatch window. */
40792 if (next_list)
40793 window_list = dispatch_window_list->next;
40794
40795 if (path == path_single)
40796 insn_num_uops = 1;
40797 else if (path == path_double)
40798 insn_num_uops = 2;
40799 else
40800 insn_num_uops = (int) path;
40801
40802 /* If the current window is full, get a new window.
40803 Window number zero is full if MAX_INSN uops are scheduled in it.
40804 Window number one is full if window zero's bytes plus window
40805 one's bytes equal 32, or if adding the bytes of the new instruction
40806 makes the total greater than 48, or if it already has MAX_INSN
40807 instructions in it. */
40808 num_insn = window_list->num_insn;
40809 num_uops = window_list->num_uops;
40810 window_num = window_list->window_num;
40811 insn_fits = fits_dispatch_window (insn);
40812
40813 if (num_insn >= MAX_INSN
40814 || num_uops + insn_num_uops > MAX_INSN
40815 || !(insn_fits))
40816 {
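/* The current window cannot accept INSN: switch to the other window
   (0 <-> 1) and continue filling that one. */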
40817 window_num = ~window_num & 1;
40818 window_list = allocate_next_window (window_num);
40819 }
40820
40821 if (window_num == 0)
40822 {
40823 add_insn_window (insn, window_list, insn_num_uops);
40824 if (window_list->num_insn >= MAX_INSN
40825 && insn_group == disp_branch)
40826 {
40827 process_end_window ();
40828 return;
40829 }
40830 }
40831 else if (window_num == 1)
40832 {
40833 window0_list = window_list->prev;
40834 sum = window0_list->window_size + window_list->window_size;
40835 if (sum == 32
40836 || (byte_len + sum) >= 48)
40837 {
40838 process_end_window ();
40839 window_list = dispatch_window_list;
40840 }
40841
40842 add_insn_window (insn, window_list, insn_num_uops);
40843 }
40844 else
40845 gcc_unreachable ();
40846
40847 if (is_end_basic_block (insn_group))
40848 {
40849 /* The end of the basic block is reached; do end-of-basic-block processing. */
40850 process_end_window ();
40851 return;
40852 }
40853 }
40854
40855 /* Print the dispatch window, WINDOW_NUM, to FILE. */
40856
40857 DEBUG_FUNCTION static void
40858 debug_dispatch_window_file (FILE *file, int window_num)
40859 {
40860 dispatch_windows *list;
40861 int i;
40862
40863 if (window_num == 0)
40864 list = dispatch_window_list;
40865 else
40866 list = dispatch_window_list1;
40867
40868 fprintf (file, "Window #%d:\n", list->window_num);
40869 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
40870 list->num_insn, list->num_uops, list->window_size);
40871 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40872 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
40873
40874 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
40875 list->num_stores);
40876 fprintf (file, " insn info:\n");
40877
40878 for (i = 0; i < MAX_INSN; i++)
40879 {
40880 if (!list->window[i].insn)
40881 break;
40882 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
40883 i, group_name[list->window[i].group],
40884 i, (void *)list->window[i].insn,
40885 i, list->window[i].path,
40886 i, list->window[i].byte_len,
40887 i, list->window[i].imm_bytes);
40888 }
40889 }
40890
40891 /* Print to stdout a dispatch window. */
40892
40893 DEBUG_FUNCTION void
40894 debug_dispatch_window (int window_num)
40895 {
40896 debug_dispatch_window_file (stdout, window_num);
40897 }
40898
40899 /* Print INSN dispatch information to FILE. */
40900
40901 DEBUG_FUNCTION static void
40902 debug_insn_dispatch_info_file (FILE *file, rtx insn)
40903 {
40904 int byte_len;
40905 enum insn_path path;
40906 enum dispatch_group group;
40907 int imm_size;
40908 int num_imm_operand;
40909 int num_imm32_operand;
40910 int num_imm64_operand;
40911
40912 if (INSN_CODE (insn) < 0)
40913 return;
40914
40915 byte_len = min_insn_size (insn);
40916 path = get_insn_path (insn);
40917 group = get_insn_group (insn);
40918 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40919 &num_imm64_operand);
40920
40921 fprintf (file, " insn info:\n");
40922 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
40923 group_name[group], path, byte_len);
40924 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40925 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
40926 }
40927
40928 /* Print to STDOUT the status of the ready list with respect to
40929 dispatch windows. */
40930
40931 DEBUG_FUNCTION void
40932 debug_ready_dispatch (void)
40933 {
40934 int i;
40935 int no_ready = number_in_ready ();
40936
40937 fprintf (stdout, "Number of ready: %d\n", no_ready);
40938
40939 for (i = 0; i < no_ready; i++)
40940 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
40941 }
40942
40943 /* This routine is the driver of the dispatch scheduler. */
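/* MODE is either DISPATCH_INIT, which sets up the dispatch windows, or
   ADD_TO_DISPATCH_WINDOW, which accounts for one more scheduled INSN. */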
40944
40945 static void
40946 do_dispatch (rtx insn, int mode)
40947 {
40948 if (mode == DISPATCH_INIT)
40949 init_dispatch_sched ();
40950 else if (mode == ADD_TO_DISPATCH_WINDOW)
40951 add_to_dispatch_window (insn);
40952 }
40953
40954 /* Return TRUE if dispatch scheduling is enabled and query ACTION holds for INSN. */
40955
40956 static bool
40957 has_dispatch (rtx insn, int action)
40958 {
40959 if ((TARGET_BDVER1 || TARGET_BDVER2)
40960 && flag_dispatch_scheduler)
40961 switch (action)
40962 {
40963 default:
40964 return false;
40965
40966 case IS_DISPATCH_ON:
40967 return true;
40968 break;
40969
40970 case IS_CMP:
40971 return is_cmp (insn);
40972
40973 case DISPATCH_VIOLATION:
40974 return dispatch_violation ();
40975
40976 case FITS_DISPATCH_WINDOW:
40977 return fits_dispatch_window (insn);
40978 }
40979
40980 return false;
40981 }
40982
40983 /* Implementation of reassociation_width target hook used by
40984 reassoc phase to identify parallelism level in reassociated
40985 tree. The statement's tree_code is passed in OPC. The type of the
40986 arguments is passed in MODE.
40987
40988 Currently parallel reassociation is enabled for Atom
40989 processors only and we set reassociation width to be 2
40990 because Atom may issue up to 2 instructions per cycle.
40991
40992 The return value should be revisited if parallel reassociation is
40993 enabled for other processors. */
40994
40995 static int
40996 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
40997 enum machine_mode mode)
40998 {
40999 int res = 1;
41000
41001 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
41002 res = 2;
41003 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41004 res = 2;
41005
41006 return res;
41007 }
41008
41009 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41010 place emms and femms instructions. */
41011
41012 static enum machine_mode
41013 ix86_preferred_simd_mode (enum machine_mode mode)
41014 {
41015 if (!TARGET_SSE)
41016 return word_mode;
41017
41018 switch (mode)
41019 {
41020 case QImode:
41021 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41022 case HImode:
41023 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41024 case SImode:
41025 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41026 case DImode:
41027 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41028
41029 case SFmode:
41030 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41031 return V8SFmode;
41032 else
41033 return V4SFmode;
41034
41035 case DFmode:
41036 if (!TARGET_VECTORIZE_DOUBLE)
41037 return word_mode;
41038 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41039 return V4DFmode;
41040 else if (TARGET_SSE2)
41041 return V2DFmode;
41042 /* FALLTHRU */
41043
41044 default:
41045 return word_mode;
41046 }
41047 }
41048
41049 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
41050 vectors. */
41051
41052 static unsigned int
41053 ix86_autovectorize_vector_sizes (void)
41054 {
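/* The result is a bitmask of vector sizes in bytes: 32 | 16 lets the
   vectorizer try 256-bit vectors and fall back to 128-bit ones, while 0
   means only the preferred SIMD mode is tried. */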
41055 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
41056 }
41057
41058 \f
41059
41060 /* Return class of registers which could be used for pseudo of MODE
41061 and of class RCLASS for spilling instead of memory. Return NO_REGS
41062 if it is not possible or not profitable. */
41063 static reg_class_t
41064 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
41065 {
41066 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
41067 && hard_reg_set_subset_p (reg_class_contents[rclass],
41068 reg_class_contents[GENERAL_REGS])
41069 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
41070 return SSE_REGS;
41071 return NO_REGS;
41072 }
41073
41074 /* Implement targetm.vectorize.init_cost. */
41075
41076 static void *
41077 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
41078 {
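/* The cost data is simply an array of three accumulators, one each for
   the prologue, body and epilogue costs (vect_prologue, vect_body,
   vect_epilogue). */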
41079 unsigned *cost = XNEWVEC (unsigned, 3);
41080 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
41081 return cost;
41082 }
41083
41084 /* Implement targetm.vectorize.add_stmt_cost. */
41085
41086 static unsigned
41087 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
41088 struct _stmt_vec_info *stmt_info, int misalign,
41089 enum vect_cost_model_location where)
41090 {
41091 unsigned *cost = (unsigned *) data;
41092 unsigned retval = 0;
41093
41094 if (flag_vect_cost_model)
41095 {
41096 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
41097 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
41098
41099 /* Statements in an inner loop relative to the loop being
41100 vectorized are weighted more heavily. The value here is
41101 arbitrary and could potentially be improved with analysis. */
41102 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
41103 count *= 50; /* FIXME. */
41104
41105 retval = (unsigned) (count * stmt_cost);
41106 cost[where] += retval;
41107 }
41108
41109 return retval;
41110 }
41111
41112 /* Implement targetm.vectorize.finish_cost. */
41113
41114 static void
41115 ix86_finish_cost (void *data, unsigned *prologue_cost,
41116 unsigned *body_cost, unsigned *epilogue_cost)
41117 {
41118 unsigned *cost = (unsigned *) data;
41119 *prologue_cost = cost[vect_prologue];
41120 *body_cost = cost[vect_body];
41121 *epilogue_cost = cost[vect_epilogue];
41122 }
41123
41124 /* Implement targetm.vectorize.destroy_cost_data. */
41125
41126 static void
41127 ix86_destroy_cost_data (void *data)
41128 {
41129 free (data);
41130 }
41131
41132 /* Validate target specific memory model bits in VAL. */
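/* As an illustration (editor's sketch, not part of the original
   source): with HLE enabled, user code such as

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;

   reaches this hook with VAL carrying MEMMODEL_ACQUIRE plus the
   IX86_HLE_ACQUIRE bit and is accepted below, whereas combining
   __ATOMIC_HLE_ACQUIRE with a weaker model such as __ATOMIC_RELAXED
   triggers the corresponding warning. */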
41133
41134 static unsigned HOST_WIDE_INT
41135 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
41136 {
41137 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
41138 unsigned HOST_WIDE_INT strong;
41139
41140 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
41141 |MEMMODEL_MASK)
41142 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
41143 {
41144 warning (OPT_Winvalid_memory_model,
41145 "Unknown architecture specific memory model");
41146 return MEMMODEL_SEQ_CST;
41147 }
41148 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
41149 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
41150 {
41151 warning (OPT_Winvalid_memory_model,
41152 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
41153 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
41154 }
41155 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
41156 {
41157 warning (OPT_Winvalid_memory_model,
41158 "HLE_RELEASE not used with RELEASE or stronger memory model");
41159 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
41160 }
41161 return val;
41162 }
41163
41164 /* Initialize the GCC target structure. */
41165 #undef TARGET_RETURN_IN_MEMORY
41166 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
41167
41168 #undef TARGET_LEGITIMIZE_ADDRESS
41169 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
41170
41171 #undef TARGET_ATTRIBUTE_TABLE
41172 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
41173 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41174 # undef TARGET_MERGE_DECL_ATTRIBUTES
41175 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
41176 #endif
41177
41178 #undef TARGET_COMP_TYPE_ATTRIBUTES
41179 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
41180
41181 #undef TARGET_INIT_BUILTINS
41182 #define TARGET_INIT_BUILTINS ix86_init_builtins
41183 #undef TARGET_BUILTIN_DECL
41184 #define TARGET_BUILTIN_DECL ix86_builtin_decl
41185 #undef TARGET_EXPAND_BUILTIN
41186 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
41187
41188 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
41189 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
41190 ix86_builtin_vectorized_function
41191
41192 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
41193 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
41194
41195 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
41196 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
41197
41198 #undef TARGET_VECTORIZE_BUILTIN_GATHER
41199 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
41200
41201 #undef TARGET_BUILTIN_RECIPROCAL
41202 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
41203
41204 #undef TARGET_ASM_FUNCTION_EPILOGUE
41205 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
41206
41207 #undef TARGET_ENCODE_SECTION_INFO
41208 #ifndef SUBTARGET_ENCODE_SECTION_INFO
41209 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
41210 #else
41211 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
41212 #endif
41213
41214 #undef TARGET_ASM_OPEN_PAREN
41215 #define TARGET_ASM_OPEN_PAREN ""
41216 #undef TARGET_ASM_CLOSE_PAREN
41217 #define TARGET_ASM_CLOSE_PAREN ""
41218
41219 #undef TARGET_ASM_BYTE_OP
41220 #define TARGET_ASM_BYTE_OP ASM_BYTE
41221
41222 #undef TARGET_ASM_ALIGNED_HI_OP
41223 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
41224 #undef TARGET_ASM_ALIGNED_SI_OP
41225 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
41226 #ifdef ASM_QUAD
41227 #undef TARGET_ASM_ALIGNED_DI_OP
41228 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
41229 #endif
41230
41231 #undef TARGET_PROFILE_BEFORE_PROLOGUE
41232 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
41233
41234 #undef TARGET_ASM_UNALIGNED_HI_OP
41235 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
41236 #undef TARGET_ASM_UNALIGNED_SI_OP
41237 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
41238 #undef TARGET_ASM_UNALIGNED_DI_OP
41239 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
41240
41241 #undef TARGET_PRINT_OPERAND
41242 #define TARGET_PRINT_OPERAND ix86_print_operand
41243 #undef TARGET_PRINT_OPERAND_ADDRESS
41244 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
41245 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
41246 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
41247 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
41248 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
41249
41250 #undef TARGET_SCHED_INIT_GLOBAL
41251 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
41252 #undef TARGET_SCHED_ADJUST_COST
41253 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
41254 #undef TARGET_SCHED_ISSUE_RATE
41255 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
41256 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
41257 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
41258 ia32_multipass_dfa_lookahead
41259
41260 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
41261 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
41262
41263 #undef TARGET_MEMMODEL_CHECK
41264 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
41265
41266 #ifdef HAVE_AS_TLS
41267 #undef TARGET_HAVE_TLS
41268 #define TARGET_HAVE_TLS true
41269 #endif
41270 #undef TARGET_CANNOT_FORCE_CONST_MEM
41271 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
41272 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
41273 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
41274
41275 #undef TARGET_DELEGITIMIZE_ADDRESS
41276 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
41277
41278 #undef TARGET_MS_BITFIELD_LAYOUT_P
41279 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
41280
41281 #if TARGET_MACHO
41282 #undef TARGET_BINDS_LOCAL_P
41283 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
41284 #endif
41285 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41286 #undef TARGET_BINDS_LOCAL_P
41287 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
41288 #endif
41289
41290 #undef TARGET_ASM_OUTPUT_MI_THUNK
41291 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
41292 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
41293 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
41294
41295 #undef TARGET_ASM_FILE_START
41296 #define TARGET_ASM_FILE_START x86_file_start
41297
41298 #undef TARGET_OPTION_OVERRIDE
41299 #define TARGET_OPTION_OVERRIDE ix86_option_override
41300
41301 #undef TARGET_REGISTER_MOVE_COST
41302 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
41303 #undef TARGET_MEMORY_MOVE_COST
41304 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
41305 #undef TARGET_RTX_COSTS
41306 #define TARGET_RTX_COSTS ix86_rtx_costs
41307 #undef TARGET_ADDRESS_COST
41308 #define TARGET_ADDRESS_COST ix86_address_cost
41309
41310 #undef TARGET_FIXED_CONDITION_CODE_REGS
41311 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
41312 #undef TARGET_CC_MODES_COMPATIBLE
41313 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
41314
41315 #undef TARGET_MACHINE_DEPENDENT_REORG
41316 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
41317
41318 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
41319 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
41320
41321 #undef TARGET_BUILD_BUILTIN_VA_LIST
41322 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
41323
41324 #undef TARGET_FOLD_BUILTIN
41325 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
41326
41327 #undef TARGET_ENUM_VA_LIST_P
41328 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
41329
41330 #undef TARGET_FN_ABI_VA_LIST
41331 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
41332
41333 #undef TARGET_CANONICAL_VA_LIST_TYPE
41334 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
41335
41336 #undef TARGET_EXPAND_BUILTIN_VA_START
41337 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
41338
41339 #undef TARGET_MD_ASM_CLOBBERS
41340 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
41341
41342 #undef TARGET_PROMOTE_PROTOTYPES
41343 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
41344 #undef TARGET_STRUCT_VALUE_RTX
41345 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
41346 #undef TARGET_SETUP_INCOMING_VARARGS
41347 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
41348 #undef TARGET_MUST_PASS_IN_STACK
41349 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
41350 #undef TARGET_FUNCTION_ARG_ADVANCE
41351 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
41352 #undef TARGET_FUNCTION_ARG
41353 #define TARGET_FUNCTION_ARG ix86_function_arg
41354 #undef TARGET_FUNCTION_ARG_BOUNDARY
41355 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
41356 #undef TARGET_PASS_BY_REFERENCE
41357 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
41358 #undef TARGET_INTERNAL_ARG_POINTER
41359 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
41360 #undef TARGET_UPDATE_STACK_BOUNDARY
41361 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
41362 #undef TARGET_GET_DRAP_RTX
41363 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
41364 #undef TARGET_STRICT_ARGUMENT_NAMING
41365 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
41366 #undef TARGET_STATIC_CHAIN
41367 #define TARGET_STATIC_CHAIN ix86_static_chain
41368 #undef TARGET_TRAMPOLINE_INIT
41369 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
41370 #undef TARGET_RETURN_POPS_ARGS
41371 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
41372
41373 #undef TARGET_LEGITIMATE_COMBINED_INSN
41374 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
41375
41376 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
41377 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
41378
41379 #undef TARGET_SCALAR_MODE_SUPPORTED_P
41380 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
41381
41382 #undef TARGET_VECTOR_MODE_SUPPORTED_P
41383 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
41384
41385 #undef TARGET_C_MODE_FOR_SUFFIX
41386 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
41387
41388 #ifdef HAVE_AS_TLS
41389 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
41390 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
41391 #endif
41392
41393 #ifdef SUBTARGET_INSERT_ATTRIBUTES
41394 #undef TARGET_INSERT_ATTRIBUTES
41395 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
41396 #endif
41397
41398 #undef TARGET_MANGLE_TYPE
41399 #define TARGET_MANGLE_TYPE ix86_mangle_type
41400
41401 #if !TARGET_MACHO
41402 #undef TARGET_STACK_PROTECT_FAIL
41403 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
41404 #endif
41405
41406 #undef TARGET_FUNCTION_VALUE
41407 #define TARGET_FUNCTION_VALUE ix86_function_value
41408
41409 #undef TARGET_FUNCTION_VALUE_REGNO_P
41410 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
41411
41412 #undef TARGET_PROMOTE_FUNCTION_MODE
41413 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
41414
41415 #undef TARGET_MEMBER_TYPE_FORCES_BLK
41416 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
41417
41418 #undef TARGET_SECONDARY_RELOAD
41419 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
41420
41421 #undef TARGET_CLASS_MAX_NREGS
41422 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
41423
41424 #undef TARGET_PREFERRED_RELOAD_CLASS
41425 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
41426 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
41427 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
41428 #undef TARGET_CLASS_LIKELY_SPILLED_P
41429 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
41430
41431 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
41432 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
41433 ix86_builtin_vectorization_cost
41434 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
41435 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
41436 ix86_vectorize_vec_perm_const_ok
41437 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
41438 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
41439 ix86_preferred_simd_mode
41440 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
41441 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
41442 ix86_autovectorize_vector_sizes
41443 #undef TARGET_VECTORIZE_INIT_COST
41444 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
41445 #undef TARGET_VECTORIZE_ADD_STMT_COST
41446 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
41447 #undef TARGET_VECTORIZE_FINISH_COST
41448 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
41449 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
41450 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
41451
41452 #undef TARGET_SET_CURRENT_FUNCTION
41453 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
41454
41455 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
41456 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
41457
41458 #undef TARGET_OPTION_SAVE
41459 #define TARGET_OPTION_SAVE ix86_function_specific_save
41460
41461 #undef TARGET_OPTION_RESTORE
41462 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
41463
41464 #undef TARGET_OPTION_PRINT
41465 #define TARGET_OPTION_PRINT ix86_function_specific_print
41466
41467 #undef TARGET_CAN_INLINE_P
41468 #define TARGET_CAN_INLINE_P ix86_can_inline_p
41469
41470 #undef TARGET_EXPAND_TO_RTL_HOOK
41471 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
41472
41473 #undef TARGET_LEGITIMATE_ADDRESS_P
41474 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
41475
41476 #undef TARGET_LRA_P
41477 #define TARGET_LRA_P ix86_lra_p
41478
41479 #undef TARGET_REGISTER_PRIORITY
41480 #define TARGET_REGISTER_PRIORITY ix86_register_priority
41481
41482 #undef TARGET_LEGITIMATE_CONSTANT_P
41483 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
41484
41485 #undef TARGET_FRAME_POINTER_REQUIRED
41486 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
41487
41488 #undef TARGET_CAN_ELIMINATE
41489 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
41490
41491 #undef TARGET_EXTRA_LIVE_ON_ENTRY
41492 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
41493
41494 #undef TARGET_ASM_CODE_END
41495 #define TARGET_ASM_CODE_END ix86_code_end
41496
41497 #undef TARGET_CONDITIONAL_REGISTER_USAGE
41498 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
41499
41500 #if TARGET_MACHO
41501 #undef TARGET_INIT_LIBFUNCS
41502 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
41503 #endif
41504
41505 #undef TARGET_SPILL_CLASS
41506 #define TARGET_SPILL_CLASS ix86_spill_class
41507
41508 struct gcc_target targetm = TARGET_INITIALIZER;
41509 \f
41510 #include "gt-i386.h"