[gcc.git] / gcc / config / i386 / i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65
66 enum upper_128bits_state
67 {
68 unknown = 0,
69 unused,
70 used
71 };
72
73 typedef struct block_info_def
74 {
75 /* State of the upper 128bits of AVX registers at exit. */
76 enum upper_128bits_state state;
77 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 in this block. */
79 bool unchanged;
80 /* TRUE if block has been processed. */
81 bool processed;
82 /* TRUE if block has been scanned. */
83 bool scanned;
84 /* Previous state of the upper 128bits of AVX registers at entry. */
85 enum upper_128bits_state prev;
86 } *block_info;
87
88 #define BLOCK_INFO(B) ((block_info) (B)->aux)
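/* Explanatory note: the per-block aux pointer read by BLOCK_INFO is
   allocated with alloc_aux_for_blocks (sizeof (struct block_info_def))
   in move_or_delete_vzeroupper below and released again with
   free_aux_for_blocks once the pass is done.  */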
89
90 enum call_avx256_state
91 {
92 /* Callee returns 256bit AVX register. */
93 callee_return_avx256 = -1,
94 /* Callee returns and passes 256bit AVX register. */
95 callee_return_pass_avx256,
96 /* Callee passes 256bit AVX register. */
97 callee_pass_avx256,
98      /* Callee doesn't return nor pass a 256bit AVX register, or no
99 256bit AVX register in function return. */
100 call_no_avx256,
101 /* vzeroupper intrinsic. */
102 vzeroupper_intrinsic
103 };
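/* Explanatory note: one of these values is stored as operand 0 of the
   UNSPECV_VZEROUPPER pattern when vzeroupper is emitted, and is read back
   below in move_or_delete_vzeroupper_2 via INTVAL (XVECEXP (pat, 0, 0))
   to decide whether a pending vzeroupper can be deleted or has to stay.  */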
104
105 /* Check if a 256bit AVX register is referenced in stores. */
106
107 static void
108 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 {
110 if ((REG_P (dest)
111 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
112 || (GET_CODE (set) == SET
113 && REG_P (SET_SRC (set))
114 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 {
116 enum upper_128bits_state *state
117 = (enum upper_128bits_state *) data;
118 *state = used;
119 }
120 }
121
122 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
123 in basic block BB. Delete it if upper 128bit AVX registers are
124 unused. If it isn't deleted, move it to just before a jump insn.
125
126 STATE is state of the upper 128bits of AVX registers at entry. */
127
128 static void
129 move_or_delete_vzeroupper_2 (basic_block bb,
130 enum upper_128bits_state state)
131 {
132 rtx insn, bb_end;
133 rtx vzeroupper_insn = NULL_RTX;
134 rtx pat;
135 int avx256;
136 bool unchanged;
137
138 if (BLOCK_INFO (bb)->unchanged)
139 {
140 if (dump_file)
141 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 bb->index, state);
143
144 BLOCK_INFO (bb)->state = state;
145 return;
146 }
147
148 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 {
150 if (dump_file)
151 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
152 bb->index, BLOCK_INFO (bb)->state);
153 return;
154 }
155
156 BLOCK_INFO (bb)->prev = state;
157
158 if (dump_file)
159 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
160 bb->index, state);
161
162 unchanged = true;
163
164 /* BB_END changes when it is deleted. */
165 bb_end = BB_END (bb);
166 insn = BB_HEAD (bb);
167 while (insn != bb_end)
168 {
169 insn = NEXT_INSN (insn);
170
171 if (!NONDEBUG_INSN_P (insn))
172 continue;
173
174 /* Move vzeroupper before jump/call. */
175 if (JUMP_P (insn) || CALL_P (insn))
176 {
177 if (!vzeroupper_insn)
178 continue;
179
180 if (PREV_INSN (insn) != vzeroupper_insn)
181 {
182 if (dump_file)
183 {
184 fprintf (dump_file, "Move vzeroupper after:\n");
185 print_rtl_single (dump_file, PREV_INSN (insn));
186 fprintf (dump_file, "before:\n");
187 print_rtl_single (dump_file, insn);
188 }
189 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 PREV_INSN (insn));
191 }
192 vzeroupper_insn = NULL_RTX;
193 continue;
194 }
195
196 pat = PATTERN (insn);
197
198 /* Check insn for vzeroupper intrinsic. */
199 if (GET_CODE (pat) == UNSPEC_VOLATILE
200 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
201 {
202 if (dump_file)
203 {
204 /* Found vzeroupper intrinsic. */
205 fprintf (dump_file, "Found vzeroupper:\n");
206 print_rtl_single (dump_file, insn);
207 }
208 }
209 else
210 {
211 /* Check insn for vzeroall intrinsic. */
212 if (GET_CODE (pat) == PARALLEL
213 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
214 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
215 {
216 state = unused;
217 unchanged = false;
218
219 /* Delete pending vzeroupper insertion. */
220 if (vzeroupper_insn)
221 {
222 delete_insn (vzeroupper_insn);
223 vzeroupper_insn = NULL_RTX;
224 }
225 }
226 else if (state != used)
227 {
228 note_stores (pat, check_avx256_stores, &state);
229 if (state == used)
230 unchanged = false;
231 }
232 continue;
233 }
234
235 /* Process vzeroupper intrinsic. */
236 avx256 = INTVAL (XVECEXP (pat, 0, 0));
237
238 if (state == unused)
239 {
240           /* Since the upper 128bits are cleared, the callee must not pass
241              a 256bit AVX register.  We only need to check whether the
242              callee returns a 256bit AVX register.  */
243 if (avx256 == callee_return_avx256)
244 {
245 state = used;
246 unchanged = false;
247 }
248
249 /* Remove unnecessary vzeroupper since upper 128bits are
250 cleared. */
251 if (dump_file)
252 {
253 fprintf (dump_file, "Delete redundant vzeroupper:\n");
254 print_rtl_single (dump_file, insn);
255 }
256 delete_insn (insn);
257 }
258 else
259 {
260           /* Set state to UNUSED if the callee doesn't return a 256bit AVX
261              register.  */
262 if (avx256 != callee_return_pass_avx256)
263 state = unused;
264
265 if (avx256 == callee_return_pass_avx256
266 || avx256 == callee_pass_avx256)
267 {
268               /* Must remove vzeroupper since the callee passes arguments
269                  in 256bit AVX registers.  */
270 if (dump_file)
271 {
272 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
273 print_rtl_single (dump_file, insn);
274 }
275 delete_insn (insn);
276 }
277 else
278 {
279 vzeroupper_insn = insn;
280 unchanged = false;
281 }
282 }
283 }
284
285 BLOCK_INFO (bb)->state = state;
286 BLOCK_INFO (bb)->unchanged = unchanged;
287 BLOCK_INFO (bb)->scanned = true;
288
289 if (dump_file)
290 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
291 bb->index, unchanged ? "unchanged" : "changed",
292 state);
293 }
294
295 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
296 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
297    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
298 state is changed. */
299
300 static bool
301 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
302 {
303 edge e;
304 edge_iterator ei;
305 enum upper_128bits_state state, old_state, new_state;
306 bool seen_unknown;
307
308 if (dump_file)
309 fprintf (dump_file, " Process [bb %i]: status: %d\n",
310 block->index, BLOCK_INFO (block)->processed);
311
312 if (BLOCK_INFO (block)->processed)
313 return false;
314
315 state = unused;
316
317 /* Check all predecessor edges of this block. */
318 seen_unknown = false;
319 FOR_EACH_EDGE (e, ei, block->preds)
320 {
321 if (e->src == block)
322 continue;
323 switch (BLOCK_INFO (e->src)->state)
324 {
325 case unknown:
326 if (!unknown_is_unused)
327 seen_unknown = true;
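	      /* FALLTHRU: an unknown predecessor otherwise counts as unused.  */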
328 case unused:
329 break;
330 case used:
331 state = used;
332 goto done;
333 }
334 }
335
336 if (seen_unknown)
337 state = unknown;
338
339 done:
340 old_state = BLOCK_INFO (block)->state;
341 move_or_delete_vzeroupper_2 (block, state);
342 new_state = BLOCK_INFO (block)->state;
343
344 if (state != unknown || new_state == used)
345 BLOCK_INFO (block)->processed = true;
346
347 /* Need to rescan if the upper 128bits of AVX registers are changed
348 to USED at exit. */
349 if (new_state != old_state)
350 {
351 if (new_state == used)
352 cfun->machine->rescan_vzeroupper_p = 1;
353 return true;
354 }
355 else
356 return false;
357 }
358
359 /* Go through the instruction stream looking for vzeroupper. Delete
360 it if upper 128bit AVX registers are unused. If it isn't deleted,
361 move it to just before a jump insn. */
362
363 static void
364 move_or_delete_vzeroupper (void)
365 {
366 edge e;
367 edge_iterator ei;
368 basic_block bb;
369 fibheap_t worklist, pending, fibheap_swap;
370 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
371 int *bb_order;
372 int *rc_order;
373 int i;
374
375 /* Set up block info for each basic block. */
376 alloc_aux_for_blocks (sizeof (struct block_info_def));
377
378 /* Process outgoing edges of entry point. */
379 if (dump_file)
380 fprintf (dump_file, "Process outgoing edges of entry point\n");
381
382 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 {
384 move_or_delete_vzeroupper_2 (e->dest,
385 cfun->machine->caller_pass_avx256_p
386 ? used : unused);
387 BLOCK_INFO (e->dest)->processed = true;
388 }
389
390 /* Compute reverse completion order of depth first search of the CFG
391 so that the data-flow runs faster. */
392 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
393 bb_order = XNEWVEC (int, last_basic_block);
394 pre_and_rev_post_order_compute (NULL, rc_order, false);
395 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
396 bb_order[rc_order[i]] = i;
397 free (rc_order);
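  /* Explanatory note: bb_order[i] now holds basic block i's position in the
     reverse completion order computed above; it is used as the fibheap key
     below so blocks are pulled from the worklist roughly in that order,
     which lets the iterative data-flow converge in fewer passes.  */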
398
399 worklist = fibheap_new ();
400 pending = fibheap_new ();
401 visited = sbitmap_alloc (last_basic_block);
402 in_worklist = sbitmap_alloc (last_basic_block);
403 in_pending = sbitmap_alloc (last_basic_block);
404 sbitmap_zero (in_worklist);
405
406 /* Don't check outgoing edges of entry point. */
407 sbitmap_ones (in_pending);
408 FOR_EACH_BB (bb)
409 if (BLOCK_INFO (bb)->processed)
410 RESET_BIT (in_pending, bb->index);
411 else
412 {
413 move_or_delete_vzeroupper_1 (bb, false);
414 fibheap_insert (pending, bb_order[bb->index], bb);
415 }
416
417 if (dump_file)
418 fprintf (dump_file, "Check remaining basic blocks\n");
419
420 while (!fibheap_empty (pending))
421 {
422 fibheap_swap = pending;
423 pending = worklist;
424 worklist = fibheap_swap;
425 sbitmap_swap = in_pending;
426 in_pending = in_worklist;
427 in_worklist = sbitmap_swap;
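      /* Explanatory note: the heaps and bitmaps are swapped so the blocks
	 queued as "pending" in the previous round become the current
	 worklist; blocks whose state changes after they were already visited
	 this round are sent back to "pending" for the next round.  */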
428
429 sbitmap_zero (visited);
430
431 cfun->machine->rescan_vzeroupper_p = 0;
432
433 while (!fibheap_empty (worklist))
434 {
435 bb = (basic_block) fibheap_extract_min (worklist);
436 RESET_BIT (in_worklist, bb->index);
437 gcc_assert (!TEST_BIT (visited, bb->index));
438 if (!TEST_BIT (visited, bb->index))
439 {
440 edge_iterator ei;
441
442 SET_BIT (visited, bb->index);
443
444 if (move_or_delete_vzeroupper_1 (bb, false))
445 FOR_EACH_EDGE (e, ei, bb->succs)
446 {
447 if (e->dest == EXIT_BLOCK_PTR
448 || BLOCK_INFO (e->dest)->processed)
449 continue;
450
451 if (TEST_BIT (visited, e->dest->index))
452 {
453 if (!TEST_BIT (in_pending, e->dest->index))
454 {
455 /* Send E->DEST to next round. */
456 SET_BIT (in_pending, e->dest->index);
457 fibheap_insert (pending,
458 bb_order[e->dest->index],
459 e->dest);
460 }
461 }
462 else if (!TEST_BIT (in_worklist, e->dest->index))
463 {
464 /* Add E->DEST to current round. */
465 SET_BIT (in_worklist, e->dest->index);
466 fibheap_insert (worklist, bb_order[e->dest->index],
467 e->dest);
468 }
469 }
470 }
471 }
472
473 if (!cfun->machine->rescan_vzeroupper_p)
474 break;
475 }
476
477 free (bb_order);
478 fibheap_delete (worklist);
479 fibheap_delete (pending);
480 sbitmap_free (visited);
481 sbitmap_free (in_worklist);
482 sbitmap_free (in_pending);
483
484 if (dump_file)
485 fprintf (dump_file, "Process remaining basic blocks\n");
486
487 FOR_EACH_BB (bb)
488 move_or_delete_vzeroupper_1 (bb, true);
489
490 free_aux_for_blocks ();
491 }
492
493 static rtx legitimize_dllimport_symbol (rtx, bool);
494
495 #ifndef CHECK_STACK_LIMIT
496 #define CHECK_STACK_LIMIT (-1)
497 #endif
498
499 /* Return index of given mode in mult and division cost tables. */
500 #define MODE_INDEX(mode) \
501 ((mode) == QImode ? 0 \
502 : (mode) == HImode ? 1 \
503 : (mode) == SImode ? 2 \
504 : (mode) == DImode ? 3 \
505 : 4)
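/* Example (explanatory note): MODE_INDEX maps a scalar integer mode to a row
   of the five-element {QI, HI, SI, DI, other} multiply and divide cost arrays
   in the processor_costs tables below, so MODE_INDEX (SImode) == 2 selects
   the SImode entry and any mode other than QI/HI/SI/DImode falls into the
   "other" slot.  */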
506
507 /* Processor costs (relative to an add) */
508 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
509 #define COSTS_N_BYTES(N) ((N) * 2)
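/* Worked example (explanatory note): under the assumption above that
   COSTS_N_INSNS (N) is (N) * 4 and an add is 2 bytes, COSTS_N_BYTES (2) == 4
   == COSTS_N_INSNS (1), i.e. in the size-cost table below a 2-byte
   instruction is charged the same as a single add.  */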
510
511 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
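/* Explanatory note (inferred from the initializers below): each stringop_algs
   value pairs a default algorithm, used when the block size is unknown, with
   a list of {max_size, algorithm} entries where a max_size of -1 means "no
   upper bound".  For instance {libcall, {{256, rep_prefix_4_byte},
   {-1, libcall}}} uses a rep prefix for blocks up to 256 bytes and a library
   call otherwise, and DUMMY_STRINGOP_ALGS always falls back to a library
   call.  */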
512
513 const
514 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
515 COSTS_N_BYTES (2), /* cost of an add instruction */
516 COSTS_N_BYTES (3), /* cost of a lea instruction */
517 COSTS_N_BYTES (2), /* variable shift costs */
518 COSTS_N_BYTES (3), /* constant shift costs */
519 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
520 COSTS_N_BYTES (3), /* HI */
521 COSTS_N_BYTES (3), /* SI */
522 COSTS_N_BYTES (3), /* DI */
523 COSTS_N_BYTES (5)}, /* other */
524 0, /* cost of multiply per each bit set */
525 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
526 COSTS_N_BYTES (3), /* HI */
527 COSTS_N_BYTES (3), /* SI */
528 COSTS_N_BYTES (3), /* DI */
529 COSTS_N_BYTES (5)}, /* other */
530 COSTS_N_BYTES (3), /* cost of movsx */
531 COSTS_N_BYTES (3), /* cost of movzx */
532 0, /* "large" insn */
533 2, /* MOVE_RATIO */
534 2, /* cost for loading QImode using movzbl */
535 {2, 2, 2}, /* cost of loading integer registers
536 in QImode, HImode and SImode.
537 Relative to reg-reg move (2). */
538 {2, 2, 2}, /* cost of storing integer registers */
539 2, /* cost of reg,reg fld/fst */
540 {2, 2, 2}, /* cost of loading fp registers
541 in SFmode, DFmode and XFmode */
542 {2, 2, 2}, /* cost of storing fp registers
543 in SFmode, DFmode and XFmode */
544 3, /* cost of moving MMX register */
545 {3, 3}, /* cost of loading MMX registers
546 in SImode and DImode */
547 {3, 3}, /* cost of storing MMX registers
548 in SImode and DImode */
549 3, /* cost of moving SSE register */
550 {3, 3, 3}, /* cost of loading SSE registers
551 in SImode, DImode and TImode */
552 {3, 3, 3}, /* cost of storing SSE registers
553 in SImode, DImode and TImode */
554 3, /* MMX or SSE register to integer */
555 0, /* size of l1 cache */
556 0, /* size of l2 cache */
557 0, /* size of prefetch block */
558 0, /* number of parallel prefetches */
559 2, /* Branch cost */
560 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
561 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
562 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
563 COSTS_N_BYTES (2), /* cost of FABS instruction. */
564 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
565 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 1, /* scalar_stmt_cost. */
571 1, /* scalar load_cost. */
572 1, /* scalar_store_cost. */
573 1, /* vec_stmt_cost. */
574 1, /* vec_to_scalar_cost. */
575 1, /* scalar_to_vec_cost. */
576 1, /* vec_align_load_cost. */
577 1, /* vec_unalign_load_cost. */
578 1, /* vec_store_cost. */
579 1, /* cond_taken_branch_cost. */
580 1, /* cond_not_taken_branch_cost. */
581 };
582
583 /* Processor costs (relative to an add) */
584 static const
585 struct processor_costs i386_cost = { /* 386 specific costs */
586 COSTS_N_INSNS (1), /* cost of an add instruction */
587 COSTS_N_INSNS (1), /* cost of a lea instruction */
588 COSTS_N_INSNS (3), /* variable shift costs */
589 COSTS_N_INSNS (2), /* constant shift costs */
590 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
591 COSTS_N_INSNS (6), /* HI */
592 COSTS_N_INSNS (6), /* SI */
593 COSTS_N_INSNS (6), /* DI */
594 COSTS_N_INSNS (6)}, /* other */
595 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
596 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
597 COSTS_N_INSNS (23), /* HI */
598 COSTS_N_INSNS (23), /* SI */
599 COSTS_N_INSNS (23), /* DI */
600 COSTS_N_INSNS (23)}, /* other */
601 COSTS_N_INSNS (3), /* cost of movsx */
602 COSTS_N_INSNS (2), /* cost of movzx */
603 15, /* "large" insn */
604 3, /* MOVE_RATIO */
605 4, /* cost for loading QImode using movzbl */
606 {2, 4, 2}, /* cost of loading integer registers
607 in QImode, HImode and SImode.
608 Relative to reg-reg move (2). */
609 {2, 4, 2}, /* cost of storing integer registers */
610 2, /* cost of reg,reg fld/fst */
611 {8, 8, 8}, /* cost of loading fp registers
612 in SFmode, DFmode and XFmode */
613 {8, 8, 8}, /* cost of storing fp registers
614 in SFmode, DFmode and XFmode */
615 2, /* cost of moving MMX register */
616 {4, 8}, /* cost of loading MMX registers
617 in SImode and DImode */
618 {4, 8}, /* cost of storing MMX registers
619 in SImode and DImode */
620 2, /* cost of moving SSE register */
621 {4, 8, 16}, /* cost of loading SSE registers
622 in SImode, DImode and TImode */
623 {4, 8, 16}, /* cost of storing SSE registers
624 in SImode, DImode and TImode */
625 3, /* MMX or SSE register to integer */
626 0, /* size of l1 cache */
627 0, /* size of l2 cache */
628 0, /* size of prefetch block */
629 0, /* number of parallel prefetches */
630 1, /* Branch cost */
631 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
632 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
633 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
634 COSTS_N_INSNS (22), /* cost of FABS instruction. */
635 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
636 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 1, /* scalar_stmt_cost. */
642 1, /* scalar load_cost. */
643 1, /* scalar_store_cost. */
644 1, /* vec_stmt_cost. */
645 1, /* vec_to_scalar_cost. */
646 1, /* scalar_to_vec_cost. */
647 1, /* vec_align_load_cost. */
648 2, /* vec_unalign_load_cost. */
649 1, /* vec_store_cost. */
650 3, /* cond_taken_branch_cost. */
651 1, /* cond_not_taken_branch_cost. */
652 };
653
654 static const
655 struct processor_costs i486_cost = { /* 486 specific costs */
656 COSTS_N_INSNS (1), /* cost of an add instruction */
657 COSTS_N_INSNS (1), /* cost of a lea instruction */
658 COSTS_N_INSNS (3), /* variable shift costs */
659 COSTS_N_INSNS (2), /* constant shift costs */
660 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
661 COSTS_N_INSNS (12), /* HI */
662 COSTS_N_INSNS (12), /* SI */
663 COSTS_N_INSNS (12), /* DI */
664 COSTS_N_INSNS (12)}, /* other */
665 1, /* cost of multiply per each bit set */
666 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
667 COSTS_N_INSNS (40), /* HI */
668 COSTS_N_INSNS (40), /* SI */
669 COSTS_N_INSNS (40), /* DI */
670 COSTS_N_INSNS (40)}, /* other */
671 COSTS_N_INSNS (3), /* cost of movsx */
672 COSTS_N_INSNS (2), /* cost of movzx */
673 15, /* "large" insn */
674 3, /* MOVE_RATIO */
675 4, /* cost for loading QImode using movzbl */
676 {2, 4, 2}, /* cost of loading integer registers
677 in QImode, HImode and SImode.
678 Relative to reg-reg move (2). */
679 {2, 4, 2}, /* cost of storing integer registers */
680 2, /* cost of reg,reg fld/fst */
681 {8, 8, 8}, /* cost of loading fp registers
682 in SFmode, DFmode and XFmode */
683 {8, 8, 8}, /* cost of storing fp registers
684 in SFmode, DFmode and XFmode */
685 2, /* cost of moving MMX register */
686 {4, 8}, /* cost of loading MMX registers
687 in SImode and DImode */
688 {4, 8}, /* cost of storing MMX registers
689 in SImode and DImode */
690 2, /* cost of moving SSE register */
691 {4, 8, 16}, /* cost of loading SSE registers
692 in SImode, DImode and TImode */
693 {4, 8, 16}, /* cost of storing SSE registers
694 in SImode, DImode and TImode */
695 3, /* MMX or SSE register to integer */
696 4, /* size of l1 cache. 486 has 8kB cache
697 shared for code and data, so 4kB is
698 not really precise. */
699 4, /* size of l2 cache */
700 0, /* size of prefetch block */
701 0, /* number of parallel prefetches */
702 1, /* Branch cost */
703 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
704 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
705 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
706 COSTS_N_INSNS (3), /* cost of FABS instruction. */
707 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
708 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
712 DUMMY_STRINGOP_ALGS},
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
725
726 static const
727 struct processor_costs pentium_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (1), /* cost of a lea instruction */
730 COSTS_N_INSNS (4), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (11), /* HI */
734 COSTS_N_INSNS (11), /* SI */
735 COSTS_N_INSNS (11), /* DI */
736 COSTS_N_INSNS (11)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (25), /* HI */
740 COSTS_N_INSNS (25), /* SI */
741 COSTS_N_INSNS (25), /* DI */
742 COSTS_N_INSNS (25)}, /* other */
743 COSTS_N_INSNS (3), /* cost of movsx */
744 COSTS_N_INSNS (2), /* cost of movzx */
745 8, /* "large" insn */
746 6, /* MOVE_RATIO */
747 6, /* cost for loading QImode using movzbl */
748 {2, 4, 2}, /* cost of loading integer registers
749 in QImode, HImode and SImode.
750 Relative to reg-reg move (2). */
751 {2, 4, 2}, /* cost of storing integer registers */
752 2, /* cost of reg,reg fld/fst */
753 {2, 2, 6}, /* cost of loading fp registers
754 in SFmode, DFmode and XFmode */
755 {4, 4, 6}, /* cost of storing fp registers
756 in SFmode, DFmode and XFmode */
757 8, /* cost of moving MMX register */
758 {8, 8}, /* cost of loading MMX registers
759 in SImode and DImode */
760 {8, 8}, /* cost of storing MMX registers
761 in SImode and DImode */
762 2, /* cost of moving SSE register */
763 {4, 8, 16}, /* cost of loading SSE registers
764 in SImode, DImode and TImode */
765 {4, 8, 16}, /* cost of storing SSE registers
766 in SImode, DImode and TImode */
767 3, /* MMX or SSE register to integer */
768 8, /* size of l1 cache. */
769 8, /* size of l2 cache */
770 0, /* size of prefetch block */
771 0, /* number of parallel prefetches */
772 2, /* Branch cost */
773 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
774 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
775 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
776 COSTS_N_INSNS (1), /* cost of FABS instruction. */
777 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
778 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
779 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
780 DUMMY_STRINGOP_ALGS},
781 {{libcall, {{-1, rep_prefix_4_byte}}},
782 DUMMY_STRINGOP_ALGS},
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 static const
797 struct processor_costs pentiumpro_cost = {
798 COSTS_N_INSNS (1), /* cost of an add instruction */
799 COSTS_N_INSNS (1), /* cost of a lea instruction */
800 COSTS_N_INSNS (1), /* variable shift costs */
801 COSTS_N_INSNS (1), /* constant shift costs */
802 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
803 COSTS_N_INSNS (4), /* HI */
804 COSTS_N_INSNS (4), /* SI */
805 COSTS_N_INSNS (4), /* DI */
806 COSTS_N_INSNS (4)}, /* other */
807 0, /* cost of multiply per each bit set */
808 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
809 COSTS_N_INSNS (17), /* HI */
810 COSTS_N_INSNS (17), /* SI */
811 COSTS_N_INSNS (17), /* DI */
812 COSTS_N_INSNS (17)}, /* other */
813 COSTS_N_INSNS (1), /* cost of movsx */
814 COSTS_N_INSNS (1), /* cost of movzx */
815 8, /* "large" insn */
816 6, /* MOVE_RATIO */
817 2, /* cost for loading QImode using movzbl */
818 {4, 4, 4}, /* cost of loading integer registers
819 in QImode, HImode and SImode.
820 Relative to reg-reg move (2). */
821 {2, 2, 2}, /* cost of storing integer registers */
822 2, /* cost of reg,reg fld/fst */
823 {2, 2, 6}, /* cost of loading fp registers
824 in SFmode, DFmode and XFmode */
825 {4, 4, 6}, /* cost of storing fp registers
826 in SFmode, DFmode and XFmode */
827 2, /* cost of moving MMX register */
828 {2, 2}, /* cost of loading MMX registers
829 in SImode and DImode */
830 {2, 2}, /* cost of storing MMX registers
831 in SImode and DImode */
832 2, /* cost of moving SSE register */
833 {2, 2, 8}, /* cost of loading SSE registers
834 in SImode, DImode and TImode */
835 {2, 2, 8}, /* cost of storing SSE registers
836 in SImode, DImode and TImode */
837 3, /* MMX or SSE register to integer */
838 8, /* size of l1 cache. */
839 256, /* size of l2 cache */
840 32, /* size of prefetch block */
841 6, /* number of parallel prefetches */
842 2, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (2), /* cost of FABS instruction. */
847 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
849 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
850    (we ensure the alignment).  For small blocks an inline loop is still a
851    noticeable win; for bigger blocks either rep movsl or rep movsb is the
852    way to go.  Rep movsb apparently has a more expensive startup time in the
853    CPU, but after 4K the difference is down in the noise.  */
854 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
855 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
856 DUMMY_STRINGOP_ALGS},
857 {{rep_prefix_4_byte, {{1024, unrolled_loop},
858 {8192, rep_prefix_4_byte}, {-1, libcall}}},
859 DUMMY_STRINGOP_ALGS},
860 1, /* scalar_stmt_cost. */
861 1, /* scalar load_cost. */
862 1, /* scalar_store_cost. */
863 1, /* vec_stmt_cost. */
864 1, /* vec_to_scalar_cost. */
865 1, /* scalar_to_vec_cost. */
866 1, /* vec_align_load_cost. */
867 2, /* vec_unalign_load_cost. */
868 1, /* vec_store_cost. */
869 3, /* cond_taken_branch_cost. */
870 1, /* cond_not_taken_branch_cost. */
871 };
872
873 static const
874 struct processor_costs geode_cost = {
875 COSTS_N_INSNS (1), /* cost of an add instruction */
876 COSTS_N_INSNS (1), /* cost of a lea instruction */
877 COSTS_N_INSNS (2), /* variable shift costs */
878 COSTS_N_INSNS (1), /* constant shift costs */
879 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
880 COSTS_N_INSNS (4), /* HI */
881 COSTS_N_INSNS (7), /* SI */
882 COSTS_N_INSNS (7), /* DI */
883 COSTS_N_INSNS (7)}, /* other */
884 0, /* cost of multiply per each bit set */
885 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
886 COSTS_N_INSNS (23), /* HI */
887 COSTS_N_INSNS (39), /* SI */
888 COSTS_N_INSNS (39), /* DI */
889 COSTS_N_INSNS (39)}, /* other */
890 COSTS_N_INSNS (1), /* cost of movsx */
891 COSTS_N_INSNS (1), /* cost of movzx */
892 8, /* "large" insn */
893 4, /* MOVE_RATIO */
894 1, /* cost for loading QImode using movzbl */
895 {1, 1, 1}, /* cost of loading integer registers
896 in QImode, HImode and SImode.
897 Relative to reg-reg move (2). */
898 {1, 1, 1}, /* cost of storing integer registers */
899 1, /* cost of reg,reg fld/fst */
900 {1, 1, 1}, /* cost of loading fp registers
901 in SFmode, DFmode and XFmode */
902 {4, 6, 6}, /* cost of storing fp registers
903 in SFmode, DFmode and XFmode */
904
905 1, /* cost of moving MMX register */
906 {1, 1}, /* cost of loading MMX registers
907 in SImode and DImode */
908 {1, 1}, /* cost of storing MMX registers
909 in SImode and DImode */
910 1, /* cost of moving SSE register */
911 {1, 1, 1}, /* cost of loading SSE registers
912 in SImode, DImode and TImode */
913 {1, 1, 1}, /* cost of storing SSE registers
914 in SImode, DImode and TImode */
915 1, /* MMX or SSE register to integer */
916 64, /* size of l1 cache. */
917 128, /* size of l2 cache. */
918 32, /* size of prefetch block */
919 1, /* number of parallel prefetches */
920 1, /* Branch cost */
921 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
922 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
923 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
924 COSTS_N_INSNS (1), /* cost of FABS instruction. */
925 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
926 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
930 DUMMY_STRINGOP_ALGS},
931 1, /* scalar_stmt_cost. */
932 1, /* scalar load_cost. */
933 1, /* scalar_store_cost. */
934 1, /* vec_stmt_cost. */
935 1, /* vec_to_scalar_cost. */
936 1, /* scalar_to_vec_cost. */
937 1, /* vec_align_load_cost. */
938 2, /* vec_unalign_load_cost. */
939 1, /* vec_store_cost. */
940 3, /* cond_taken_branch_cost. */
941 1, /* cond_not_taken_branch_cost. */
942 };
943
944 static const
945 struct processor_costs k6_cost = {
946 COSTS_N_INSNS (1), /* cost of an add instruction */
947 COSTS_N_INSNS (2), /* cost of a lea instruction */
948 COSTS_N_INSNS (1), /* variable shift costs */
949 COSTS_N_INSNS (1), /* constant shift costs */
950 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
951 COSTS_N_INSNS (3), /* HI */
952 COSTS_N_INSNS (3), /* SI */
953 COSTS_N_INSNS (3), /* DI */
954 COSTS_N_INSNS (3)}, /* other */
955 0, /* cost of multiply per each bit set */
956 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
957 COSTS_N_INSNS (18), /* HI */
958 COSTS_N_INSNS (18), /* SI */
959 COSTS_N_INSNS (18), /* DI */
960 COSTS_N_INSNS (18)}, /* other */
961 COSTS_N_INSNS (2), /* cost of movsx */
962 COSTS_N_INSNS (2), /* cost of movzx */
963 8, /* "large" insn */
964 4, /* MOVE_RATIO */
965 3, /* cost for loading QImode using movzbl */
966 {4, 5, 4}, /* cost of loading integer registers
967 in QImode, HImode and SImode.
968 Relative to reg-reg move (2). */
969 {2, 3, 2}, /* cost of storing integer registers */
970 4, /* cost of reg,reg fld/fst */
971 {6, 6, 6}, /* cost of loading fp registers
972 in SFmode, DFmode and XFmode */
973 {4, 4, 4}, /* cost of storing fp registers
974 in SFmode, DFmode and XFmode */
975 2, /* cost of moving MMX register */
976 {2, 2}, /* cost of loading MMX registers
977 in SImode and DImode */
978 {2, 2}, /* cost of storing MMX registers
979 in SImode and DImode */
980 2, /* cost of moving SSE register */
981 {2, 2, 8}, /* cost of loading SSE registers
982 in SImode, DImode and TImode */
983 {2, 2, 8}, /* cost of storing SSE registers
984 in SImode, DImode and TImode */
985 6, /* MMX or SSE register to integer */
986 32, /* size of l1 cache. */
987 32, /* size of l2 cache. Some models
988 have integrated l2 cache, but
989 optimizing for k6 is not important
990 enough to worry about that. */
991 32, /* size of prefetch block */
992 1, /* number of parallel prefetches */
993 1, /* Branch cost */
994 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
995 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
996 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
997 COSTS_N_INSNS (2), /* cost of FABS instruction. */
998 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
999 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1003 DUMMY_STRINGOP_ALGS},
1004 1, /* scalar_stmt_cost. */
1005 1, /* scalar load_cost. */
1006 1, /* scalar_store_cost. */
1007 1, /* vec_stmt_cost. */
1008 1, /* vec_to_scalar_cost. */
1009 1, /* scalar_to_vec_cost. */
1010 1, /* vec_align_load_cost. */
1011 2, /* vec_unalign_load_cost. */
1012 1, /* vec_store_cost. */
1013 3, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 static const
1018 struct processor_costs athlon_cost = {
1019 COSTS_N_INSNS (1), /* cost of an add instruction */
1020 COSTS_N_INSNS (2), /* cost of a lea instruction */
1021 COSTS_N_INSNS (1), /* variable shift costs */
1022 COSTS_N_INSNS (1), /* constant shift costs */
1023 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1024 COSTS_N_INSNS (5), /* HI */
1025 COSTS_N_INSNS (5), /* SI */
1026 COSTS_N_INSNS (5), /* DI */
1027 COSTS_N_INSNS (5)}, /* other */
1028 0, /* cost of multiply per each bit set */
1029 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1030 COSTS_N_INSNS (26), /* HI */
1031 COSTS_N_INSNS (42), /* SI */
1032 COSTS_N_INSNS (74), /* DI */
1033 COSTS_N_INSNS (74)}, /* other */
1034 COSTS_N_INSNS (1), /* cost of movsx */
1035 COSTS_N_INSNS (1), /* cost of movzx */
1036 8, /* "large" insn */
1037 9, /* MOVE_RATIO */
1038 4, /* cost for loading QImode using movzbl */
1039 {3, 4, 3}, /* cost of loading integer registers
1040 in QImode, HImode and SImode.
1041 Relative to reg-reg move (2). */
1042 {3, 4, 3}, /* cost of storing integer registers */
1043 4, /* cost of reg,reg fld/fst */
1044 {4, 4, 12}, /* cost of loading fp registers
1045 in SFmode, DFmode and XFmode */
1046 {6, 6, 8}, /* cost of storing fp registers
1047 in SFmode, DFmode and XFmode */
1048 2, /* cost of moving MMX register */
1049 {4, 4}, /* cost of loading MMX registers
1050 in SImode and DImode */
1051 {4, 4}, /* cost of storing MMX registers
1052 in SImode and DImode */
1053 2, /* cost of moving SSE register */
1054 {4, 4, 6}, /* cost of loading SSE registers
1055 in SImode, DImode and TImode */
1056 {4, 4, 5}, /* cost of storing SSE registers
1057 in SImode, DImode and TImode */
1058 5, /* MMX or SSE register to integer */
1059 64, /* size of l1 cache. */
1060 256, /* size of l2 cache. */
1061 64, /* size of prefetch block */
1062 6, /* number of parallel prefetches */
1063 5, /* Branch cost */
1064 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1065 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1066 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1067 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1068 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1069 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1070 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1071 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1072 128 bytes for memset. */
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1076 DUMMY_STRINGOP_ALGS},
1077 1, /* scalar_stmt_cost. */
1078 1, /* scalar load_cost. */
1079 1, /* scalar_store_cost. */
1080 1, /* vec_stmt_cost. */
1081 1, /* vec_to_scalar_cost. */
1082 1, /* scalar_to_vec_cost. */
1083 1, /* vec_align_load_cost. */
1084 2, /* vec_unalign_load_cost. */
1085 1, /* vec_store_cost. */
1086 3, /* cond_taken_branch_cost. */
1087 1, /* cond_not_taken_branch_cost. */
1088 };
1089
1090 static const
1091 struct processor_costs k8_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (2), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (3), /* SI */
1099 COSTS_N_INSNS (4), /* DI */
1100 COSTS_N_INSNS (5)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (26), /* HI */
1104 COSTS_N_INSNS (42), /* SI */
1105 COSTS_N_INSNS (74), /* DI */
1106 COSTS_N_INSNS (74)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {3, 4, 3}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {3, 4, 3}, /* cost of storing integer registers */
1116 4, /* cost of reg,reg fld/fst */
1117 {4, 4, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {6, 6, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {3, 3}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 3, 6}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 5}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 5, /* MMX or SSE register to integer */
1132 64, /* size of l1 cache. */
1133 512, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 3, /* Branch cost */
1142 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1149      small blocks it is better to use a loop.  For large blocks, a libcall can
1150      do non-temporal accesses and beat inline code considerably.  */
1151 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1152 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 {{libcall, {{8, loop}, {24, unrolled_loop},
1154 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1155 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1156 4, /* scalar_stmt_cost. */
1157 2, /* scalar load_cost. */
1158 2, /* scalar_store_cost. */
1159 5, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 2, /* vec_align_load_cost. */
1163 3, /* vec_unalign_load_cost. */
1164 3, /* vec_store_cost. */
1165 3, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1168
1169 struct processor_costs amdfam10_cost = {
1170 COSTS_N_INSNS (1), /* cost of an add instruction */
1171 COSTS_N_INSNS (2), /* cost of a lea instruction */
1172 COSTS_N_INSNS (1), /* variable shift costs */
1173 COSTS_N_INSNS (1), /* constant shift costs */
1174 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1175 COSTS_N_INSNS (4), /* HI */
1176 COSTS_N_INSNS (3), /* SI */
1177 COSTS_N_INSNS (4), /* DI */
1178 COSTS_N_INSNS (5)}, /* other */
1179 0, /* cost of multiply per each bit set */
1180 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1181 COSTS_N_INSNS (35), /* HI */
1182 COSTS_N_INSNS (51), /* SI */
1183 COSTS_N_INSNS (83), /* DI */
1184 COSTS_N_INSNS (83)}, /* other */
1185 COSTS_N_INSNS (1), /* cost of movsx */
1186 COSTS_N_INSNS (1), /* cost of movzx */
1187 8, /* "large" insn */
1188 9, /* MOVE_RATIO */
1189 4, /* cost for loading QImode using movzbl */
1190 {3, 4, 3}, /* cost of loading integer registers
1191 in QImode, HImode and SImode.
1192 Relative to reg-reg move (2). */
1193 {3, 4, 3}, /* cost of storing integer registers */
1194 4, /* cost of reg,reg fld/fst */
1195 {4, 4, 12}, /* cost of loading fp registers
1196 in SFmode, DFmode and XFmode */
1197 {6, 6, 8}, /* cost of storing fp registers
1198 in SFmode, DFmode and XFmode */
1199 2, /* cost of moving MMX register */
1200 {3, 3}, /* cost of loading MMX registers
1201 in SImode and DImode */
1202 {4, 4}, /* cost of storing MMX registers
1203 in SImode and DImode */
1204 2, /* cost of moving SSE register */
1205 {4, 4, 3}, /* cost of loading SSE registers
1206 in SImode, DImode and TImode */
1207 {4, 4, 5}, /* cost of storing SSE registers
1208 in SImode, DImode and TImode */
1209 3, /* MMX or SSE register to integer */
1210 /* On K8:
1211 MOVD reg64, xmmreg Double FSTORE 4
1212 MOVD reg32, xmmreg Double FSTORE 4
1213 On AMDFAM10:
1214 MOVD reg64, xmmreg Double FADD 3
1215 1/1 1/1
1216 MOVD reg32, xmmreg Double FADD 3
1217 1/1 1/1 */
1218 64, /* size of l1 cache. */
1219 512, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221 /* New AMD processors never drop prefetches; if they cannot be performed
1222 immediately, they are queued. We set number of simultaneous prefetches
1223 to a large constant to reflect this (it probably is not a good idea not
1224 to limit number of prefetches at all, as their execution also takes some
1225 time). */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234
1235   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1236      very small blocks it is better to use a loop.  For large blocks, a libcall
1237      can do non-temporal accesses and beat inline code considerably.  */
1238 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1239 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1240 {{libcall, {{8, loop}, {24, unrolled_loop},
1241 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1242 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1243 4, /* scalar_stmt_cost. */
1244 2, /* scalar load_cost. */
1245 2, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 2, /* vec_align_load_cost. */
1250 2, /* vec_unalign_load_cost. */
1251 2, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 struct processor_costs bdver1_cost = {
1257 COSTS_N_INSNS (1), /* cost of an add instruction */
1258 COSTS_N_INSNS (1), /* cost of a lea instruction */
1259 COSTS_N_INSNS (1), /* variable shift costs */
1260 COSTS_N_INSNS (1), /* constant shift costs */
1261 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1262 COSTS_N_INSNS (4), /* HI */
1263 COSTS_N_INSNS (4), /* SI */
1264 COSTS_N_INSNS (6), /* DI */
1265 COSTS_N_INSNS (6)}, /* other */
1266 0, /* cost of multiply per each bit set */
1267 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1268 COSTS_N_INSNS (35), /* HI */
1269 COSTS_N_INSNS (51), /* SI */
1270 COSTS_N_INSNS (83), /* DI */
1271 COSTS_N_INSNS (83)}, /* other */
1272 COSTS_N_INSNS (1), /* cost of movsx */
1273 COSTS_N_INSNS (1), /* cost of movzx */
1274 8, /* "large" insn */
1275 9, /* MOVE_RATIO */
1276 4, /* cost for loading QImode using movzbl */
1277 {5, 5, 4}, /* cost of loading integer registers
1278 in QImode, HImode and SImode.
1279 Relative to reg-reg move (2). */
1280 {4, 4, 4}, /* cost of storing integer registers */
1281 2, /* cost of reg,reg fld/fst */
1282 {5, 5, 12}, /* cost of loading fp registers
1283 in SFmode, DFmode and XFmode */
1284 {4, 4, 8}, /* cost of storing fp registers
1285 in SFmode, DFmode and XFmode */
1286 2, /* cost of moving MMX register */
1287 {4, 4}, /* cost of loading MMX registers
1288 in SImode and DImode */
1289 {4, 4}, /* cost of storing MMX registers
1290 in SImode and DImode */
1291 2, /* cost of moving SSE register */
1292 {4, 4, 4}, /* cost of loading SSE registers
1293 in SImode, DImode and TImode */
1294 {4, 4, 4}, /* cost of storing SSE registers
1295 in SImode, DImode and TImode */
1296 2, /* MMX or SSE register to integer */
1297 /* On K8:
1298 MOVD reg64, xmmreg Double FSTORE 4
1299 MOVD reg32, xmmreg Double FSTORE 4
1300 On AMDFAM10:
1301 MOVD reg64, xmmreg Double FADD 3
1302 1/1 1/1
1303 MOVD reg32, xmmreg Double FADD 3
1304 1/1 1/1 */
1305 16, /* size of l1 cache. */
1306 2048, /* size of l2 cache. */
1307 64, /* size of prefetch block */
1308 /* New AMD processors never drop prefetches; if they cannot be performed
1309 immediately, they are queued. We set number of simultaneous prefetches
1310 to a large constant to reflect this (it probably is not a good idea not
1311 to limit number of prefetches at all, as their execution also takes some
1312 time). */
1313 100, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321
1322   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1323      very small blocks it is better to use a loop.  For large blocks, a libcall
1324      can do non-temporal accesses and beat inline code considerably.  */
1325 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1326 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1327 {{libcall, {{8, loop}, {24, unrolled_loop},
1328 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1329 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 2, /* cond_taken_branch_cost. */
1340 1, /* cond_not_taken_branch_cost. */
1341 };
1342
1343 struct processor_costs bdver2_cost = {
1344 COSTS_N_INSNS (1), /* cost of an add instruction */
1345 COSTS_N_INSNS (1), /* cost of a lea instruction */
1346 COSTS_N_INSNS (1), /* variable shift costs */
1347 COSTS_N_INSNS (1), /* constant shift costs */
1348 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1349 COSTS_N_INSNS (4), /* HI */
1350 COSTS_N_INSNS (4), /* SI */
1351 COSTS_N_INSNS (6), /* DI */
1352 COSTS_N_INSNS (6)}, /* other */
1353 0, /* cost of multiply per each bit set */
1354 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1355 COSTS_N_INSNS (35), /* HI */
1356 COSTS_N_INSNS (51), /* SI */
1357 COSTS_N_INSNS (83), /* DI */
1358 COSTS_N_INSNS (83)}, /* other */
1359 COSTS_N_INSNS (1), /* cost of movsx */
1360 COSTS_N_INSNS (1), /* cost of movzx */
1361 8, /* "large" insn */
1362 9, /* MOVE_RATIO */
1363 4, /* cost for loading QImode using movzbl */
1364 {5, 5, 4}, /* cost of loading integer registers
1365 in QImode, HImode and SImode.
1366 Relative to reg-reg move (2). */
1367 {4, 4, 4}, /* cost of storing integer registers */
1368 2, /* cost of reg,reg fld/fst */
1369 {5, 5, 12}, /* cost of loading fp registers
1370 in SFmode, DFmode and XFmode */
1371 {4, 4, 8}, /* cost of storing fp registers
1372 in SFmode, DFmode and XFmode */
1373 2, /* cost of moving MMX register */
1374 {4, 4}, /* cost of loading MMX registers
1375 in SImode and DImode */
1376 {4, 4}, /* cost of storing MMX registers
1377 in SImode and DImode */
1378 2, /* cost of moving SSE register */
1379 {4, 4, 4}, /* cost of loading SSE registers
1380 in SImode, DImode and TImode */
1381 {4, 4, 4}, /* cost of storing SSE registers
1382 in SImode, DImode and TImode */
1383 2, /* MMX or SSE register to integer */
1384 /* On K8:
1385 MOVD reg64, xmmreg Double FSTORE 4
1386 MOVD reg32, xmmreg Double FSTORE 4
1387 On AMDFAM10:
1388 MOVD reg64, xmmreg Double FADD 3
1389 1/1 1/1
1390 MOVD reg32, xmmreg Double FADD 3
1391 1/1 1/1 */
1392 16, /* size of l1 cache. */
1393 2048, /* size of l2 cache. */
1394 64, /* size of prefetch block */
1395 /* New AMD processors never drop prefetches; if they cannot be performed
1396 immediately, they are queued. We set number of simultaneous prefetches
1397 to a large constant to reflect this (it probably is not a good idea not
1398 to limit number of prefetches at all, as their execution also takes some
1399 time). */
1400 100, /* number of parallel prefetches */
1401 2, /* Branch cost */
1402 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1403 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1404 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1405 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1406 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1407 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408
1409   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1410      very small blocks it is better to use a loop.  For large blocks, a libcall
1411      can do non-temporal accesses and beat inline code considerably.  */
1412 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1413 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1414 {{libcall, {{8, loop}, {24, unrolled_loop},
1415 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1416 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 struct processor_costs btver1_cost = {
1431 COSTS_N_INSNS (1), /* cost of an add instruction */
1432 COSTS_N_INSNS (2), /* cost of a lea instruction */
1433 COSTS_N_INSNS (1), /* variable shift costs */
1434 COSTS_N_INSNS (1), /* constant shift costs */
1435 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1436 COSTS_N_INSNS (4), /* HI */
1437 COSTS_N_INSNS (3), /* SI */
1438 COSTS_N_INSNS (4), /* DI */
1439 COSTS_N_INSNS (5)}, /* other */
1440 0, /* cost of multiply per each bit set */
1441 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1442 COSTS_N_INSNS (35), /* HI */
1443 COSTS_N_INSNS (51), /* SI */
1444 COSTS_N_INSNS (83), /* DI */
1445 COSTS_N_INSNS (83)}, /* other */
1446 COSTS_N_INSNS (1), /* cost of movsx */
1447 COSTS_N_INSNS (1), /* cost of movzx */
1448 8, /* "large" insn */
1449 9, /* MOVE_RATIO */
1450 4, /* cost for loading QImode using movzbl */
1451 {3, 4, 3}, /* cost of loading integer registers
1452 in QImode, HImode and SImode.
1453 Relative to reg-reg move (2). */
1454 {3, 4, 3}, /* cost of storing integer registers */
1455 4, /* cost of reg,reg fld/fst */
1456 {4, 4, 12}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode */
1458 {6, 6, 8}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode */
1460 2, /* cost of moving MMX register */
1461 {3, 3}, /* cost of loading MMX registers
1462 in SImode and DImode */
1463 {4, 4}, /* cost of storing MMX registers
1464 in SImode and DImode */
1465 2, /* cost of moving SSE register */
1466 {4, 4, 3}, /* cost of loading SSE registers
1467 in SImode, DImode and TImode */
1468 {4, 4, 5}, /* cost of storing SSE registers
1469 in SImode, DImode and TImode */
1470 3, /* MMX or SSE register to integer */
1471 /* On K8:
1472 MOVD reg64, xmmreg Double FSTORE 4
1473 MOVD reg32, xmmreg Double FSTORE 4
1474 On AMDFAM10:
1475 MOVD reg64, xmmreg Double FADD 3
1476 1/1 1/1
1477 MOVD reg32, xmmreg Double FADD 3
1478 1/1 1/1 */
1479 32, /* size of l1 cache. */
1480 512, /* size of l2 cache. */
1481 64, /* size of prefetch block */
1482 100, /* number of parallel prefetches */
1483 2, /* Branch cost */
1484 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1485 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1486 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1489 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490
1491   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1492      very small blocks it is better to use a loop.  For large blocks, a libcall
1493      can do non-temporal accesses and beat inline code considerably.  */
1494 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1495 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1496 {{libcall, {{8, loop}, {24, unrolled_loop},
1497 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1498 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1499 4, /* scalar_stmt_cost. */
1500 2, /* scalar load_cost. */
1501 2, /* scalar_store_cost. */
1502 6, /* vec_stmt_cost. */
1503 0, /* vec_to_scalar_cost. */
1504 2, /* scalar_to_vec_cost. */
1505 2, /* vec_align_load_cost. */
1506 2, /* vec_unalign_load_cost. */
1507 2, /* vec_store_cost. */
1508 2, /* cond_taken_branch_cost. */
1509 1, /* cond_not_taken_branch_cost. */
1510 };
1511
1512 struct processor_costs btver2_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (2), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (4), /* HI */
1519 COSTS_N_INSNS (3), /* SI */
1520 COSTS_N_INSNS (4), /* DI */
1521 COSTS_N_INSNS (5)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (35), /* HI */
1525 COSTS_N_INSNS (51), /* SI */
1526 COSTS_N_INSNS (83), /* DI */
1527 COSTS_N_INSNS (83)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 8, /* "large" insn */
1531 9, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {3, 4, 3}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {3, 4, 3}, /* cost of storing integer registers */
1537 4, /* cost of reg,reg fld/fst */
1538 {4, 4, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {6, 6, 8}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {3, 3}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {4, 4}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 2, /* cost of moving SSE register */
1548 {4, 4, 3}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {4, 4, 5}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 3, /* MMX or SSE register to integer */
1553 /* On K8:
1554 MOVD reg64, xmmreg Double FSTORE 4
1555 MOVD reg32, xmmreg Double FSTORE 4
1556 On AMDFAM10:
1557 MOVD reg64, xmmreg Double FADD 3
1558 1/1 1/1
1559 MOVD reg32, xmmreg Double FADD 3
1560 1/1 1/1 */
1561 32, /* size of l1 cache. */
1562 2048, /* size of l2 cache. */
1563 64, /* size of prefetch block */
1564 100, /* number of parallel prefetches */
1565 2, /* Branch cost */
1566 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1567 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1568 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1569 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1570 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1571 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1572
1573 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1574 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1575 {{libcall, {{8, loop}, {24, unrolled_loop},
1576 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1577 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1578 4, /* scalar_stmt_cost. */
1579 2, /* scalar load_cost. */
1580 2, /* scalar_store_cost. */
1581 6, /* vec_stmt_cost. */
1582 0, /* vec_to_scalar_cost. */
1583 2, /* scalar_to_vec_cost. */
1584 2, /* vec_align_load_cost. */
1585 2, /* vec_unalign_load_cost. */
1586 2, /* vec_store_cost. */
1587 2, /* cond_taken_branch_cost. */
1588 1, /* cond_not_taken_branch_cost. */
1589 };
1590
1591 static const
1592 struct processor_costs pentium4_cost = {
1593 COSTS_N_INSNS (1), /* cost of an add instruction */
1594 COSTS_N_INSNS (3), /* cost of a lea instruction */
1595 COSTS_N_INSNS (4), /* variable shift costs */
1596 COSTS_N_INSNS (4), /* constant shift costs */
1597 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1598 COSTS_N_INSNS (15), /* HI */
1599 COSTS_N_INSNS (15), /* SI */
1600 COSTS_N_INSNS (15), /* DI */
1601 COSTS_N_INSNS (15)}, /* other */
1602 0, /* cost of multiply per each bit set */
1603 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1604 COSTS_N_INSNS (56), /* HI */
1605 COSTS_N_INSNS (56), /* SI */
1606 COSTS_N_INSNS (56), /* DI */
1607 COSTS_N_INSNS (56)}, /* other */
1608 COSTS_N_INSNS (1), /* cost of movsx */
1609 COSTS_N_INSNS (1), /* cost of movzx */
1610 16, /* "large" insn */
1611 6, /* MOVE_RATIO */
1612 2, /* cost for loading QImode using movzbl */
1613 {4, 5, 4}, /* cost of loading integer registers
1614 in QImode, HImode and SImode.
1615 Relative to reg-reg move (2). */
1616 {2, 3, 2}, /* cost of storing integer registers */
1617 2, /* cost of reg,reg fld/fst */
1618 {2, 2, 6}, /* cost of loading fp registers
1619 in SFmode, DFmode and XFmode */
1620 {4, 4, 6}, /* cost of storing fp registers
1621 in SFmode, DFmode and XFmode */
1622 2, /* cost of moving MMX register */
1623 {2, 2}, /* cost of loading MMX registers
1624 in SImode and DImode */
1625 {2, 2}, /* cost of storing MMX registers
1626 in SImode and DImode */
1627 12, /* cost of moving SSE register */
1628 {12, 12, 12}, /* cost of loading SSE registers
1629 in SImode, DImode and TImode */
1630 {2, 2, 8}, /* cost of storing SSE registers
1631 in SImode, DImode and TImode */
1632 10, /* MMX or SSE register to integer */
1633 8, /* size of l1 cache. */
1634 256, /* size of l2 cache. */
1635 64, /* size of prefetch block */
1636 6, /* number of parallel prefetches */
1637 2, /* Branch cost */
1638 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1639 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1640 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1641 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1642 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1643 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1644 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1645 DUMMY_STRINGOP_ALGS},
1646 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1647 {-1, libcall}}},
1648 DUMMY_STRINGOP_ALGS},
1649 1, /* scalar_stmt_cost. */
1650 1, /* scalar load_cost. */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static const
1663 struct processor_costs nocona_cost = {
1664 COSTS_N_INSNS (1), /* cost of an add instruction */
1665 COSTS_N_INSNS (1), /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (10), /* HI */
1670 COSTS_N_INSNS (10), /* SI */
1671 COSTS_N_INSNS (10), /* DI */
1672 COSTS_N_INSNS (10)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (66), /* HI */
1676 COSTS_N_INSNS (66), /* SI */
1677 COSTS_N_INSNS (66), /* DI */
1678 COSTS_N_INSNS (66)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 16, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 3, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {4, 4, 4}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 6, /* cost of moving MMX register */
1694 {12, 12}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {12, 12}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 6, /* cost of moving SSE register */
1699 {12, 12, 12}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {12, 12, 12}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 8, /* MMX or SSE register to integer */
1704 8, /* size of l1 cache. */
1705 1024, /* size of l2 cache. */
1706 128, /* size of prefetch block */
1707 8, /* number of parallel prefetches */
1708 1, /* Branch cost */
1709 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1710 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1711 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1714 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1715 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1716 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1717 {100000, unrolled_loop}, {-1, libcall}}}},
1718 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1719 {-1, libcall}}},
1720 {libcall, {{24, loop}, {64, unrolled_loop},
1721 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1722 1, /* scalar_stmt_cost. */
1723 1, /* scalar load_cost. */
1724 1, /* scalar_store_cost. */
1725 1, /* vec_stmt_cost. */
1726 1, /* vec_to_scalar_cost. */
1727 1, /* scalar_to_vec_cost. */
1728 1, /* vec_align_load_cost. */
1729 2, /* vec_unalign_load_cost. */
1730 1, /* vec_store_cost. */
1731 3, /* cond_taken_branch_cost. */
1732 1, /* cond_not_taken_branch_cost. */
1733 };
1734
1735 static const
1736 struct processor_costs atom_cost = {
1737 COSTS_N_INSNS (1), /* cost of an add instruction */
1738 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1739 COSTS_N_INSNS (1), /* variable shift costs */
1740 COSTS_N_INSNS (1), /* constant shift costs */
1741 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1742 COSTS_N_INSNS (4), /* HI */
1743 COSTS_N_INSNS (3), /* SI */
1744 COSTS_N_INSNS (4), /* DI */
1745 COSTS_N_INSNS (2)}, /* other */
1746 0, /* cost of multiply per each bit set */
1747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1748 COSTS_N_INSNS (26), /* HI */
1749 COSTS_N_INSNS (42), /* SI */
1750 COSTS_N_INSNS (74), /* DI */
1751 COSTS_N_INSNS (74)}, /* other */
1752 COSTS_N_INSNS (1), /* cost of movsx */
1753 COSTS_N_INSNS (1), /* cost of movzx */
1754 8, /* "large" insn */
1755 17, /* MOVE_RATIO */
1756 4, /* cost for loading QImode using movzbl */
1757 {4, 4, 4}, /* cost of loading integer registers
1758 in QImode, HImode and SImode.
1759 Relative to reg-reg move (2). */
1760 {4, 4, 4}, /* cost of storing integer registers */
1761 4, /* cost of reg,reg fld/fst */
1762 {12, 12, 12}, /* cost of loading fp registers
1763 in SFmode, DFmode and XFmode */
1764 {6, 6, 8}, /* cost of storing fp registers
1765 in SFmode, DFmode and XFmode */
1766 2, /* cost of moving MMX register */
1767 {8, 8}, /* cost of loading MMX registers
1768 in SImode and DImode */
1769 {8, 8}, /* cost of storing MMX registers
1770 in SImode and DImode */
1771 2, /* cost of moving SSE register */
1772 {8, 8, 8}, /* cost of loading SSE registers
1773 in SImode, DImode and TImode */
1774 {8, 8, 8}, /* cost of storing SSE registers
1775 in SImode, DImode and TImode */
1776 5, /* MMX or SSE register to integer */
1777 32, /* size of l1 cache. */
1778 256, /* size of l2 cache. */
1779 64, /* size of prefetch block */
1780 6, /* number of parallel prefetches */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1789 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1790 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 {{libcall, {{8, loop}, {15, unrolled_loop},
1792 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1793 {libcall, {{24, loop}, {32, unrolled_loop},
1794 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1795 1, /* scalar_stmt_cost. */
1796 1, /* scalar load_cost. */
1797 1, /* scalar_store_cost. */
1798 1, /* vec_stmt_cost. */
1799 1, /* vec_to_scalar_cost. */
1800 1, /* scalar_to_vec_cost. */
1801 1, /* vec_align_load_cost. */
1802 2, /* vec_unalign_load_cost. */
1803 1, /* vec_store_cost. */
1804 3, /* cond_taken_branch_cost. */
1805 1, /* cond_not_taken_branch_cost. */
1806 };
1807
1808 /* Generic64 should produce code tuned for Nocona and K8. */
1809 static const
1810 struct processor_costs generic64_cost = {
1811 COSTS_N_INSNS (1), /* cost of an add instruction */
1812 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1813 this cost, however, our current implementation of synth_mult results in
1814 the use of unnecessary temporary registers, causing regressions on several
1815 SPECfp benchmarks. */
1816 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1817 COSTS_N_INSNS (1), /* variable shift costs */
1818 COSTS_N_INSNS (1), /* constant shift costs */
1819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1820 COSTS_N_INSNS (4), /* HI */
1821 COSTS_N_INSNS (3), /* SI */
1822 COSTS_N_INSNS (4), /* DI */
1823 COSTS_N_INSNS (2)}, /* other */
1824 0, /* cost of multiply per each bit set */
1825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1826 COSTS_N_INSNS (26), /* HI */
1827 COSTS_N_INSNS (42), /* SI */
1828 COSTS_N_INSNS (74), /* DI */
1829 COSTS_N_INSNS (74)}, /* other */
1830 COSTS_N_INSNS (1), /* cost of movsx */
1831 COSTS_N_INSNS (1), /* cost of movzx */
1832 8, /* "large" insn */
1833 17, /* MOVE_RATIO */
1834 4, /* cost for loading QImode using movzbl */
1835 {4, 4, 4}, /* cost of loading integer registers
1836 in QImode, HImode and SImode.
1837 Relative to reg-reg move (2). */
1838 {4, 4, 4}, /* cost of storing integer registers */
1839 4, /* cost of reg,reg fld/fst */
1840 {12, 12, 12}, /* cost of loading fp registers
1841 in SFmode, DFmode and XFmode */
1842 {6, 6, 8}, /* cost of storing fp registers
1843 in SFmode, DFmode and XFmode */
1844 2, /* cost of moving MMX register */
1845 {8, 8}, /* cost of loading MMX registers
1846 in SImode and DImode */
1847 {8, 8}, /* cost of storing MMX registers
1848 in SImode and DImode */
1849 2, /* cost of moving SSE register */
1850 {8, 8, 8}, /* cost of loading SSE registers
1851 in SImode, DImode and TImode */
1852 {8, 8, 8}, /* cost of storing SSE registers
1853 in SImode, DImode and TImode */
1854 5, /* MMX or SSE register to integer */
1855 32, /* size of l1 cache. */
1856 512, /* size of l2 cache. */
1857 64, /* size of prefetch block */
1858 6, /* number of parallel prefetches */
1859 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1860 value is increased to the perhaps more appropriate value of 5. */
1861 3, /* Branch cost */
1862 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1863 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1864 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1865 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1866 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1867 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1868 {DUMMY_STRINGOP_ALGS,
1869 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1870 {DUMMY_STRINGOP_ALGS,
1871 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1872 1, /* scalar_stmt_cost. */
1873 1, /* scalar load_cost. */
1874 1, /* scalar_store_cost. */
1875 1, /* vec_stmt_cost. */
1876 1, /* vec_to_scalar_cost. */
1877 1, /* scalar_to_vec_cost. */
1878 1, /* vec_align_load_cost. */
1879 2, /* vec_unalign_load_cost. */
1880 1, /* vec_store_cost. */
1881 3, /* cond_taken_branch_cost. */
1882 1, /* cond_not_taken_branch_cost. */
1883 };
1884
1885 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1886 Athlon and K8. */
1887 static const
1888 struct processor_costs generic32_cost = {
1889 COSTS_N_INSNS (1), /* cost of an add instruction */
1890 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1891 COSTS_N_INSNS (1), /* variable shift costs */
1892 COSTS_N_INSNS (1), /* constant shift costs */
1893 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1894 COSTS_N_INSNS (4), /* HI */
1895 COSTS_N_INSNS (3), /* SI */
1896 COSTS_N_INSNS (4), /* DI */
1897 COSTS_N_INSNS (2)}, /* other */
1898 0, /* cost of multiply per each bit set */
1899 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1900 COSTS_N_INSNS (26), /* HI */
1901 COSTS_N_INSNS (42), /* SI */
1902 COSTS_N_INSNS (74), /* DI */
1903 COSTS_N_INSNS (74)}, /* other */
1904 COSTS_N_INSNS (1), /* cost of movsx */
1905 COSTS_N_INSNS (1), /* cost of movzx */
1906 8, /* "large" insn */
1907 17, /* MOVE_RATIO */
1908 4, /* cost for loading QImode using movzbl */
1909 {4, 4, 4}, /* cost of loading integer registers
1910 in QImode, HImode and SImode.
1911 Relative to reg-reg move (2). */
1912 {4, 4, 4}, /* cost of storing integer registers */
1913 4, /* cost of reg,reg fld/fst */
1914 {12, 12, 12}, /* cost of loading fp registers
1915 in SFmode, DFmode and XFmode */
1916 {6, 6, 8}, /* cost of storing fp registers
1917 in SFmode, DFmode and XFmode */
1918 2, /* cost of moving MMX register */
1919 {8, 8}, /* cost of loading MMX registers
1920 in SImode and DImode */
1921 {8, 8}, /* cost of storing MMX registers
1922 in SImode and DImode */
1923 2, /* cost of moving SSE register */
1924 {8, 8, 8}, /* cost of loading SSE registers
1925 in SImode, DImode and TImode */
1926 {8, 8, 8}, /* cost of storing SSE registers
1927 in SImode, DImode and TImode */
1928 5, /* MMX or SSE register to integer */
1929 32, /* size of l1 cache. */
1930 256, /* size of l2 cache. */
1931 64, /* size of prefetch block */
1932 6, /* number of parallel prefetches */
1933 3, /* Branch cost */
1934 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1935 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1936 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1937 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1938 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1939 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1941 DUMMY_STRINGOP_ALGS},
1942 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1943 DUMMY_STRINGOP_ALGS},
1944 1, /* scalar_stmt_cost. */
1945 1, /* scalar load_cost. */
1946 1, /* scalar_store_cost. */
1947 1, /* vec_stmt_cost. */
1948 1, /* vec_to_scalar_cost. */
1949 1, /* scalar_to_vec_cost. */
1950 1, /* vec_align_load_cost. */
1951 2, /* vec_unalign_load_cost. */
1952 1, /* vec_store_cost. */
1953 3, /* cond_taken_branch_cost. */
1954 1, /* cond_not_taken_branch_cost. */
1955 };
1956
1957 /* Set by -mtune. */
1958 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1959
1960 /* Set by -mtune or -Os. */
1961 const struct processor_costs *ix86_cost = &pentium_cost;
1962
1963 /* Processor feature/optimization bitmasks. */
1964 #define m_386 (1<<PROCESSOR_I386)
1965 #define m_486 (1<<PROCESSOR_I486)
1966 #define m_PENT (1<<PROCESSOR_PENTIUM)
1967 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1968 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1969 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1970 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1971 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1972 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1973 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1974 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1975 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1976 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1977 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1978 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1979 #define m_ATOM (1<<PROCESSOR_ATOM)
1980
1981 #define m_GEODE (1<<PROCESSOR_GEODE)
1982 #define m_K6 (1<<PROCESSOR_K6)
1983 #define m_K6_GEODE (m_K6 | m_GEODE)
1984 #define m_K8 (1<<PROCESSOR_K8)
1985 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1986 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1987 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1988 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1989 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1990 #define m_BDVER (m_BDVER1 | m_BDVER2)
1991 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1992 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1993 #define m_BTVER (m_BTVER1 | m_BTVER2)
1994 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1995
1996 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1997 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1998
1999 /* Generic instruction choice should be common subset of supported CPUs
2000 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2001 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
2002
2003 /* Feature tests against the various tunings. */
2004 unsigned char ix86_tune_features[X86_TUNE_LAST];
2005
2006 /* Feature tests against the various tunings used to create ix86_tune_features
2007 based on the processor mask. */
2008 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2009 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
2010 negatively, so enabling it for Generic64 seems like a good code size
2011 tradeoff. We can't enable it for 32bit generic because it does not
2012 work well with PPro base chips. */
2013 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2014
2015 /* X86_TUNE_PUSH_MEMORY */
2016 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2017
2018 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2019 m_486 | m_PENT,
2020
2021 /* X86_TUNE_UNROLL_STRLEN */
2022 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
2025 on simulation results. But after P4 was made, no performance benefit
2026 was observed with branch hints. They also increase the code size.
2027 As a result, icc never generates branch hints. */
2028 0,
2029
2030 /* X86_TUNE_DOUBLE_WITH_ADD */
2031 ~m_386,
2032
2033 /* X86_TUNE_USE_SAHF */
2034 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
2035
2036 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2037 partial dependencies. */
2038 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2039
2040 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2041 register stalls on the Generic32 compilation setting as well. However,
2042 in the current implementation the partial register stalls are not eliminated
2043 very well - they can be introduced via subregs synthesized by combine
2044 and can happen in caller/callee saving sequences. Because this option
2045 pays back little on PPro based chips and is in conflict with partial reg
2046 dependencies used by Athlon/P4 based chips, it is better to leave it off
2047 for generic32 for now. */
2048 m_PPRO,
2049
2050 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2051 m_CORE2I7 | m_GENERIC,
2052
2053 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
2054 on 16-bit immediate moves into memory on Core2 and Corei7. */
2055 m_CORE2I7 | m_GENERIC,
2056
2057 /* X86_TUNE_USE_HIMODE_FIOP */
2058 m_386 | m_486 | m_K6_GEODE,
2059
2060 /* X86_TUNE_USE_SIMODE_FIOP */
2061 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2062
2063 /* X86_TUNE_USE_MOV0 */
2064 m_K6,
2065
2066 /* X86_TUNE_USE_CLTD */
2067 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2068
2069 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2070 m_PENT4,
2071
2072 /* X86_TUNE_SPLIT_LONG_MOVES */
2073 m_PPRO,
2074
2075 /* X86_TUNE_READ_MODIFY_WRITE */
2076 ~m_PENT,
2077
2078 /* X86_TUNE_READ_MODIFY */
2079 ~(m_PENT | m_PPRO),
2080
2081 /* X86_TUNE_PROMOTE_QIMODE */
2082 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2083
2084 /* X86_TUNE_FAST_PREFIX */
2085 ~(m_386 | m_486 | m_PENT),
2086
2087 /* X86_TUNE_SINGLE_STRINGOP */
2088 m_386 | m_P4_NOCONA,
2089
2090 /* X86_TUNE_QIMODE_MATH */
2091 ~0,
2092
2093 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2094 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2095 might be considered for Generic32 if our scheme for avoiding partial
2096 stalls was more effective. */
2097 ~m_PPRO,
2098
2099 /* X86_TUNE_PROMOTE_QI_REGS */
2100 0,
2101
2102 /* X86_TUNE_PROMOTE_HI_REGS */
2103 m_PPRO,
2104
2105 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2106 over esp addition. */
2107 m_386 | m_486 | m_PENT | m_PPRO,
2108
2109 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2110 over esp addition. */
2111 m_PENT,
2112
2113 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2114 over esp subtraction. */
2115 m_386 | m_486 | m_PENT | m_K6_GEODE,
2116
2117 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2118 over esp subtraction. */
2119 m_PENT | m_K6_GEODE,
2120
2121 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2122 for DFmode copies */
2123 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2124
2125 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2126 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2127
2128 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2129 conflict here in between PPro/Pentium4 based chips that treat 128bit
2130 SSE registers as single units versus K8 based chips that divide SSE
2131 registers into two 64bit halves. This knob promotes all store destinations
2132 to be 128bit to allow register renaming on 128bit SSE units, but usually
2133 results in one extra microop on 64bit SSE units. Experimental results
2134 show that disabling this option on P4 brings over 20% SPECfp regression,
2135 while enabling it on K8 brings roughly 2.4% regression that can be partly
2136 masked by careful scheduling of moves. */
2137 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2138
2139 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2140 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
2141
2142 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2143 m_COREI7 | m_BDVER,
2144
2145 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2146 m_BDVER,
2147
2148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2149 are resolved on SSE register parts instead of whole registers, so we may
2150 maintain just the lower part of scalar values in the proper format, leaving the
2151 upper part undefined. */
2152 m_ATHLON_K8,
2153
2154 /* X86_TUNE_SSE_TYPELESS_STORES */
2155 m_AMD_MULTIPLE,
2156
2157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2158 m_PPRO | m_P4_NOCONA,
2159
2160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2161 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2162
2163 /* X86_TUNE_PROLOGUE_USING_MOVE */
2164 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2165
2166 /* X86_TUNE_EPILOGUE_USING_MOVE */
2167 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2168
2169 /* X86_TUNE_SHIFT1 */
2170 ~m_486,
2171
2172 /* X86_TUNE_USE_FFREEP */
2173 m_AMD_MULTIPLE,
2174
2175 /* X86_TUNE_INTER_UNIT_MOVES */
2176 ~(m_AMD_MULTIPLE | m_GENERIC),
2177
2178 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2179 ~(m_AMDFAM10 | m_BDVER),
2180
2181 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2182 than 4 branch instructions in the 16 byte window. */
2183 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2184
2185 /* X86_TUNE_SCHEDULE */
2186 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2187
2188 /* X86_TUNE_USE_BT */
2189 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2190
2191 /* X86_TUNE_USE_INCDEC */
2192 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2193
2194 /* X86_TUNE_PAD_RETURNS */
2195 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2196
2197 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */
2198 m_ATOM,
2199
2200 /* X86_TUNE_EXT_80387_CONSTANTS */
2201 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2202
2203 /* X86_TUNE_SHORTEN_X87_SSE */
2204 ~m_K8,
2205
2206 /* X86_TUNE_AVOID_VECTOR_DECODE */
2207 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2208
2209 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2210 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2211 ~(m_386 | m_486),
2212
2213 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2214 vector path on AMD machines. */
2215 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2216
2217 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2218 machines. */
2219 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2220
2221 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2222 than a MOV. */
2223 m_PENT,
2224
2225 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2226 but one byte longer. */
2227 m_PENT,
2228
2229 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2230 operand that cannot be represented using a modRM byte. The XOR
2231 replacement is long decoded, so this split helps here as well. */
2232 m_K6,
2233
2234 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2235 from FP to FP. */
2236 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2237
2238 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2239 from integer to FP. */
2240 m_AMDFAM10,
2241
2242 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2243 with a subsequent conditional jump instruction into a single
2244 compare-and-branch uop. */
2245 m_BDVER,
2246
2247 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2248 will impact LEA instruction selection. */
2249 m_ATOM,
2250
2251 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2252 instructions. */
2253 ~m_ATOM,
2254
2255 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2256 at -O3. For the moment, the prefetching seems badly tuned for Intel
2257 chips. */
2258 m_K6_GEODE | m_AMD_MULTIPLE,
2259
2260 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2261 the auto-vectorizer. */
2262 m_BDVER,
2263
2264 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2265 during reassociation of integer computation. */
2266 m_ATOM,
2267
2268 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2269 during reassociation of fp computation. */
2270 m_ATOM,
2271
2272 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2273 regs instead of memory. */
2274 m_COREI7 | m_CORE2I7
2275 };
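/* Illustrative sketch (hypothetical helper, not the code GCC runs; the
   real initialization happens during option override in
   ix86_option_override_internal): each entry above is a bitmask over
   processors, and the feature array is filled by extracting the bit of
   the active -mtune target.  */

static void ATTRIBUTE_UNUSED
sketch_init_tune_features (enum processor_type tune)
{
  unsigned int i;
  unsigned int tune_mask = 1u << tune;

  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
}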
2276
2277 /* Feature tests against the various architecture variations. */
2278 unsigned char ix86_arch_features[X86_ARCH_LAST];
2279
2280 /* Feature tests against the various architecture variations, used to create
2281 ix86_arch_features based on the processor mask. */
2282 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2283 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2284 ~(m_386 | m_486 | m_PENT | m_K6),
2285
2286 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2287 ~m_386,
2288
2289 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2290 ~(m_386 | m_486),
2291
2292 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2293 ~m_386,
2294
2295 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2296 ~m_386,
2297 };
2298
2299 static const unsigned int x86_accumulate_outgoing_args
2300 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2301
2302 static const unsigned int x86_arch_always_fancy_math_387
2303 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2304
2305 static const unsigned int x86_avx256_split_unaligned_load
2306 = m_COREI7 | m_GENERIC;
2307
2308 static const unsigned int x86_avx256_split_unaligned_store
2309 = m_COREI7 | m_BDVER | m_GENERIC;
2310
2311 /* In case the average insn count for single function invocation is
2312 lower than this constant, emit fast (but longer) prologue and
2313 epilogue code. */
2314 #define FAST_PROLOGUE_INSN_COUNT 20
2315
2316 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2317 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2318 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2319 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2320
2321 /* Array of the smallest class containing reg number REGNO, indexed by
2322 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2323
2324 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2325 {
2326 /* ax, dx, cx, bx */
2327 AREG, DREG, CREG, BREG,
2328 /* si, di, bp, sp */
2329 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2330 /* FP registers */
2331 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2332 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2333 /* arg pointer */
2334 NON_Q_REGS,
2335 /* flags, fpsr, fpcr, frame */
2336 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2337 /* SSE registers */
2338 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2339 SSE_REGS, SSE_REGS,
2340 /* MMX registers */
2341 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2342 MMX_REGS, MMX_REGS,
2343 /* REX registers */
2344 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2345 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2346 /* SSE REX registers */
2347 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2348 SSE_REGS, SSE_REGS,
2349 };
2350
2351 /* The "default" register map used in 32bit mode. */
2352
2353 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2354 {
2355 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2356 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2357 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2358 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2359 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2361 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2362 };
2363
2364 /* The "default" register map used in 64bit mode. */
2365
2366 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2367 {
2368 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2369 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2370 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2371 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2372 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2373 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2374 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2375 };
2376
2377 /* Define the register numbers to be used in Dwarf debugging information.
2378 The SVR4 reference port C compiler uses the following register numbers
2379 in its Dwarf output code:
2380 0 for %eax (gcc regno = 0)
2381 1 for %ecx (gcc regno = 2)
2382 2 for %edx (gcc regno = 1)
2383 3 for %ebx (gcc regno = 3)
2384 4 for %esp (gcc regno = 7)
2385 5 for %ebp (gcc regno = 6)
2386 6 for %esi (gcc regno = 4)
2387 7 for %edi (gcc regno = 5)
2388 The following three DWARF register numbers are never generated by
2389 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2390 believes these numbers have these meanings.
2391 8 for %eip (no gcc equivalent)
2392 9 for %eflags (gcc regno = 17)
2393 10 for %trapno (no gcc equivalent)
2394 It is not at all clear how we should number the FP stack registers
2395 for the x86 architecture. If the version of SDB on x86/svr4 were
2396 a bit less brain dead with respect to floating-point then we would
2397 have a precedent to follow with respect to DWARF register numbers
2398 for x86 FP registers, but the SDB on x86/svr4 is so completely
2399 broken with respect to FP registers that it is hardly worth thinking
2400 of it as something to strive for compatibility with.
2401 The version of x86/svr4 SDB I have at the moment does (partially)
2402 seem to believe that DWARF register number 11 is associated with
2403 the x86 register %st(0), but that's about all. Higher DWARF
2404 register numbers don't seem to be associated with anything in
2405 particular, and even for DWARF regno 11, SDB only seems to under-
2406 stand that it should say that a variable lives in %st(0) (when
2407 asked via an `=' command) if we said it was in DWARF regno 11,
2408 but SDB still prints garbage when asked for the value of the
2409 variable in question (via a `/' command).
2410 (Also note that the labels SDB prints for various FP stack regs
2411 when doing an `x' command are all wrong.)
2412 Note that these problems generally don't affect the native SVR4
2413 C compiler because it doesn't allow the use of -O with -g and
2414 because when it is *not* optimizing, it allocates a memory
2415 location for each floating-point variable, and the memory
2416 location is what gets described in the DWARF AT_location
2417 attribute for the variable in question.
2418 Regardless of the severe mental illness of the x86/svr4 SDB, we
2419 do something sensible here and we use the following DWARF
2420 register numbers. Note that these are all stack-top-relative
2421 numbers.
2422 11 for %st(0) (gcc regno = 8)
2423 12 for %st(1) (gcc regno = 9)
2424 13 for %st(2) (gcc regno = 10)
2425 14 for %st(3) (gcc regno = 11)
2426 15 for %st(4) (gcc regno = 12)
2427 16 for %st(5) (gcc regno = 13)
2428 17 for %st(6) (gcc regno = 14)
2429 18 for %st(7) (gcc regno = 15)
2430 */
2431 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2432 {
2433 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2434 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2435 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2436 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2437 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2438 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2439 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2440 };
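/* Minimal usage sketch (hypothetical helper, not part of GCC): translate
   a hard register number into the SVR4 DWARF number documented above.
   For example, %esi (gcc regno 4) maps to DWARF register 6.  */

static int ATTRIBUTE_UNUSED
sketch_svr4_dwarf_regno (unsigned int regno)
{
  gcc_assert (regno < FIRST_PSEUDO_REGISTER);
  return svr4_dbx_register_map[regno];
}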
2441
2442 /* Define parameter passing and return registers. */
2443
2444 static int const x86_64_int_parameter_registers[6] =
2445 {
2446 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2447 };
2448
2449 static int const x86_64_ms_abi_int_parameter_registers[4] =
2450 {
2451 CX_REG, DX_REG, R8_REG, R9_REG
2452 };
2453
2454 static int const x86_64_int_return_registers[4] =
2455 {
2456 AX_REG, DX_REG, DI_REG, SI_REG
2457 };
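/* Worked example as a sketch (hypothetical helper, not part of GCC): for
   a SysV call f (a, b, c) the tables above place A in %rdi, B in %rsi and
   C in %rdx, while the MS ABI uses %rcx, %rdx and %r8 for the same call.
   Returns -1 when the argument would not be passed in an integer register.  */

static int ATTRIBUTE_UNUSED
sketch_nth_int_parm_reg (unsigned int n, bool ms_abi)
{
  if (ms_abi)
    return n < 4 ? x86_64_ms_abi_int_parameter_registers[n] : -1;
  return n < 6 ? x86_64_int_parameter_registers[n] : -1;
}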
2458
2459 /* Define the structure for the machine field in struct function. */
2460
2461 struct GTY(()) stack_local_entry {
2462 unsigned short mode;
2463 unsigned short n;
2464 rtx rtl;
2465 struct stack_local_entry *next;
2466 };
2467
2468 /* Structure describing stack frame layout.
2469 Stack grows downward:
2470
2471 [arguments]
2472 <- ARG_POINTER
2473 saved pc
2474
2475 saved static chain if ix86_static_chain_on_stack
2476
2477 saved frame pointer if frame_pointer_needed
2478 <- HARD_FRAME_POINTER
2479 [saved regs]
2480 <- regs_save_offset
2481 [padding0]
2482
2483 [saved SSE regs]
2484 <- sse_regs_save_offset
2485 [padding1] |
2486 | <- FRAME_POINTER
2487 [va_arg registers] |
2488 |
2489 [frame] |
2490 |
2491 [padding2] | = to_allocate
2492 <- STACK_POINTER
2493 */
2494 struct ix86_frame
2495 {
2496 int nsseregs;
2497 int nregs;
2498 int va_arg_size;
2499 int red_zone_size;
2500 int outgoing_arguments_size;
2501
2502 /* The offsets relative to ARG_POINTER. */
2503 HOST_WIDE_INT frame_pointer_offset;
2504 HOST_WIDE_INT hard_frame_pointer_offset;
2505 HOST_WIDE_INT stack_pointer_offset;
2506 HOST_WIDE_INT hfp_save_offset;
2507 HOST_WIDE_INT reg_save_offset;
2508 HOST_WIDE_INT sse_reg_save_offset;
2509
2510 /* When save_regs_using_mov is set, emit prologue using
2511 move instead of push instructions. */
2512 bool save_regs_using_mov;
2513 };
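/* Minimal sketch (hypothetical helper, not the code GCC runs; the
   prologue expander computes this itself): once the offsets above are
   filled in, the stack space the prologue still has to allocate after the
   register saves is the distance between the SSE save area and the final
   stack pointer position.  */

static HOST_WIDE_INT ATTRIBUTE_UNUSED
sketch_frame_allocation (const struct ix86_frame *frame)
{
  return frame->stack_pointer_offset - frame->sse_reg_save_offset;
}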
2514
2515 /* Which cpu are we scheduling for. */
2516 enum attr_cpu ix86_schedule;
2517
2518 /* Which cpu are we optimizing for. */
2519 enum processor_type ix86_tune;
2520
2521 /* Which instruction set architecture to use. */
2522 enum processor_type ix86_arch;
2523
2524 /* True if processor has SSE prefetch instruction. */
2525 unsigned char x86_prefetch_sse;
2526
2527 /* -mstackrealign option */
2528 static const char ix86_force_align_arg_pointer_string[]
2529 = "force_align_arg_pointer";
2530
2531 static rtx (*ix86_gen_leave) (void);
2532 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2533 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2534 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2535 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2536 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2537 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2538 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2539 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2540 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2541 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2542 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2543
2544 /* Preferred alignment for stack boundary in bits. */
2545 unsigned int ix86_preferred_stack_boundary;
2546
2547 /* Alignment for incoming stack boundary in bits specified at
2548 command line. */
2549 static unsigned int ix86_user_incoming_stack_boundary;
2550
2551 /* Default alignment for incoming stack boundary in bits. */
2552 static unsigned int ix86_default_incoming_stack_boundary;
2553
2554 /* Alignment for incoming stack boundary in bits. */
2555 unsigned int ix86_incoming_stack_boundary;
2556
2557 /* Calling-ABI-specific va_list type nodes. */
2558 static GTY(()) tree sysv_va_list_type_node;
2559 static GTY(()) tree ms_va_list_type_node;
2560
2561 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2562 char internal_label_prefix[16];
2563 int internal_label_prefix_len;
2564
2565 /* Fence to use after loop using movnt. */
2566 tree x86_mfence;
2567
2568 /* Register class used for passing a given 64bit part of the argument.
2569 These represent classes as documented by the psABI, with the exception
2570 of the SSESF and SSEDF classes, which are basically the SSE class, except that
2571 gcc will use an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2572 
2573 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2574 whenever possible (the upper half then contains only padding). */
2575 enum x86_64_reg_class
2576 {
2577 X86_64_NO_CLASS,
2578 X86_64_INTEGER_CLASS,
2579 X86_64_INTEGERSI_CLASS,
2580 X86_64_SSE_CLASS,
2581 X86_64_SSESF_CLASS,
2582 X86_64_SSEDF_CLASS,
2583 X86_64_SSEUP_CLASS,
2584 X86_64_X87_CLASS,
2585 X86_64_X87UP_CLASS,
2586 X86_64_COMPLEX_X87_CLASS,
2587 X86_64_MEMORY_CLASS
2588 };
2589
2590 #define MAX_CLASSES 4
2591
2592 /* Table of constants used by fldpi, fldln2, etc.... */
2593 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2594 static bool ext_80387_constants_init = 0;
2595
2596 \f
2597 static struct machine_function * ix86_init_machine_status (void);
2598 static rtx ix86_function_value (const_tree, const_tree, bool);
2599 static bool ix86_function_value_regno_p (const unsigned int);
2600 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2601 const_tree);
2602 static rtx ix86_static_chain (const_tree, bool);
2603 static int ix86_function_regparm (const_tree, const_tree);
2604 static void ix86_compute_frame_layout (struct ix86_frame *);
2605 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2606 rtx, rtx, int);
2607 static void ix86_add_new_builtins (HOST_WIDE_INT);
2608 static tree ix86_canonical_va_list_type (tree);
2609 static void predict_jump (int);
2610 static unsigned int split_stack_prologue_scratch_regno (void);
2611 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2612
2613 enum ix86_function_specific_strings
2614 {
2615 IX86_FUNCTION_SPECIFIC_ARCH,
2616 IX86_FUNCTION_SPECIFIC_TUNE,
2617 IX86_FUNCTION_SPECIFIC_MAX
2618 };
2619
2620 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2621 const char *, enum fpmath_unit, bool);
2622 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2623 static void ix86_function_specific_save (struct cl_target_option *);
2624 static void ix86_function_specific_restore (struct cl_target_option *);
2625 static void ix86_function_specific_print (FILE *, int,
2626 struct cl_target_option *);
2627 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2628 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2629 struct gcc_options *);
2630 static bool ix86_can_inline_p (tree, tree);
2631 static void ix86_set_current_function (tree);
2632 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2633
2634 static enum calling_abi ix86_function_abi (const_tree);
2635
2636 \f
2637 #ifndef SUBTARGET32_DEFAULT_CPU
2638 #define SUBTARGET32_DEFAULT_CPU "i386"
2639 #endif
2640
2641 /* The svr4 ABI for the i386 says that records and unions are returned
2642 in memory. */
2643 #ifndef DEFAULT_PCC_STRUCT_RETURN
2644 #define DEFAULT_PCC_STRUCT_RETURN 1
2645 #endif
2646
2647 /* Whether -mtune= or -march= were specified */
2648 static int ix86_tune_defaulted;
2649 static int ix86_arch_specified;
2650
2651 /* Vectorization library interface and handlers. */
2652 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2653
2654 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2655 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2656
2657 /* Processor target table, indexed by processor number */
2658 struct ptt
2659 {
2660 const struct processor_costs *cost; /* Processor costs */
2661 const int align_loop; /* Default alignments. */
2662 const int align_loop_max_skip;
2663 const int align_jump;
2664 const int align_jump_max_skip;
2665 const int align_func;
2666 };
2667
2668 static const struct ptt processor_target_table[PROCESSOR_max] =
2669 {
2670 {&i386_cost, 4, 3, 4, 3, 4},
2671 {&i486_cost, 16, 15, 16, 15, 16},
2672 {&pentium_cost, 16, 7, 16, 7, 16},
2673 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2674 {&geode_cost, 0, 0, 0, 0, 0},
2675 {&k6_cost, 32, 7, 32, 7, 32},
2676 {&athlon_cost, 16, 7, 16, 7, 16},
2677 {&pentium4_cost, 0, 0, 0, 0, 0},
2678 {&k8_cost, 16, 7, 16, 7, 16},
2679 {&nocona_cost, 0, 0, 0, 0, 0},
2680 /* Core 2 32-bit. */
2681 {&generic32_cost, 16, 10, 16, 10, 16},
2682 /* Core 2 64-bit. */
2683 {&generic64_cost, 16, 10, 16, 10, 16},
2684 /* Core i7 32-bit. */
2685 {&generic32_cost, 16, 10, 16, 10, 16},
2686 /* Core i7 64-bit. */
2687 {&generic64_cost, 16, 10, 16, 10, 16},
2688 {&generic32_cost, 16, 7, 16, 7, 16},
2689 {&generic64_cost, 16, 10, 16, 10, 16},
2690 {&amdfam10_cost, 32, 24, 32, 7, 32},
2691 {&bdver1_cost, 32, 24, 32, 7, 32},
2692 {&bdver2_cost, 32, 24, 32, 7, 32},
2693 {&btver1_cost, 32, 24, 32, 7, 32},
2694 {&btver2_cost, 32, 24, 32, 7, 32},
2695 {&atom_cost, 16, 15, 16, 7, 16}
2696 };
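/* Illustrative sketch (hypothetical helper; the real selection happens
   during option override): the active -mtune entry of the table above
   supplies both the cost model and the default alignment parameters.  */

static ATTRIBUTE_UNUSED const struct processor_costs *
sketch_costs_for_tune (enum processor_type tune)
{
  return processor_target_table[tune].cost;
}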
2697
2698 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2699 {
2700 "generic",
2701 "i386",
2702 "i486",
2703 "pentium",
2704 "pentium-mmx",
2705 "pentiumpro",
2706 "pentium2",
2707 "pentium3",
2708 "pentium4",
2709 "pentium-m",
2710 "prescott",
2711 "nocona",
2712 "core2",
2713 "corei7",
2714 "atom",
2715 "geode",
2716 "k6",
2717 "k6-2",
2718 "k6-3",
2719 "athlon",
2720 "athlon-4",
2721 "k8",
2722 "amdfam10",
2723 "bdver1",
2724 "bdver2",
2725 "btver1",
2726 "btver2"
2727 };
2728 \f
2729 /* Return true if a red-zone is in use. */
2730
2731 static inline bool
2732 ix86_using_red_zone (void)
2733 {
2734 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 }
2736 \f
2737 /* Return a string that documents the current -m options. The caller is
2738 responsible for freeing the string. */
2739
2740 static char *
2741 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2742 const char *tune, enum fpmath_unit fpmath,
2743 bool add_nl_p)
2744 {
2745 struct ix86_target_opts
2746 {
2747 const char *option; /* option string */
2748 HOST_WIDE_INT mask; /* isa mask options */
2749 };
2750
2751 /* This table is ordered so that options like -msse4.2 that imply
2752 other options are matched before the options they imply. */
2753 static struct ix86_target_opts isa_opts[] =
2754 {
2755 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2756 { "-mfma", OPTION_MASK_ISA_FMA },
2757 { "-mxop", OPTION_MASK_ISA_XOP },
2758 { "-mlwp", OPTION_MASK_ISA_LWP },
2759 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2760 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2761 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2762 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2763 { "-msse3", OPTION_MASK_ISA_SSE3 },
2764 { "-msse2", OPTION_MASK_ISA_SSE2 },
2765 { "-msse", OPTION_MASK_ISA_SSE },
2766 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2767 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2768 { "-mmmx", OPTION_MASK_ISA_MMX },
2769 { "-mabm", OPTION_MASK_ISA_ABM },
2770 { "-mbmi", OPTION_MASK_ISA_BMI },
2771 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2772 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2773 { "-mhle", OPTION_MASK_ISA_HLE },
2774 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2775 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2776 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2777 { "-madx", OPTION_MASK_ISA_ADX },
2778 { "-mtbm", OPTION_MASK_ISA_TBM },
2779 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2780 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2781 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2782 { "-maes", OPTION_MASK_ISA_AES },
2783 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2784 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2785 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2786 { "-mf16c", OPTION_MASK_ISA_F16C },
2787 { "-mrtm", OPTION_MASK_ISA_RTM },
2788 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2789 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2790 };
2791
2792 /* Flag options. */
2793 static struct ix86_target_opts flag_opts[] =
2794 {
2795 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2796 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2797 { "-m80387", MASK_80387 },
2798 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2799 { "-malign-double", MASK_ALIGN_DOUBLE },
2800 { "-mcld", MASK_CLD },
2801 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2802 { "-mieee-fp", MASK_IEEE_FP },
2803 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2804 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2805 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2806 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2807 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2808 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2809 { "-mno-red-zone", MASK_NO_RED_ZONE },
2810 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2811 { "-mrecip", MASK_RECIP },
2812 { "-mrtd", MASK_RTD },
2813 { "-msseregparm", MASK_SSEREGPARM },
2814 { "-mstack-arg-probe", MASK_STACK_PROBE },
2815 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2816 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2817 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2818 { "-mvzeroupper", MASK_VZEROUPPER },
2819 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2820 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2821 { "-mprefer-avx128", MASK_PREFER_AVX128},
2822 };
2823
2824 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2825
2826 char isa_other[40];
2827 char target_other[40];
2828 unsigned num = 0;
2829 unsigned i, j;
2830 char *ret;
2831 char *ptr;
2832 size_t len;
2833 size_t line_len;
2834 size_t sep_len;
2835 const char *abi;
2836
2837 memset (opts, '\0', sizeof (opts));
2838
2839 /* Add -march= option. */
2840 if (arch)
2841 {
2842 opts[num][0] = "-march=";
2843 opts[num++][1] = arch;
2844 }
2845
2846 /* Add -mtune= option. */
2847 if (tune)
2848 {
2849 opts[num][0] = "-mtune=";
2850 opts[num++][1] = tune;
2851 }
2852
2853 /* Add -m32/-m64/-mx32. */
2854 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2855 {
2856 if ((isa & OPTION_MASK_ABI_64) != 0)
2857 abi = "-m64";
2858 else
2859 abi = "-mx32";
2860 isa &= ~ (OPTION_MASK_ISA_64BIT
2861 | OPTION_MASK_ABI_64
2862 | OPTION_MASK_ABI_X32);
2863 }
2864 else
2865 abi = "-m32";
2866 opts[num++][0] = abi;
2867
2868 /* Pick out the options in isa options. */
2869 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2870 {
2871 if ((isa & isa_opts[i].mask) != 0)
2872 {
2873 opts[num++][0] = isa_opts[i].option;
2874 isa &= ~ isa_opts[i].mask;
2875 }
2876 }
2877
2878 if (isa && add_nl_p)
2879 {
2880 opts[num++][0] = isa_other;
2881 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2882 isa);
2883 }
2884
2885 /* Add flag options. */
2886 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2887 {
2888 if ((flags & flag_opts[i].mask) != 0)
2889 {
2890 opts[num++][0] = flag_opts[i].option;
2891 flags &= ~ flag_opts[i].mask;
2892 }
2893 }
2894
2895 if (flags && add_nl_p)
2896 {
2897 opts[num++][0] = target_other;
2898 sprintf (target_other, "(other flags: %#x)", flags);
2899 }
2900
2901 /* Add -fpmath= option. */
2902 if (fpmath)
2903 {
2904 opts[num][0] = "-mfpmath=";
2905 switch ((int) fpmath)
2906 {
2907 case FPMATH_387:
2908 opts[num++][1] = "387";
2909 break;
2910
2911 case FPMATH_SSE:
2912 opts[num++][1] = "sse";
2913 break;
2914
2915 case FPMATH_387 | FPMATH_SSE:
2916 opts[num++][1] = "sse+387";
2917 break;
2918
2919 default:
2920 gcc_unreachable ();
2921 }
2922 }
2923
2924 /* Any options? */
2925 if (num == 0)
2926 return NULL;
2927
2928 gcc_assert (num < ARRAY_SIZE (opts));
2929
2930 /* Size the string. */
2931 len = 0;
2932 sep_len = (add_nl_p) ? 3 : 1;
2933 for (i = 0; i < num; i++)
2934 {
2935 len += sep_len;
2936 for (j = 0; j < 2; j++)
2937 if (opts[i][j])
2938 len += strlen (opts[i][j]);
2939 }
2940
2941 /* Build the string. */
2942 ret = ptr = (char *) xmalloc (len);
2943 line_len = 0;
2944
2945 for (i = 0; i < num; i++)
2946 {
2947 size_t len2[2];
2948
2949 for (j = 0; j < 2; j++)
2950 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2951
2952 if (i != 0)
2953 {
2954 *ptr++ = ' ';
2955 line_len++;
2956
2957 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2958 {
2959 *ptr++ = '\\';
2960 *ptr++ = '\n';
2961 line_len = 0;
2962 }
2963 }
2964
2965 for (j = 0; j < 2; j++)
2966 if (opts[i][j])
2967 {
2968 memcpy (ptr, opts[i][j], len2[j]);
2969 ptr += len2[j];
2970 line_len += len2[j];
2971 }
2972 }
2973
2974 *ptr = '\0';
2975 gcc_assert (ret + len >= ptr);
2976
2977 return ret;
2978 }
2979
2980 /* Return true if profiling code should be emitted before the prologue,
2981 false otherwise. On x86 this is the case only when the -mfentry
2982 ("hotfix" / __fentry__) scheme is in use. */
2983 static bool
2984 ix86_profile_before_prologue (void)
2985 {
2986 return flag_fentry != 0;
2987 }
2988
2989 /* Function that is callable from the debugger to print the current
2990 options. */
2991 void
2992 ix86_debug_options (void)
2993 {
2994 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2995 ix86_arch_string, ix86_tune_string,
2996 ix86_fpmath, true);
2997
2998 if (opts)
2999 {
3000 fprintf (stderr, "%s\n\n", opts);
3001 free (opts);
3002 }
3003 else
3004 fputs ("<no options>\n\n", stderr);
3005
3006 return;
3007 }
3008 \f
3009 /* Override various settings based on options. If MAIN_ARGS_P, the
3010 options are from the command line, otherwise they are from
3011 attributes. */
3012
3013 static void
3014 ix86_option_override_internal (bool main_args_p)
3015 {
3016 int i;
3017 unsigned int ix86_arch_mask, ix86_tune_mask;
3018 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3019 const char *prefix;
3020 const char *suffix;
3021 const char *sw;
3022
3023 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3024 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3025 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3026 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3027 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3028 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3029 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3030 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3031 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3032 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3033 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3034 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3035 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3036 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3037 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3038 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3039 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3040 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3041 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3042 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3043 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3044 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3045 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3046 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3047 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3048 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3049 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3050 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3051 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3052 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3053 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3054 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3055 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3056 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3057 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3058 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3059 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3060 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3061 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3062 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3063
3064 /* if this reaches 64, need to widen struct pta flags below */
3065
3066 static struct pta
3067 {
3068 const char *const name; /* processor name or nickname. */
3069 const enum processor_type processor;
3070 const enum attr_cpu schedule;
3071 const unsigned HOST_WIDE_INT flags;
3072 }
3073 const processor_alias_table[] =
3074 {
3075 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3076 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3077 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3078 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3079 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3080 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3081 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3082 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3083 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3084 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3085 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3086 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3087 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3088 PTA_MMX | PTA_SSE | PTA_FXSR},
3089 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3090 PTA_MMX | PTA_SSE | PTA_FXSR},
3091 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3092 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3093 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3094 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3095 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3096 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3097 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3098 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3099 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3100 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3101 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3102 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3103 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3104 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3105 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
3108 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3109 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3110 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3111 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3112 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3113 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3116 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3117 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3118 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3119 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3120 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3121 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3122 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3123 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3124 | PTA_XSAVEOPT},
3125 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3126 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3127 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3128 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3129 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3130 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3131 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3132 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3133 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3134 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3135 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3136 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3137 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3138 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3139 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3140 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3141 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3142 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3143 {"x86-64", PROCESSOR_K8, CPU_K8,
3144 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3145 {"k8", PROCESSOR_K8, CPU_K8,
3146 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3147 | PTA_SSE2 | PTA_NO_SAHF},
3148 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3149 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3150 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3151 {"opteron", PROCESSOR_K8, CPU_K8,
3152 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3153 | PTA_SSE2 | PTA_NO_SAHF},
3154 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3155 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3156 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3157 {"athlon64", PROCESSOR_K8, CPU_K8,
3158 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3159 | PTA_SSE2 | PTA_NO_SAHF},
3160 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3161 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3162 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3163 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3164 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3165 | PTA_SSE2 | PTA_NO_SAHF},
3166 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3169 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3172 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3173 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3174 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3175 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3176 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3177 | PTA_XSAVEOPT},
3178 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3179 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3180 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3181 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3182 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3183 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3184 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3185 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3186       | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3187 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3188 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
3189 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3190       | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3191 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3192 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3193 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3194
3195 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3196 PTA_HLE /* flags are only used for -march switch. */ },
3197 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3198 PTA_64BIT
3199 | PTA_HLE /* flags are only used for -march switch. */ },
3200 };
3201
3202 /* -mrecip options. */
3203 static struct
3204 {
3205 const char *string; /* option name */
3206 unsigned int mask; /* mask bits to set */
3207 }
3208 const recip_options[] =
3209 {
3210 { "all", RECIP_MASK_ALL },
3211 { "none", RECIP_MASK_NONE },
3212 { "div", RECIP_MASK_DIV },
3213 { "sqrt", RECIP_MASK_SQRT },
3214 { "vec-div", RECIP_MASK_VEC_DIV },
3215 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3216 };
3217
3218 int const pta_size = ARRAY_SIZE (processor_alias_table);
3219
3220 /* Set up prefix/suffix so the error messages refer to either the command
3221 line argument, or the attribute(target). */
3222 if (main_args_p)
3223 {
3224 prefix = "-m";
3225 suffix = "";
3226 sw = "switch";
3227 }
3228 else
3229 {
3230 prefix = "option(\"";
3231 suffix = "\")";
3232 sw = "attribute";
3233 }
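  /* For example, with main_args_p a diagnostic format such as
     "bad value (%s) for %stune=%s %s" renders as
     "bad value (foo) for -mtune= switch", whereas for the target
     attribute it renders as
     "bad value (foo) for option("tune=") attribute".  */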
3234
3235 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3236 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3237 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3238 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3239 #ifdef TARGET_BI_ARCH
3240 else
3241 {
3242 #if TARGET_BI_ARCH == 1
3243 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3244 is on and OPTION_MASK_ABI_X32 is off. We turn off
3245 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3246 -mx32. */
3247 if (TARGET_X32)
3248 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3249 #else
3250 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3251 on and OPTION_MASK_ABI_64 is off. We turn off
3252 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3253 -m64. */
3254 if (TARGET_LP64)
3255 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3256 #endif
3257 }
3258 #endif
3259
3260 if (TARGET_X32)
3261 {
3262 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3263 OPTION_MASK_ABI_64 for TARGET_X32. */
3264 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3265 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3266 }
3267 else if (TARGET_LP64)
3268 {
3269 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3270 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3271 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3272 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3273 }
3274
3275 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3276 SUBTARGET_OVERRIDE_OPTIONS;
3277 #endif
3278
3279 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3280 SUBSUBTARGET_OVERRIDE_OPTIONS;
3281 #endif
3282
3283 /* -fPIC is the default for x86_64. */
3284 if (TARGET_MACHO && TARGET_64BIT)
3285 flag_pic = 2;
3286
3287 /* Need to check -mtune=generic first. */
3288 if (ix86_tune_string)
3289 {
3290 if (!strcmp (ix86_tune_string, "generic")
3291 || !strcmp (ix86_tune_string, "i686")
3292 /* As special support for cross compilers we read -mtune=native
3293 as -mtune=generic. With native compilers we won't see the
3294 -mtune=native, as it was changed by the driver. */
3295 || !strcmp (ix86_tune_string, "native"))
3296 {
3297 if (TARGET_64BIT)
3298 ix86_tune_string = "generic64";
3299 else
3300 ix86_tune_string = "generic32";
3301 }
3302 /* If this call is for setting the option attribute, allow the
3303 generic32/generic64 that was previously set. */
3304 else if (!main_args_p
3305 && (!strcmp (ix86_tune_string, "generic32")
3306 || !strcmp (ix86_tune_string, "generic64")))
3307 ;
3308 else if (!strncmp (ix86_tune_string, "generic", 7))
3309 error ("bad value (%s) for %stune=%s %s",
3310 ix86_tune_string, prefix, suffix, sw);
3311 else if (!strcmp (ix86_tune_string, "x86-64"))
3312 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3313 "%stune=k8%s or %stune=generic%s instead as appropriate",
3314 prefix, suffix, prefix, suffix, prefix, suffix);
3315 }
3316 else
3317 {
3318 if (ix86_arch_string)
3319 ix86_tune_string = ix86_arch_string;
3320 if (!ix86_tune_string)
3321 {
3322 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3323 ix86_tune_defaulted = 1;
3324 }
3325
3326 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3327 need to use a sensible tune option. */
3328 if (!strcmp (ix86_tune_string, "generic")
3329 || !strcmp (ix86_tune_string, "x86-64")
3330 || !strcmp (ix86_tune_string, "i686"))
3331 {
3332 if (TARGET_64BIT)
3333 ix86_tune_string = "generic64";
3334 else
3335 ix86_tune_string = "generic32";
3336 }
3337 }
3338
3339 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3340 {
3341 /* rep; movq isn't available in 32-bit code. */
3342 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3343 ix86_stringop_alg = no_stringop;
3344 }
3345
3346 if (!ix86_arch_string)
3347 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3348 else
3349 ix86_arch_specified = 1;
3350
3351 if (global_options_set.x_ix86_pmode)
3352 {
3353 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3354 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3355 error ("address mode %qs not supported in the %s bit mode",
3356 TARGET_64BIT ? "short" : "long",
3357 TARGET_64BIT ? "64" : "32");
3358 }
3359 else
3360 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
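  /* For example, -m32 together with -maddress-mode=long is rejected above,
     while by default x32 gets SImode pointers (PMODE_SI) and -m64 gets
     DImode pointers (PMODE_DI).  */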
3361
3362 if (!global_options_set.x_ix86_abi)
3363 ix86_abi = DEFAULT_ABI;
3364
3365 if (global_options_set.x_ix86_cmodel)
3366 {
3367 switch (ix86_cmodel)
3368 {
3369 case CM_SMALL:
3370 case CM_SMALL_PIC:
3371 if (flag_pic)
3372 ix86_cmodel = CM_SMALL_PIC;
3373 if (!TARGET_64BIT)
3374 error ("code model %qs not supported in the %s bit mode",
3375 "small", "32");
3376 break;
3377
3378 case CM_MEDIUM:
3379 case CM_MEDIUM_PIC:
3380 if (flag_pic)
3381 ix86_cmodel = CM_MEDIUM_PIC;
3382 if (!TARGET_64BIT)
3383 error ("code model %qs not supported in the %s bit mode",
3384 "medium", "32");
3385 else if (TARGET_X32)
3386 error ("code model %qs not supported in x32 mode",
3387 "medium");
3388 break;
3389
3390 case CM_LARGE:
3391 case CM_LARGE_PIC:
3392 if (flag_pic)
3393 ix86_cmodel = CM_LARGE_PIC;
3394 if (!TARGET_64BIT)
3395 error ("code model %qs not supported in the %s bit mode",
3396 "large", "32");
3397 else if (TARGET_X32)
3398 error ("code model %qs not supported in x32 mode",
3399 "large");
3400 break;
3401
3402 case CM_32:
3403 if (flag_pic)
3404 error ("code model %s does not support PIC mode", "32");
3405 if (TARGET_64BIT)
3406 error ("code model %qs not supported in the %s bit mode",
3407 "32", "64");
3408 break;
3409
3410 case CM_KERNEL:
3411 if (flag_pic)
3412 {
3413 error ("code model %s does not support PIC mode", "kernel");
3414 ix86_cmodel = CM_32;
3415 }
3416 if (!TARGET_64BIT)
3417 error ("code model %qs not supported in the %s bit mode",
3418 "kernel", "32");
3419 break;
3420
3421 default:
3422 gcc_unreachable ();
3423 }
3424 }
3425 else
3426 {
3427 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3428 use of rip-relative addressing. This eliminates fixups that
3429 would otherwise be needed if this object is to be placed in a
3430 DLL, and is essentially just as efficient as direct addressing. */
3431 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3432 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3433 else if (TARGET_64BIT)
3434 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3435 else
3436 ix86_cmodel = CM_32;
3437 }
3438 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3439 {
3440 error ("-masm=intel not supported in this configuration");
3441 ix86_asm_dialect = ASM_ATT;
3442 }
3443 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3444 sorry ("%i-bit mode not compiled in",
3445 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3446
3447 for (i = 0; i < pta_size; i++)
3448 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3449 {
3450 ix86_schedule = processor_alias_table[i].schedule;
3451 ix86_arch = processor_alias_table[i].processor;
3452 /* Default cpu tuning to the architecture. */
3453 ix86_tune = ix86_arch;
3454
3455 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3456 error ("CPU you selected does not support x86-64 "
3457 "instruction set");
3458
3459 if (processor_alias_table[i].flags & PTA_MMX
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3461 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3462 if (processor_alias_table[i].flags & PTA_3DNOW
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3464 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3465 if (processor_alias_table[i].flags & PTA_3DNOW_A
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3467 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3468 if (processor_alias_table[i].flags & PTA_SSE
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3470 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3471 if (processor_alias_table[i].flags & PTA_SSE2
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3473 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3474 if (processor_alias_table[i].flags & PTA_SSE3
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3476 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3477 if (processor_alias_table[i].flags & PTA_SSSE3
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3479 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3480 if (processor_alias_table[i].flags & PTA_SSE4_1
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3482 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3483 if (processor_alias_table[i].flags & PTA_SSE4_2
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3485 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3486 if (processor_alias_table[i].flags & PTA_AVX
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3488 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3489 if (processor_alias_table[i].flags & PTA_AVX2
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3491 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3492 if (processor_alias_table[i].flags & PTA_FMA
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3494 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3495 if (processor_alias_table[i].flags & PTA_SSE4A
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3497 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3498 if (processor_alias_table[i].flags & PTA_FMA4
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3500 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3501 if (processor_alias_table[i].flags & PTA_XOP
3502 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3503 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3504 if (processor_alias_table[i].flags & PTA_LWP
3505 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3506 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3507 if (processor_alias_table[i].flags & PTA_ABM
3508 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3509 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3510 if (processor_alias_table[i].flags & PTA_BMI
3511 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3512 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3513 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3514 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3515 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3516 if (processor_alias_table[i].flags & PTA_TBM
3517 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3518 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3519 if (processor_alias_table[i].flags & PTA_BMI2
3520 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3521 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3522 if (processor_alias_table[i].flags & PTA_CX16
3523 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3524 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3525 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3526 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3527 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3528 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3529 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3530 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3531 if (processor_alias_table[i].flags & PTA_MOVBE
3532 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3533 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3534 if (processor_alias_table[i].flags & PTA_AES
3535 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3536 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3537 if (processor_alias_table[i].flags & PTA_PCLMUL
3538 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3539 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3540 if (processor_alias_table[i].flags & PTA_FSGSBASE
3541 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3542 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3543 if (processor_alias_table[i].flags & PTA_RDRND
3544 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3545 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3546 if (processor_alias_table[i].flags & PTA_F16C
3547 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3548 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3549 if (processor_alias_table[i].flags & PTA_RTM
3550 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3551 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3552 if (processor_alias_table[i].flags & PTA_HLE
3553 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3554 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3555 if (processor_alias_table[i].flags & PTA_PRFCHW
3556 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3557 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3558 if (processor_alias_table[i].flags & PTA_RDSEED
3559 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3560 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3561 if (processor_alias_table[i].flags & PTA_ADX
3562 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3563 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3564 if (processor_alias_table[i].flags & PTA_FXSR
3565 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3566 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3567 if (processor_alias_table[i].flags & PTA_XSAVE
3568 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3569 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3570 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3571 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3572 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3573 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3574 x86_prefetch_sse = true;
3575
3576 break;
3577 }
3578
3579 if (!strcmp (ix86_arch_string, "generic"))
3580 error ("generic CPU can be used only for %stune=%s %s",
3581 prefix, suffix, sw);
3582 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3583 error ("bad value (%s) for %sarch=%s %s",
3584 ix86_arch_string, prefix, suffix, sw);
3585
3586 ix86_arch_mask = 1u << ix86_arch;
3587 for (i = 0; i < X86_ARCH_LAST; ++i)
3588 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3589
3590 for (i = 0; i < pta_size; i++)
3591 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3592 {
3593 ix86_schedule = processor_alias_table[i].schedule;
3594 ix86_tune = processor_alias_table[i].processor;
3595 if (TARGET_64BIT)
3596 {
3597 if (!(processor_alias_table[i].flags & PTA_64BIT))
3598 {
3599 if (ix86_tune_defaulted)
3600 {
3601 ix86_tune_string = "x86-64";
3602 for (i = 0; i < pta_size; i++)
3603 if (! strcmp (ix86_tune_string,
3604 processor_alias_table[i].name))
3605 break;
3606 ix86_schedule = processor_alias_table[i].schedule;
3607 ix86_tune = processor_alias_table[i].processor;
3608 }
3609 else
3610 error ("CPU you selected does not support x86-64 "
3611 "instruction set");
3612 }
3613 }
3614 else
3615 {
3616 /* Adjust tuning when compiling for 32-bit ABI. */
3617 switch (ix86_tune)
3618 {
3619 case PROCESSOR_GENERIC64:
3620 ix86_tune = PROCESSOR_GENERIC32;
3621 ix86_schedule = CPU_PENTIUMPRO;
3622 break;
3623
3624 case PROCESSOR_CORE2_64:
3625 ix86_tune = PROCESSOR_CORE2_32;
3626 break;
3627
3628 case PROCESSOR_COREI7_64:
3629 ix86_tune = PROCESSOR_COREI7_32;
3630 break;
3631
3632 default:
3633 break;
3634 }
3635 }
3636 /* Intel CPUs have always interpreted SSE prefetch instructions as
3637 NOPs; so, we can enable SSE prefetch instructions even when
3638 -mtune (rather than -march) points us to a processor that has them.
3639 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3640 higher processors. */
3641 if (TARGET_CMOV
3642 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3643 x86_prefetch_sse = true;
3644 break;
3645 }
3646
3647 if (ix86_tune_specified && i == pta_size)
3648 error ("bad value (%s) for %stune=%s %s",
3649 ix86_tune_string, prefix, suffix, sw);
3650
3651 ix86_tune_mask = 1u << ix86_tune;
3652 for (i = 0; i < X86_TUNE_LAST; ++i)
3653 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3654
3655 #ifndef USE_IX86_FRAME_POINTER
3656 #define USE_IX86_FRAME_POINTER 0
3657 #endif
3658
3659 #ifndef USE_X86_64_FRAME_POINTER
3660 #define USE_X86_64_FRAME_POINTER 0
3661 #endif
3662
3663 /* Set the default values for switches whose default depends on TARGET_64BIT
3664 in case they weren't overwritten by command line options. */
3665 if (TARGET_64BIT)
3666 {
3667 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3668 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3669 if (flag_asynchronous_unwind_tables == 2)
3670 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3671 if (flag_pcc_struct_return == 2)
3672 flag_pcc_struct_return = 0;
3673 }
3674 else
3675 {
3676 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3677 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3678 if (flag_asynchronous_unwind_tables == 2)
3679 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3680 if (flag_pcc_struct_return == 2)
3681 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3682 }
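  /* For example, a 64-bit compilation at -O1 or above with no explicit
     -f[no-]omit-frame-pointer sets flag_omit_frame_pointer to
     !USE_X86_64_FRAME_POINTER, so frame pointers are omitted unless the
     target configuration overrides that macro.  */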
3683
3684 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3685 if (optimize_size)
3686 ix86_cost = &ix86_size_cost;
3687 else
3688 ix86_cost = ix86_tune_cost;
3689
3690 /* Arrange to set up i386_stack_locals for all functions. */
3691 init_machine_status = ix86_init_machine_status;
3692
3693 /* Validate -mregparm= value. */
3694 if (global_options_set.x_ix86_regparm)
3695 {
3696 if (TARGET_64BIT)
3697 warning (0, "-mregparm is ignored in 64-bit mode");
3698 if (ix86_regparm > REGPARM_MAX)
3699 {
3700 error ("-mregparm=%d is not between 0 and %d",
3701 ix86_regparm, REGPARM_MAX);
3702 ix86_regparm = 0;
3703 }
3704 }
3705 if (TARGET_64BIT)
3706 ix86_regparm = REGPARM_MAX;
3707
3708 /* Default align_* from the processor table. */
3709 if (align_loops == 0)
3710 {
3711 align_loops = processor_target_table[ix86_tune].align_loop;
3712 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3713 }
3714 if (align_jumps == 0)
3715 {
3716 align_jumps = processor_target_table[ix86_tune].align_jump;
3717 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3718 }
3719 if (align_functions == 0)
3720 {
3721 align_functions = processor_target_table[ix86_tune].align_func;
3722 }
3723
3724 /* Provide default for -mbranch-cost= value. */
3725 if (!global_options_set.x_ix86_branch_cost)
3726 ix86_branch_cost = ix86_cost->branch_cost;
3727
3728 if (TARGET_64BIT)
3729 {
3730 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3731
3732 /* Enable by default the SSE and MMX builtins. Do allow the user to
3733 explicitly disable any of these. In particular, disabling SSE and
3734 MMX for kernel code is extremely useful. */
3735 if (!ix86_arch_specified)
3736 ix86_isa_flags
3737 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3738 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3739
3740 if (TARGET_RTD)
3741 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3742 }
3743 else
3744 {
3745 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3746
3747 if (!ix86_arch_specified)
3748 ix86_isa_flags
3749 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3750
3751       /* The i386 ABI does not specify a red zone.  It still makes sense to use
3752 	 it when the programmer takes care to keep the stack from being destroyed.  */
3753 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3754 target_flags |= MASK_NO_RED_ZONE;
3755 }
3756
3757 /* Keep nonleaf frame pointers. */
3758 if (flag_omit_frame_pointer)
3759 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3760 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3761 flag_omit_frame_pointer = 1;
3762
3763 /* If we're doing fast math, we don't care about comparison order
3764 wrt NaNs. This lets us use a shorter comparison sequence. */
3765 if (flag_finite_math_only)
3766 target_flags &= ~MASK_IEEE_FP;
3767
3768 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3769 since the insns won't need emulation. */
3770 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3771 target_flags &= ~MASK_NO_FANCY_MATH_387;
3772
3773 /* Likewise, if the target doesn't have a 387, or we've specified
3774 software floating point, don't use 387 inline intrinsics. */
3775 if (!TARGET_80387)
3776 target_flags |= MASK_NO_FANCY_MATH_387;
3777
3778 /* Turn on MMX builtins for -msse. */
3779 if (TARGET_SSE)
3780 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3781
3782 /* Enable SSE prefetch. */
3783 if (TARGET_SSE || TARGET_PRFCHW)
3784 x86_prefetch_sse = true;
3785
3786 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3787 if (TARGET_SSE4_2 || TARGET_ABM)
3788 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3789
3790 /* Turn on lzcnt instruction for -mabm. */
3791 if (TARGET_ABM)
3792 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3793
3794 /* Validate -mpreferred-stack-boundary= value or default it to
3795 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3796 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3797 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3798 {
3799 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3800 int max = (TARGET_SEH ? 4 : 12);
3801
3802 if (ix86_preferred_stack_boundary_arg < min
3803 || ix86_preferred_stack_boundary_arg > max)
3804 {
3805 if (min == max)
3806 error ("-mpreferred-stack-boundary is not supported "
3807 "for this target");
3808 else
3809 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3810 ix86_preferred_stack_boundary_arg, min, max);
3811 }
3812 else
3813 ix86_preferred_stack_boundary
3814 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3815 }
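  /* Worked example: -mpreferred-stack-boundary=4 gives
     (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. 16-byte stack
     alignment; since the 64-bit SSE minimum above is 4, the ABI-mandated
     16-byte alignment cannot be lowered in that configuration.  */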
3816
3817 /* Set the default value for -mstackrealign. */
3818 if (ix86_force_align_arg_pointer == -1)
3819 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3820
3821 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3822
3823 /* Validate -mincoming-stack-boundary= value or default it to
3824 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3825 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3826 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3827 {
3828 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3829 || ix86_incoming_stack_boundary_arg > 12)
3830 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3831 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3832 else
3833 {
3834 ix86_user_incoming_stack_boundary
3835 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3836 ix86_incoming_stack_boundary
3837 = ix86_user_incoming_stack_boundary;
3838 }
3839 }
3840
3841 /* Accept -msseregparm only if at least SSE support is enabled. */
3842 if (TARGET_SSEREGPARM
3843 && ! TARGET_SSE)
3844 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3845
3846 if (global_options_set.x_ix86_fpmath)
3847 {
3848 if (ix86_fpmath & FPMATH_SSE)
3849 {
3850 if (!TARGET_SSE)
3851 {
3852 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3853 ix86_fpmath = FPMATH_387;
3854 }
3855 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3856 {
3857 warning (0, "387 instruction set disabled, using SSE arithmetics");
3858 ix86_fpmath = FPMATH_SSE;
3859 }
3860 }
3861 }
3862 else
3863 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3864
3865 /* If the i387 is disabled, then do not return values in it. */
3866 if (!TARGET_80387)
3867 target_flags &= ~MASK_FLOAT_RETURNS;
3868
3869   /* Use an external vectorized library when vectorizing intrinsics.  */
3870 if (global_options_set.x_ix86_veclibabi_type)
3871 switch (ix86_veclibabi_type)
3872 {
3873 case ix86_veclibabi_type_svml:
3874 ix86_veclib_handler = ix86_veclibabi_svml;
3875 break;
3876
3877 case ix86_veclibabi_type_acml:
3878 ix86_veclib_handler = ix86_veclibabi_acml;
3879 break;
3880
3881 default:
3882 gcc_unreachable ();
3883 }
3884
3885 if ((!USE_IX86_FRAME_POINTER
3886 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3887 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3888 && !optimize_size)
3889 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3890
3891   /* ??? Unwind info is not correct around the CFG unless either a frame
3892      pointer is present or MASK_ACCUMULATE_OUTGOING_ARGS is set.  Fixing
3893      this requires rewriting unwind info generation to be aware of the
3894      CFG and propagating states around edges.  */
3895 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3896 || flag_exceptions || flag_non_call_exceptions)
3897 && flag_omit_frame_pointer
3898 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3899 {
3900 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3901 warning (0, "unwind tables currently require either a frame pointer "
3902 "or %saccumulate-outgoing-args%s for correctness",
3903 prefix, suffix);
3904 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3905 }
3906
3907 /* If stack probes are required, the space used for large function
3908 arguments on the stack must also be probed, so enable
3909 -maccumulate-outgoing-args so this happens in the prologue. */
3910 if (TARGET_STACK_PROBE
3911 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3912 {
3913 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3914 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3915 "for correctness", prefix, suffix);
3916 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3917 }
3918
3919 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3920 {
3921 char *p;
3922 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3923 p = strchr (internal_label_prefix, 'X');
3924 internal_label_prefix_len = p - internal_label_prefix;
3925 *p = '\0';
3926 }
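  /* On a typical ELF target ASM_GENERATE_INTERNAL_LABEL produces a string
     like "*.LX0" here (the exact spelling is target-dependent), so the
     stored prefix would be "*.L" with internal_label_prefix_len == 3.  */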
3927
3928   /* When the scheduling description is not available, disable the scheduler
3929      pass so it won't slow down the compilation and make x87 code slower.  */
3930 if (!TARGET_SCHEDULE)
3931 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3932
3933 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3934 ix86_tune_cost->simultaneous_prefetches,
3935 global_options.x_param_values,
3936 global_options_set.x_param_values);
3937 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3938 ix86_tune_cost->prefetch_block,
3939 global_options.x_param_values,
3940 global_options_set.x_param_values);
3941 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3942 ix86_tune_cost->l1_cache_size,
3943 global_options.x_param_values,
3944 global_options_set.x_param_values);
3945 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3946 ix86_tune_cost->l2_cache_size,
3947 global_options.x_param_values,
3948 global_options_set.x_param_values);
3949
3950   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3951 if (flag_prefetch_loop_arrays < 0
3952 && HAVE_prefetch
3953 && optimize >= 3
3954 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3955 flag_prefetch_loop_arrays = 1;
3956
3957 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3958 can be optimized to ap = __builtin_next_arg (0). */
3959 if (!TARGET_64BIT && !flag_split_stack)
3960 targetm.expand_builtin_va_start = NULL;
3961
3962 if (TARGET_64BIT)
3963 {
3964 ix86_gen_leave = gen_leave_rex64;
3965 if (Pmode == DImode)
3966 {
3967 ix86_gen_monitor = gen_sse3_monitor64_di;
3968 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3969 ix86_gen_tls_local_dynamic_base_64
3970 = gen_tls_local_dynamic_base_64_di;
3971 }
3972 else
3973 {
3974 ix86_gen_monitor = gen_sse3_monitor64_si;
3975 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3976 ix86_gen_tls_local_dynamic_base_64
3977 = gen_tls_local_dynamic_base_64_si;
3978 }
3979 }
3980 else
3981 {
3982 ix86_gen_leave = gen_leave;
3983 ix86_gen_monitor = gen_sse3_monitor;
3984 }
3985
3986 if (Pmode == DImode)
3987 {
3988 ix86_gen_add3 = gen_adddi3;
3989 ix86_gen_sub3 = gen_subdi3;
3990 ix86_gen_sub3_carry = gen_subdi3_carry;
3991 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3992 ix86_gen_andsp = gen_anddi3;
3993 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3994 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3995 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3996 }
3997 else
3998 {
3999 ix86_gen_add3 = gen_addsi3;
4000 ix86_gen_sub3 = gen_subsi3;
4001 ix86_gen_sub3_carry = gen_subsi3_carry;
4002 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4003 ix86_gen_andsp = gen_andsi3;
4004 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4005 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4006 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4007 }
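  /* Note that Pmode, not TARGET_64BIT, drives this selection: under x32
     with the default short address mode Pmode is SImode, so the SImode
     generators (gen_addsi3 etc.) are used even on a 64-bit target.  */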
4008
4009 #ifdef USE_IX86_CLD
4010 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4011 if (!TARGET_64BIT)
4012 target_flags |= MASK_CLD & ~target_flags_explicit;
4013 #endif
4014
4015 if (!TARGET_64BIT && flag_pic)
4016 {
4017 if (flag_fentry > 0)
4018 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4019 "with -fpic");
4020 flag_fentry = 0;
4021 }
4022 else if (TARGET_SEH)
4023 {
4024 if (flag_fentry == 0)
4025 sorry ("-mno-fentry isn%'t compatible with SEH");
4026 flag_fentry = 1;
4027 }
4028 else if (flag_fentry < 0)
4029 {
4030 #if defined(PROFILE_BEFORE_PROLOGUE)
4031 flag_fentry = 1;
4032 #else
4033 flag_fentry = 0;
4034 #endif
4035 }
4036
4037 if (TARGET_AVX)
4038 {
4039       /* When not optimizing for size, enable the vzeroupper optimization for
4040 	 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4041 	 AVX unaligned loads/stores.  */
4042 if (!optimize_size)
4043 {
4044 if (flag_expensive_optimizations
4045 && !(target_flags_explicit & MASK_VZEROUPPER))
4046 target_flags |= MASK_VZEROUPPER;
4047 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4048 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4049 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4050 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4051 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4052 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4053 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4054 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4055 target_flags |= MASK_PREFER_AVX128;
4056 }
4057 }
4058 else
4059 {
4060 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4061 target_flags &= ~MASK_VZEROUPPER;
4062 }
4063
4064 if (ix86_recip_name)
4065 {
4066 char *p = ASTRDUP (ix86_recip_name);
4067 char *q;
4068 unsigned int mask, i;
4069 bool invert;
4070
4071 while ((q = strtok (p, ",")) != NULL)
4072 {
4073 p = NULL;
4074 if (*q == '!')
4075 {
4076 invert = true;
4077 q++;
4078 }
4079 else
4080 invert = false;
4081
4082 if (!strcmp (q, "default"))
4083 mask = RECIP_MASK_ALL;
4084 else
4085 {
4086 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4087 if (!strcmp (q, recip_options[i].string))
4088 {
4089 mask = recip_options[i].mask;
4090 break;
4091 }
4092
4093 if (i == ARRAY_SIZE (recip_options))
4094 {
4095 error ("unknown option for -mrecip=%s", q);
4096 invert = false;
4097 mask = RECIP_MASK_NONE;
4098 }
4099 }
4100
4101 recip_mask_explicit |= mask;
4102 if (invert)
4103 recip_mask &= ~mask;
4104 else
4105 recip_mask |= mask;
4106 }
4107 }
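  /* Example: -mrecip=vec-sqrt,!div walks the comma-separated list above,
     setting RECIP_MASK_VEC_SQRT and clearing RECIP_MASK_DIV, while
     -mrecip=default turns on RECIP_MASK_ALL.  */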
4108
4109 if (TARGET_RECIP)
4110 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4111 else if (target_flags_explicit & MASK_RECIP)
4112 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4113
4114 /* Default long double to 64-bit for Bionic. */
4115 if (TARGET_HAS_BIONIC
4116 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4117 target_flags |= MASK_LONG_DOUBLE_64;
4118
4119   /* Save the initial options in case the user overrides them later with
4120      function specific options.  */
4121 if (main_args_p)
4122 target_option_default_node = target_option_current_node
4123 = build_target_option_node ();
4124 }
4125
4126 /* Return TRUE if VAL is passed in a register with a 256-bit AVX mode.  */
4127
4128 static bool
4129 function_pass_avx256_p (const_rtx val)
4130 {
4131 if (!val)
4132 return false;
4133
4134 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4135 return true;
4136
4137 if (GET_CODE (val) == PARALLEL)
4138 {
4139 int i;
4140 rtx r;
4141
4142 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4143 {
4144 r = XVECEXP (val, 0, i);
4145 if (GET_CODE (r) == EXPR_LIST
4146 && XEXP (r, 0)
4147 && REG_P (XEXP (r, 0))
4148 && (GET_MODE (XEXP (r, 0)) == OImode
4149 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4150 return true;
4151 }
4152 }
4153
4154 return false;
4155 }
4156
4157 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4158
4159 static void
4160 ix86_option_override (void)
4161 {
4162 ix86_option_override_internal (true);
4163 }
4164
4165 /* Update register usage after having seen the compiler flags. */
4166
4167 static void
4168 ix86_conditional_register_usage (void)
4169 {
4170 int i, c_mask;
4171 unsigned int j;
4172
4173 /* The PIC register, if it exists, is fixed. */
4174 j = PIC_OFFSET_TABLE_REGNUM;
4175 if (j != INVALID_REGNUM)
4176 fixed_regs[j] = call_used_regs[j] = 1;
4177
4178 /* For 32-bit targets, squash the REX registers. */
4179 if (! TARGET_64BIT)
4180 {
4181 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4182 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4183 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4184 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4185 }
4186
4187 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4188 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4189 : TARGET_64BIT ? (1 << 2)
4190 : (1 << 1));
4191
4192 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4193
4194 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4195 {
4196 /* Set/reset conditionally defined registers from
4197 CALL_USED_REGISTERS initializer. */
4198 if (call_used_regs[i] > 1)
4199 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4200
4201       /* Build the CLOBBERED_REGS register set as the call-used registers
4202 	 of the GENERAL_REGS register set.  */
4203 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4204 && call_used_regs[i])
4205 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4206 }
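  /* For example, a register whose CALL_USED_REGISTERS value has bit 2 set
     but bit 3 clear ends up call-used for the 64-bit SysV ABI but
     call-saved for the 64-bit MS ABI, per the c_mask chosen above.  */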
4207
4208 /* If MMX is disabled, squash the registers. */
4209 if (! TARGET_MMX)
4210 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4211 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4212 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4213
4214 /* If SSE is disabled, squash the registers. */
4215 if (! TARGET_SSE)
4216 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4217 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4218 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4219
4220 /* If the FPU is disabled, squash the registers. */
4221 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4222 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4223 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4224 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4225 }
4226
4227 \f
4228 /* Save the current options.  */
4229
4230 static void
4231 ix86_function_specific_save (struct cl_target_option *ptr)
4232 {
4233 ptr->arch = ix86_arch;
4234 ptr->schedule = ix86_schedule;
4235 ptr->tune = ix86_tune;
4236 ptr->branch_cost = ix86_branch_cost;
4237 ptr->tune_defaulted = ix86_tune_defaulted;
4238 ptr->arch_specified = ix86_arch_specified;
4239 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4240 ptr->ix86_target_flags_explicit = target_flags_explicit;
4241 ptr->x_recip_mask_explicit = recip_mask_explicit;
4242
4243 /* The fields are char but the variables are not; make sure the
4244 values fit in the fields. */
4245 gcc_assert (ptr->arch == ix86_arch);
4246 gcc_assert (ptr->schedule == ix86_schedule);
4247 gcc_assert (ptr->tune == ix86_tune);
4248 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4249 }
4250
4251 /* Restore the current options.  */
4252
4253 static void
4254 ix86_function_specific_restore (struct cl_target_option *ptr)
4255 {
4256 enum processor_type old_tune = ix86_tune;
4257 enum processor_type old_arch = ix86_arch;
4258 unsigned int ix86_arch_mask, ix86_tune_mask;
4259 int i;
4260
4261 ix86_arch = (enum processor_type) ptr->arch;
4262 ix86_schedule = (enum attr_cpu) ptr->schedule;
4263 ix86_tune = (enum processor_type) ptr->tune;
4264 ix86_branch_cost = ptr->branch_cost;
4265 ix86_tune_defaulted = ptr->tune_defaulted;
4266 ix86_arch_specified = ptr->arch_specified;
4267 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4268 target_flags_explicit = ptr->ix86_target_flags_explicit;
4269 recip_mask_explicit = ptr->x_recip_mask_explicit;
4270
4271   /* Recreate the arch feature tests if the arch changed.  */
4272 if (old_arch != ix86_arch)
4273 {
4274 ix86_arch_mask = 1u << ix86_arch;
4275 for (i = 0; i < X86_ARCH_LAST; ++i)
4276 ix86_arch_features[i]
4277 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4278 }
4279
4280   /* Recreate the tune optimization tests.  */
4281 if (old_tune != ix86_tune)
4282 {
4283 ix86_tune_mask = 1u << ix86_tune;
4284 for (i = 0; i < X86_TUNE_LAST; ++i)
4285 ix86_tune_features[i]
4286 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4287 }
4288 }
4289
4290 /* Print the current options.  */
4291
4292 static void
4293 ix86_function_specific_print (FILE *file, int indent,
4294 struct cl_target_option *ptr)
4295 {
4296 char *target_string
4297 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4298 NULL, NULL, ptr->x_ix86_fpmath, false);
4299
4300 fprintf (file, "%*sarch = %d (%s)\n",
4301 indent, "",
4302 ptr->arch,
4303 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4304 ? cpu_names[ptr->arch]
4305 : "<unknown>"));
4306
4307 fprintf (file, "%*stune = %d (%s)\n",
4308 indent, "",
4309 ptr->tune,
4310 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4311 ? cpu_names[ptr->tune]
4312 : "<unknown>"));
4313
4314 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4315
4316 if (target_string)
4317 {
4318 fprintf (file, "%*s%s\n", indent, "", target_string);
4319 free (target_string);
4320 }
4321 }
4322
4323 \f
4324 /* Inner function to process the attribute((target(...))): take an argument
4325    and set the current options from it.  If we have a list, recursively go
4326    over the list.  */
4327
4328 static bool
4329 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4330 struct gcc_options *enum_opts_set)
4331 {
4332 char *next_optstr;
4333 bool ret = true;
4334
4335 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4336 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4337 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4338 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4339 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4340
4341 enum ix86_opt_type
4342 {
4343 ix86_opt_unknown,
4344 ix86_opt_yes,
4345 ix86_opt_no,
4346 ix86_opt_str,
4347 ix86_opt_enum,
4348 ix86_opt_isa
4349 };
4350
4351 static const struct
4352 {
4353 const char *string;
4354 size_t len;
4355 enum ix86_opt_type type;
4356 int opt;
4357 int mask;
4358 } attrs[] = {
4359 /* isa options */
4360 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4361 IX86_ATTR_ISA ("abm", OPT_mabm),
4362 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4363 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4364 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4365 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4366 IX86_ATTR_ISA ("aes", OPT_maes),
4367 IX86_ATTR_ISA ("avx", OPT_mavx),
4368 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4369 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4370 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4371 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4372 IX86_ATTR_ISA ("sse", OPT_msse),
4373 IX86_ATTR_ISA ("sse2", OPT_msse2),
4374 IX86_ATTR_ISA ("sse3", OPT_msse3),
4375 IX86_ATTR_ISA ("sse4", OPT_msse4),
4376 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4377 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4378 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4379 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4380 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4381 IX86_ATTR_ISA ("fma", OPT_mfma),
4382 IX86_ATTR_ISA ("xop", OPT_mxop),
4383 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4384 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4385 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4386 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4387 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4388 IX86_ATTR_ISA ("hle", OPT_mhle),
4389 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4390 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4391 IX86_ATTR_ISA ("adx", OPT_madx),
4392 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4393 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4394 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4395
4396 /* enum options */
4397 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4398
4399 /* string options */
4400 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4401 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4402
4403 /* flag options */
4404 IX86_ATTR_YES ("cld",
4405 OPT_mcld,
4406 MASK_CLD),
4407
4408 IX86_ATTR_NO ("fancy-math-387",
4409 OPT_mfancy_math_387,
4410 MASK_NO_FANCY_MATH_387),
4411
4412 IX86_ATTR_YES ("ieee-fp",
4413 OPT_mieee_fp,
4414 MASK_IEEE_FP),
4415
4416 IX86_ATTR_YES ("inline-all-stringops",
4417 OPT_minline_all_stringops,
4418 MASK_INLINE_ALL_STRINGOPS),
4419
4420 IX86_ATTR_YES ("inline-stringops-dynamically",
4421 OPT_minline_stringops_dynamically,
4422 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4423
4424 IX86_ATTR_NO ("align-stringops",
4425 OPT_mno_align_stringops,
4426 MASK_NO_ALIGN_STRINGOPS),
4427
4428 IX86_ATTR_YES ("recip",
4429 OPT_mrecip,
4430 MASK_RECIP),
4431
4432 };
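  /* As a usage sketch, a declaration like
     __attribute__((target("sse4.2,no-avx,arch=core2,fpmath=sse")))
     is split on commas below: "sse4.2" and "no-avx" match isa entries,
     "arch=core2" matches the "arch=" string entry, and "fpmath=sse"
     matches the "fpmath=" enum entry.  */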
4433
4434 /* If this is a list, recurse to get the options. */
4435 if (TREE_CODE (args) == TREE_LIST)
4436 {
4437 bool ret = true;
4438
4439 for (; args; args = TREE_CHAIN (args))
4440 if (TREE_VALUE (args)
4441 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4442 p_strings, enum_opts_set))
4443 ret = false;
4444
4445 return ret;
4446 }
4447
4448 else if (TREE_CODE (args) != STRING_CST)
4449 gcc_unreachable ();
4450
4451 /* Handle multiple arguments separated by commas. */
4452 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4453
4454 while (next_optstr && *next_optstr != '\0')
4455 {
4456 char *p = next_optstr;
4457 char *orig_p = p;
4458 char *comma = strchr (next_optstr, ',');
4459 const char *opt_string;
4460 size_t len, opt_len;
4461 int opt;
4462 bool opt_set_p;
4463 char ch;
4464 unsigned i;
4465 enum ix86_opt_type type = ix86_opt_unknown;
4466 int mask = 0;
4467
4468 if (comma)
4469 {
4470 *comma = '\0';
4471 len = comma - next_optstr;
4472 next_optstr = comma + 1;
4473 }
4474 else
4475 {
4476 len = strlen (p);
4477 next_optstr = NULL;
4478 }
4479
4480 /* Recognize no-xxx. */
4481 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4482 {
4483 opt_set_p = false;
4484 p += 3;
4485 len -= 3;
4486 }
4487 else
4488 opt_set_p = true;
4489
4490 /* Find the option. */
4491 ch = *p;
4492 opt = N_OPTS;
4493 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4494 {
4495 type = attrs[i].type;
4496 opt_len = attrs[i].len;
4497 if (ch == attrs[i].string[0]
4498 && ((type != ix86_opt_str && type != ix86_opt_enum)
4499 ? len == opt_len
4500 : len > opt_len)
4501 && memcmp (p, attrs[i].string, opt_len) == 0)
4502 {
4503 opt = attrs[i].opt;
4504 mask = attrs[i].mask;
4505 opt_string = attrs[i].string;
4506 break;
4507 }
4508 }
4509
4510 /* Process the option. */
4511 if (opt == N_OPTS)
4512 {
4513 error ("attribute(target(\"%s\")) is unknown", orig_p);
4514 ret = false;
4515 }
4516
4517 else if (type == ix86_opt_isa)
4518 {
4519 struct cl_decoded_option decoded;
4520
4521 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4522 ix86_handle_option (&global_options, &global_options_set,
4523 &decoded, input_location);
4524 }
4525
4526 else if (type == ix86_opt_yes || type == ix86_opt_no)
4527 {
4528 if (type == ix86_opt_no)
4529 opt_set_p = !opt_set_p;
4530
4531 if (opt_set_p)
4532 target_flags |= mask;
4533 else
4534 target_flags &= ~mask;
4535 }
4536
4537 else if (type == ix86_opt_str)
4538 {
4539 if (p_strings[opt])
4540 {
4541 error ("option(\"%s\") was already specified", opt_string);
4542 ret = false;
4543 }
4544 else
4545 p_strings[opt] = xstrdup (p + opt_len);
4546 }
4547
4548 else if (type == ix86_opt_enum)
4549 {
4550 bool arg_ok;
4551 int value;
4552
4553 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4554 if (arg_ok)
4555 set_option (&global_options, enum_opts_set, opt, value,
4556 p + opt_len, DK_UNSPECIFIED, input_location,
4557 global_dc);
4558 else
4559 {
4560 error ("attribute(target(\"%s\")) is unknown", orig_p);
4561 ret = false;
4562 }
4563 }
4564
4565 else
4566 gcc_unreachable ();
4567 }
4568
4569 return ret;
4570 }
4571
4572 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4573
4574 tree
4575 ix86_valid_target_attribute_tree (tree args)
4576 {
4577 const char *orig_arch_string = ix86_arch_string;
4578 const char *orig_tune_string = ix86_tune_string;
4579 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4580 int orig_tune_defaulted = ix86_tune_defaulted;
4581 int orig_arch_specified = ix86_arch_specified;
4582 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4583 tree t = NULL_TREE;
4584 int i;
4585 struct cl_target_option *def
4586 = TREE_TARGET_OPTION (target_option_default_node);
4587 struct gcc_options enum_opts_set;
4588
4589 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4590
4591 /* Process each of the options on the chain. */
4592 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4593 &enum_opts_set))
4594 return NULL_TREE;
4595
4596 /* If the changed options are different from the default, rerun
4597 ix86_option_override_internal, and then save the options away.
4598      The string options are attribute options, and will be undone
4599 when we copy the save structure. */
4600 if (ix86_isa_flags != def->x_ix86_isa_flags
4601 || target_flags != def->x_target_flags
4602 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4603 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4604 || enum_opts_set.x_ix86_fpmath)
4605 {
4606 /* If we are using the default tune= or arch=, undo the string assigned,
4607 and use the default. */
4608 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4609 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4610 else if (!orig_arch_specified)
4611 ix86_arch_string = NULL;
4612
4613 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4614 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4615 else if (orig_tune_defaulted)
4616 ix86_tune_string = NULL;
4617
4618       /* If fpmath= is not set, and we now have SSE on 32-bit, use it.  */
4619 if (enum_opts_set.x_ix86_fpmath)
4620 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4621 else if (!TARGET_64BIT && TARGET_SSE)
4622 {
4623 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4624 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4625 }
4626
4627 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4628 ix86_option_override_internal (false);
4629
4630 /* Add any builtin functions with the new isa if any. */
4631 ix86_add_new_builtins (ix86_isa_flags);
4632
4633 /* Save the current options unless we are validating options for
4634 #pragma. */
4635 t = build_target_option_node ();
4636
4637 ix86_arch_string = orig_arch_string;
4638 ix86_tune_string = orig_tune_string;
4639 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4640
4641       /* Free up memory allocated to hold the strings.  */
4642 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4643 free (option_strings[i]);
4644 }
4645
4646 return t;
4647 }
4648
4649 /* Hook to validate attribute((target("string"))). */
4650
4651 static bool
4652 ix86_valid_target_attribute_p (tree fndecl,
4653 tree ARG_UNUSED (name),
4654 tree args,
4655 int ARG_UNUSED (flags))
4656 {
4657 struct cl_target_option cur_target;
4658 bool ret = true;
4659 tree old_optimize = build_optimization_node ();
4660 tree new_target, new_optimize;
4661 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4662
4663 /* If the function changed the optimization levels as well as setting target
4664 options, start with the optimizations specified. */
4665 if (func_optimize && func_optimize != old_optimize)
4666 cl_optimization_restore (&global_options,
4667 TREE_OPTIMIZATION (func_optimize));
4668
4669 /* The target attributes may also change some optimization flags, so update
4670 the optimization options if necessary. */
4671 cl_target_option_save (&cur_target, &global_options);
4672 new_target = ix86_valid_target_attribute_tree (args);
4673 new_optimize = build_optimization_node ();
4674
4675 if (!new_target)
4676 ret = false;
4677
4678 else if (fndecl)
4679 {
4680 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4681
4682 if (old_optimize != new_optimize)
4683 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4684 }
4685
4686 cl_target_option_restore (&global_options, &cur_target);
4687
4688 if (old_optimize != new_optimize)
4689 cl_optimization_restore (&global_options,
4690 TREE_OPTIMIZATION (old_optimize));
4691
4692 return ret;
4693 }
4694
4695 \f
4696 /* Hook to determine if one function can safely inline another. */
4697
4698 static bool
4699 ix86_can_inline_p (tree caller, tree callee)
4700 {
4701 bool ret = false;
4702 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4703 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4704
4705 /* If callee has no option attributes, then it is ok to inline. */
4706 if (!callee_tree)
4707 ret = true;
4708
4709   /* If the caller has no option attributes but the callee does, then it is
4710      not ok to inline.  */
4711 else if (!caller_tree)
4712 ret = false;
4713
4714 else
4715 {
4716 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4717 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4718
4719 	 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4720 	    function can inline an SSE2 function but an SSE2 function can't
4721 	    inline an SSE4 function.  */
4722 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4723 != callee_opts->x_ix86_isa_flags)
4724 ret = false;
4725
4726 /* See if we have the same non-isa options. */
4727 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4728 ret = false;
4729
4730 /* See if arch, tune, etc. are the same. */
4731 else if (caller_opts->arch != callee_opts->arch)
4732 ret = false;
4733
4734 else if (caller_opts->tune != callee_opts->tune)
4735 ret = false;
4736
4737 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4738 ret = false;
4739
4740 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4741 ret = false;
4742
4743 else
4744 ret = true;
4745 }
4746
4747 return ret;
4748 }
4749
4750 \f
4751 /* Remember the last target of ix86_set_current_function. */
4752 static GTY(()) tree ix86_previous_fndecl;
4753
4754 /* Establish appropriate back-end context for processing the function
4755 FNDECL. The argument might be NULL to indicate processing at top
4756 level, outside of any function scope. */
4757 static void
4758 ix86_set_current_function (tree fndecl)
4759 {
4760 /* Only change the context if the function changes. This hook is called
4761 several times in the course of compiling a function, and we don't want to
4762 slow things down too much or call target_reinit when it isn't safe. */
4763 if (fndecl && fndecl != ix86_previous_fndecl)
4764 {
4765 tree old_tree = (ix86_previous_fndecl
4766 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4767 : NULL_TREE);
4768
4769 tree new_tree = (fndecl
4770 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4771 : NULL_TREE);
4772
4773 ix86_previous_fndecl = fndecl;
4774 if (old_tree == new_tree)
4775 ;
4776
4777 else if (new_tree)
4778 {
4779 cl_target_option_restore (&global_options,
4780 TREE_TARGET_OPTION (new_tree));
4781 target_reinit ();
4782 }
4783
4784 else if (old_tree)
4785 {
4786 struct cl_target_option *def
4787 = TREE_TARGET_OPTION (target_option_current_node);
4788
4789 cl_target_option_restore (&global_options, def);
4790 target_reinit ();
4791 }
4792 }
4793 }
4794
4795 \f
4796 /* Return true if this goes in large data/bss. */
4797
4798 static bool
4799 ix86_in_large_data_p (tree exp)
4800 {
4801 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4802 return false;
4803
4804 /* Functions are never large data. */
4805 if (TREE_CODE (exp) == FUNCTION_DECL)
4806 return false;
4807
4808 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4809 {
4810 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4811 if (strcmp (section, ".ldata") == 0
4812 || strcmp (section, ".lbss") == 0)
4813 return true;
4814 return false;
4815 }
4816 else
4817 {
4818 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4819
4820 /* If this is an incomplete type with size 0, then we can't put it
4821 in data because it might be too big when completed. */
4822 if (!size || size > ix86_section_threshold)
4823 return true;
4824 }
4825
4826 return false;
4827 }
4828
4829 /* Switch to the appropriate section for output of DECL.
4830 DECL is either a `VAR_DECL' node or a constant of some sort.
4831 RELOC indicates whether forming the initial value of DECL requires
4832 link-time relocations. */
4833
4834 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4835 ATTRIBUTE_UNUSED;
4836
4837 static section *
4838 x86_64_elf_select_section (tree decl, int reloc,
4839 unsigned HOST_WIDE_INT align)
4840 {
4841 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4842 && ix86_in_large_data_p (decl))
4843 {
4844 const char *sname = NULL;
4845 unsigned int flags = SECTION_WRITE;
4846 switch (categorize_decl_for_section (decl, reloc))
4847 {
4848 case SECCAT_DATA:
4849 sname = ".ldata";
4850 break;
4851 case SECCAT_DATA_REL:
4852 sname = ".ldata.rel";
4853 break;
4854 case SECCAT_DATA_REL_LOCAL:
4855 sname = ".ldata.rel.local";
4856 break;
4857 case SECCAT_DATA_REL_RO:
4858 sname = ".ldata.rel.ro";
4859 break;
4860 case SECCAT_DATA_REL_RO_LOCAL:
4861 sname = ".ldata.rel.ro.local";
4862 break;
4863 case SECCAT_BSS:
4864 sname = ".lbss";
4865 flags |= SECTION_BSS;
4866 break;
4867 case SECCAT_RODATA:
4868 case SECCAT_RODATA_MERGE_STR:
4869 case SECCAT_RODATA_MERGE_STR_INIT:
4870 case SECCAT_RODATA_MERGE_CONST:
4871 sname = ".lrodata";
4872 flags = 0;
4873 break;
4874 case SECCAT_SRODATA:
4875 case SECCAT_SDATA:
4876 case SECCAT_SBSS:
4877 gcc_unreachable ();
4878 case SECCAT_TEXT:
4879 case SECCAT_TDATA:
4880 case SECCAT_TBSS:
4881 /* We don't split these for medium model. Place them into
4882 default sections and hope for the best. */
4883 break;
4884 }
4885 if (sname)
4886 {
4887 /* We might get called with string constants, but get_named_section
4888 doesn't like them as they are not DECLs. Also, we need to set
4889 flags in that case. */
4890 if (!DECL_P (decl))
4891 return get_section (sname, flags, NULL);
4892 return get_named_section (decl, sname, reloc);
4893 }
4894 }
4895 return default_elf_select_section (decl, reloc, align);
4896 }
4897
4898 /* Build up a unique section name, expressed as a
4899 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4900 RELOC indicates whether the initial value of EXP requires
4901 link-time relocations. */
4902
4903 static void ATTRIBUTE_UNUSED
4904 x86_64_elf_unique_section (tree decl, int reloc)
4905 {
4906 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4907 && ix86_in_large_data_p (decl))
4908 {
4909 const char *prefix = NULL;
4910 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4911 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4912
4913 switch (categorize_decl_for_section (decl, reloc))
4914 {
4915 case SECCAT_DATA:
4916 case SECCAT_DATA_REL:
4917 case SECCAT_DATA_REL_LOCAL:
4918 case SECCAT_DATA_REL_RO:
4919 case SECCAT_DATA_REL_RO_LOCAL:
4920 prefix = one_only ? ".ld" : ".ldata";
4921 break;
4922 case SECCAT_BSS:
4923 prefix = one_only ? ".lb" : ".lbss";
4924 break;
4925 case SECCAT_RODATA:
4926 case SECCAT_RODATA_MERGE_STR:
4927 case SECCAT_RODATA_MERGE_STR_INIT:
4928 case SECCAT_RODATA_MERGE_CONST:
4929 prefix = one_only ? ".lr" : ".lrodata";
4930 break;
4931 case SECCAT_SRODATA:
4932 case SECCAT_SDATA:
4933 case SECCAT_SBSS:
4934 gcc_unreachable ();
4935 case SECCAT_TEXT:
4936 case SECCAT_TDATA:
4937 case SECCAT_TBSS:
4938 /* We don't split these for medium model. Place them into
4939 default sections and hope for the best. */
4940 break;
4941 }
4942 if (prefix)
4943 {
4944 const char *name, *linkonce;
4945 char *string;
4946
4947 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4948 name = targetm.strip_name_encoding (name);
4949
4950 /* If we're using one_only, then there needs to be a .gnu.linkonce
4951 prefix to the section name. */
4952 linkonce = one_only ? ".gnu.linkonce" : "";
4953
4954 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4955
4956 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4957 return;
4958 }
4959 }
4960 default_unique_section (decl, reloc);
4961 }
4962
4963 #ifdef COMMON_ASM_OP
4964 /* This says how to output assembler code to declare an
4965 uninitialized external linkage data object.
4966
4967 For medium model x86-64 we need to use .largecomm opcode for
4968 large objects. */
4969 void
4970 x86_elf_aligned_common (FILE *file,
4971 const char *name, unsigned HOST_WIDE_INT size,
4972 int align)
4973 {
4974 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4975 && size > (unsigned int)ix86_section_threshold)
4976 fputs (".largecomm\t", file);
4977 else
4978 fputs (COMMON_ASM_OP, file);
4979 assemble_name (file, name);
4980 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4981 size, align / BITS_PER_UNIT);
4982 }
4983 #endif
4984
4985 /* Utility function for targets to use in implementing
4986 ASM_OUTPUT_ALIGNED_BSS. */
4987
4988 void
4989 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4990 const char *name, unsigned HOST_WIDE_INT size,
4991 int align)
4992 {
4993 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4994 && size > (unsigned int)ix86_section_threshold)
4995 switch_to_section (get_named_section (decl, ".lbss", 0));
4996 else
4997 switch_to_section (bss_section);
4998 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4999 #ifdef ASM_DECLARE_OBJECT_NAME
5000 last_assemble_variable_decl = decl;
5001 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5002 #else
5003 /* Standard thing is just output label for the object. */
5004 ASM_OUTPUT_LABEL (file, name);
5005 #endif /* ASM_DECLARE_OBJECT_NAME */
5006 ASM_OUTPUT_SKIP (file, size ? size : 1);
5007 }
5008 \f
5009 /* Decide whether we must probe the stack before any space allocation
5010 on this target. It's essentially TARGET_STACK_PROBE except when
5011 -fstack-check causes the stack to be already probed differently. */
5012
5013 bool
5014 ix86_target_stack_probe (void)
5015 {
5016 /* Do not probe the stack twice if static stack checking is enabled. */
5017 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5018 return false;
5019
5020 return TARGET_STACK_PROBE;
5021 }
5022 \f
5023 /* Decide whether we can make a sibling call to a function. DECL is the
5024 declaration of the function being targeted by the call and EXP is the
5025 CALL_EXPR representing the call. */
5026
5027 static bool
5028 ix86_function_ok_for_sibcall (tree decl, tree exp)
5029 {
5030 tree type, decl_or_type;
5031 rtx a, b;
5032
5033 /* If we are generating position-independent code, we cannot sibcall
5034 optimize any indirect call, or a direct call to a global function,
5035 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5036 if (!TARGET_MACHO
5037 && !TARGET_64BIT
5038 && flag_pic
5039 && (!decl || !targetm.binds_local_p (decl)))
5040 return false;
5041
5042 /* If we need to align the outgoing stack, then sibcalling would
5043 unalign the stack, which may break the called function. */
5044 if (ix86_minimum_incoming_stack_boundary (true)
5045 < PREFERRED_STACK_BOUNDARY)
5046 return false;
5047
5048 if (decl)
5049 {
5050 decl_or_type = decl;
5051 type = TREE_TYPE (decl);
5052 }
5053 else
5054 {
5055 /* We're looking at the CALL_EXPR, we need the type of the function. */
5056 type = CALL_EXPR_FN (exp); /* pointer expression */
5057 type = TREE_TYPE (type); /* pointer type */
5058 type = TREE_TYPE (type); /* function type */
5059 decl_or_type = type;
5060 }
5061
5062 /* Check that the return value locations are the same. Like
5063 if we are returning floats on the 80387 register stack, we cannot
5064 make a sibcall from a function that doesn't return a float to a
5065 function that does or, conversely, from a function that does return
5066 a float to a function that doesn't; the necessary stack adjustment
5067 would not be executed. This is also the place we notice
5068 differences in the return value ABI. Note that it is ok for one
5069 of the functions to have void return type as long as the return
5070 value of the other is passed in a register. */
5071 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5072 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5073 cfun->decl, false);
5074 if (STACK_REG_P (a) || STACK_REG_P (b))
5075 {
5076 if (!rtx_equal_p (a, b))
5077 return false;
5078 }
5079 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5080 {
5081 /* Disable sibcall if we need to generate vzeroupper after
5082 callee returns. */
5083 if (TARGET_VZEROUPPER
5084 && cfun->machine->callee_return_avx256_p
5085 && !cfun->machine->caller_return_avx256_p)
5086 return false;
5087 }
5088 else if (!rtx_equal_p (a, b))
5089 return false;
5090
5091 if (TARGET_64BIT)
5092 {
5093 /* The SYSV ABI has more call-clobbered registers;
5094 disallow sibcalls from MS to SYSV. */
5095 if (cfun->machine->call_abi == MS_ABI
5096 && ix86_function_type_abi (type) == SYSV_ABI)
5097 return false;
5098 }
5099 else
5100 {
5101 /* If this call is indirect, we'll need to be able to use a
5102 call-clobbered register for the address of the target function.
5103 Make sure that all such registers are not used for passing
5104 parameters. Note that DLLIMPORT functions are indirect. */
5105 if (!decl
5106 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5107 {
5108 if (ix86_function_regparm (type, NULL) >= 3)
5109 {
5110 /* ??? Need to count the actual number of registers to be used,
5111 not the possible number of registers. Fix later. */
5112 return false;
5113 }
5114 }
5115 }
5116
5117 /* Otherwise okay. That also includes certain types of indirect calls. */
5118 return true;
5119 }
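
/* Example (illustrative): with -m32 -fpic, a tail call to an extern,
   non-locally-binding function is rejected above because the PLT
   requires %ebx to be live; likewise a sibcall from a function
   returning int to one returning double is rejected, since the x87
   return value in %st(0) would need a stack adjustment that would
   never be executed.  */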
5120
5121 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5122 and "sseregparm" calling convention attributes;
5123 arguments as in struct attribute_spec.handler. */
5124
5125 static tree
5126 ix86_handle_cconv_attribute (tree *node, tree name,
5127 tree args,
5128 int flags ATTRIBUTE_UNUSED,
5129 bool *no_add_attrs)
5130 {
5131 if (TREE_CODE (*node) != FUNCTION_TYPE
5132 && TREE_CODE (*node) != METHOD_TYPE
5133 && TREE_CODE (*node) != FIELD_DECL
5134 && TREE_CODE (*node) != TYPE_DECL)
5135 {
5136 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5137 name);
5138 *no_add_attrs = true;
5139 return NULL_TREE;
5140 }
5141
5142 /* Can combine regparm with all attributes but fastcall and thiscall. */
5143 if (is_attribute_p ("regparm", name))
5144 {
5145 tree cst;
5146
5147 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5148 {
5149 error ("fastcall and regparm attributes are not compatible");
5150 }
5151
5152 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5153 {
5154 error ("regparam and thiscall attributes are not compatible");
5155 }
5156
5157 cst = TREE_VALUE (args);
5158 if (TREE_CODE (cst) != INTEGER_CST)
5159 {
5160 warning (OPT_Wattributes,
5161 "%qE attribute requires an integer constant argument",
5162 name);
5163 *no_add_attrs = true;
5164 }
5165 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5166 {
5167 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5168 name, REGPARM_MAX);
5169 *no_add_attrs = true;
5170 }
5171
5172 return NULL_TREE;
5173 }
5174
5175 if (TARGET_64BIT)
5176 {
5177 /* Do not warn when emulating the MS ABI. */
5178 if ((TREE_CODE (*node) != FUNCTION_TYPE
5179 && TREE_CODE (*node) != METHOD_TYPE)
5180 || ix86_function_type_abi (*node) != MS_ABI)
5181 warning (OPT_Wattributes, "%qE attribute ignored",
5182 name);
5183 *no_add_attrs = true;
5184 return NULL_TREE;
5185 }
5186
5187 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5188 if (is_attribute_p ("fastcall", name))
5189 {
5190 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5191 {
5192 error ("fastcall and cdecl attributes are not compatible");
5193 }
5194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5195 {
5196 error ("fastcall and stdcall attributes are not compatible");
5197 }
5198 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5199 {
5200 error ("fastcall and regparm attributes are not compatible");
5201 }
5202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5203 {
5204 error ("fastcall and thiscall attributes are not compatible");
5205 }
5206 }
5207
5208 /* Can combine stdcall with fastcall (redundant), regparm and
5209 sseregparm. */
5210 else if (is_attribute_p ("stdcall", name))
5211 {
5212 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5213 {
5214 error ("stdcall and cdecl attributes are not compatible");
5215 }
5216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5217 {
5218 error ("stdcall and fastcall attributes are not compatible");
5219 }
5220 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5221 {
5222 error ("stdcall and thiscall attributes are not compatible");
5223 }
5224 }
5225
5226 /* Can combine cdecl with regparm and sseregparm. */
5227 else if (is_attribute_p ("cdecl", name))
5228 {
5229 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5230 {
5231 error ("stdcall and cdecl attributes are not compatible");
5232 }
5233 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5234 {
5235 error ("fastcall and cdecl attributes are not compatible");
5236 }
5237 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5238 {
5239 error ("cdecl and thiscall attributes are not compatible");
5240 }
5241 }
5242 else if (is_attribute_p ("thiscall", name))
5243 {
5244 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5245 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5246 name);
5247 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5248 {
5249 error ("stdcall and thiscall attributes are not compatible");
5250 }
5251 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5252 {
5253 error ("fastcall and thiscall attributes are not compatible");
5254 }
5255 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5256 {
5257 error ("cdecl and thiscall attributes are not compatible");
5258 }
5259 }
5260
5261 /* Can combine sseregparm with all attributes. */
5262
5263 return NULL_TREE;
5264 }
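
/* Illustrative examples (not from the original sources) of what the
   handler above accepts and rejects in 32-bit code:

     int __attribute__((stdcall, regparm (2))) f (int, int);   accepted
     int __attribute__((fastcall, stdcall)) g (int);           error
     int __attribute__((regparm (8))) h (int);                 warning,
                                                     attribute dropped

   since REGPARM_MAX is 3 for ia32 and fastcall conflicts with every
   other base calling convention.  */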
5265
5266 /* The transactional memory builtins are implicitly regparm or fastcall
5267 depending on the ABI. Override the generic do-nothing attribute that
5268 these builtins were declared with, and replace it with one of the two
5269 attributes that we expect elsewhere. */
5270
5271 static tree
5272 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5273 tree args ATTRIBUTE_UNUSED,
5274 int flags ATTRIBUTE_UNUSED,
5275 bool *no_add_attrs)
5276 {
5277 tree alt;
5278
5279 /* In no case do we want to add the placeholder attribute. */
5280 *no_add_attrs = true;
5281
5282 /* The 64-bit ABI is unchanged for transactional memory. */
5283 if (TARGET_64BIT)
5284 return NULL_TREE;
5285
5286 /* ??? Is there a better way to validate 32-bit windows? We have
5287 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5288 if (CHECK_STACK_LIMIT > 0)
5289 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5290 else
5291 {
5292 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5293 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5294 }
5295 decl_attributes (node, alt, flags);
5296
5297 return NULL_TREE;
5298 }
5299
5300 /* This function determines from TYPE the calling-convention. */
5301
5302 unsigned int
5303 ix86_get_callcvt (const_tree type)
5304 {
5305 unsigned int ret = 0;
5306 bool is_stdarg;
5307 tree attrs;
5308
5309 if (TARGET_64BIT)
5310 return IX86_CALLCVT_CDECL;
5311
5312 attrs = TYPE_ATTRIBUTES (type);
5313 if (attrs != NULL_TREE)
5314 {
5315 if (lookup_attribute ("cdecl", attrs))
5316 ret |= IX86_CALLCVT_CDECL;
5317 else if (lookup_attribute ("stdcall", attrs))
5318 ret |= IX86_CALLCVT_STDCALL;
5319 else if (lookup_attribute ("fastcall", attrs))
5320 ret |= IX86_CALLCVT_FASTCALL;
5321 else if (lookup_attribute ("thiscall", attrs))
5322 ret |= IX86_CALLCVT_THISCALL;
5323
5324 /* Regparm isn't allowed for thiscall and fastcall. */
5325 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5326 {
5327 if (lookup_attribute ("regparm", attrs))
5328 ret |= IX86_CALLCVT_REGPARM;
5329 if (lookup_attribute ("sseregparm", attrs))
5330 ret |= IX86_CALLCVT_SSEREGPARM;
5331 }
5332
5333 if (IX86_BASE_CALLCVT(ret) != 0)
5334 return ret;
5335 }
5336
5337 is_stdarg = stdarg_p (type);
5338 if (TARGET_RTD && !is_stdarg)
5339 return IX86_CALLCVT_STDCALL | ret;
5340
5341 if (ret != 0
5342 || is_stdarg
5343 || TREE_CODE (type) != METHOD_TYPE
5344 || ix86_function_type_abi (type) != MS_ABI)
5345 return IX86_CALLCVT_CDECL | ret;
5346
5347 return IX86_CALLCVT_THISCALL;
5348 }
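
/* Example (illustrative): under -mrtd, a prototyped non-variadic
   function without any convention attribute falls through to the
   TARGET_RTD branch above and gets IX86_CALLCVT_STDCALL, while a
   stdarg function such as printf stays IX86_CALLCVT_CDECL.  */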
5349
5350 /* Return 0 if the attributes for two types are incompatible, 1 if they
5351 are compatible, and 2 if they are nearly compatible (which causes a
5352 warning to be generated). */
5353
5354 static int
5355 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5356 {
5357 unsigned int ccvt1, ccvt2;
5358
5359 if (TREE_CODE (type1) != FUNCTION_TYPE
5360 && TREE_CODE (type1) != METHOD_TYPE)
5361 return 1;
5362
5363 ccvt1 = ix86_get_callcvt (type1);
5364 ccvt2 = ix86_get_callcvt (type2);
5365 if (ccvt1 != ccvt2)
5366 return 0;
5367 if (ix86_function_regparm (type1, NULL)
5368 != ix86_function_regparm (type2, NULL))
5369 return 0;
5370
5371 return 1;
5372 }
5373 \f
5374 /* Return the regparm value for a function with the indicated TYPE and DECL.
5375 DECL may be NULL when calling function indirectly
5376 or considering a libcall. */
5377
5378 static int
5379 ix86_function_regparm (const_tree type, const_tree decl)
5380 {
5381 tree attr;
5382 int regparm;
5383 unsigned int ccvt;
5384
5385 if (TARGET_64BIT)
5386 return (ix86_function_type_abi (type) == SYSV_ABI
5387 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5388 ccvt = ix86_get_callcvt (type);
5389 regparm = ix86_regparm;
5390
5391 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5392 {
5393 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5394 if (attr)
5395 {
5396 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5397 return regparm;
5398 }
5399 }
5400 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5401 return 2;
5402 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5403 return 1;
5404
5405 /* Use register calling convention for local functions when possible. */
5406 if (decl
5407 && TREE_CODE (decl) == FUNCTION_DECL
5408 && optimize
5409 && !(profile_flag && !flag_fentry))
5410 {
5411 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5412 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5413 if (i && i->local && i->can_change_signature)
5414 {
5415 int local_regparm, globals = 0, regno;
5416
5417 /* Make sure no regparm register is taken by a
5418 fixed register variable. */
5419 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5420 if (fixed_regs[local_regparm])
5421 break;
5422
5423 /* We don't want to use regparm(3) for nested functions as
5424 these use a static chain pointer in the third argument. */
5425 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5426 local_regparm = 2;
5427
5428 /* In 32-bit mode save a register for the split stack. */
5429 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5430 local_regparm = 2;
5431
5432 /* Each fixed register usage increases register pressure,
5433 so fewer registers should be used for argument passing.
5434 This functionality can be overridden by an explicit
5435 regparm value. */
5436 for (regno = AX_REG; regno <= DI_REG; regno++)
5437 if (fixed_regs[regno])
5438 globals++;
5439
5440 local_regparm
5441 = globals < local_regparm ? local_regparm - globals : 0;
5442
5443 if (local_regparm > regparm)
5444 regparm = local_regparm;
5445 }
5446 }
5447
5448 return regparm;
5449 }
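
/* Example (illustrative): with -m32 -O2, a file-local function such as

     static int add3 (int a, int b, int c) { return a + b + c; }

   whose address does not escape is promoted by the cgraph-local path
   above to pass all three arguments in registers (up to REGPARM_MAX),
   unless it needs a static chain or -fsplit-stack is in effect, each
   of which caps the promotion at two registers.  */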
5450
5451 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5452 DFmode (2) arguments in SSE registers for a function with the
5453 indicated TYPE and DECL. DECL may be NULL when calling function
5454 indirectly or considering a libcall. Otherwise return 0. */
5455
5456 static int
5457 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5458 {
5459 gcc_assert (!TARGET_64BIT);
5460
5461 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5462 by the sseregparm attribute. */
5463 if (TARGET_SSEREGPARM
5464 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5465 {
5466 if (!TARGET_SSE)
5467 {
5468 if (warn)
5469 {
5470 if (decl)
5471 error ("calling %qD with attribute sseregparm without "
5472 "SSE/SSE2 enabled", decl);
5473 else
5474 error ("calling %qT with attribute sseregparm without "
5475 "SSE/SSE2 enabled", type);
5476 }
5477 return 0;
5478 }
5479
5480 return 2;
5481 }
5482
5483 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5484 (and DFmode for SSE2) arguments in SSE registers. */
5485 if (decl && TARGET_SSE_MATH && optimize
5486 && !(profile_flag && !flag_fentry))
5487 {
5488 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5489 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5490 if (i && i->local && i->can_change_signature)
5491 return TARGET_SSE2 ? 2 : 1;
5492 }
5493
5494 return 0;
5495 }
5496
5497 /* Return true if EAX is live at the start of the function. Used by
5498 ix86_expand_prologue to determine if we need special help before
5499 calling allocate_stack_worker. */
5500
5501 static bool
5502 ix86_eax_live_at_start_p (void)
5503 {
5504 /* Cheat. Don't bother working forward from ix86_function_regparm
5505 to the function type to whether an actual argument is located in
5506 eax. Instead just look at cfg info, which is still close enough
5507 to correct at this point. This gives false positives for broken
5508 functions that might use uninitialized data that happens to be
5509 allocated in eax, but who cares? */
5510 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5511 }
5512
5513 static bool
5514 ix86_keep_aggregate_return_pointer (tree fntype)
5515 {
5516 tree attr;
5517
5518 if (!TARGET_64BIT)
5519 {
5520 attr = lookup_attribute ("callee_pop_aggregate_return",
5521 TYPE_ATTRIBUTES (fntype));
5522 if (attr)
5523 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5524
5525 /* For 32-bit MS-ABI the default is to keep aggregate
5526 return pointer. */
5527 if (ix86_function_type_abi (fntype) == MS_ABI)
5528 return true;
5529 }
5530 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5531 }
5532
5533 /* Value is the number of bytes of arguments automatically
5534 popped when returning from a subroutine call.
5535 FUNDECL is the declaration node of the function (as a tree),
5536 FUNTYPE is the data type of the function (as a tree),
5537 or for a library call it is an identifier node for the subroutine name.
5538 SIZE is the number of bytes of arguments passed on the stack.
5539
5540 On the 80386, the RTD insn may be used to pop them if the number
5541 of args is fixed, but if the number is variable then the caller
5542 must pop them all. RTD can't be used for library calls now
5543 because the library is compiled with the Unix compiler.
5544 Use of RTD is a selectable option, since it is incompatible with
5545 standard Unix calling sequences. If the option is not selected,
5546 the caller must always pop the args.
5547
5548 The attribute stdcall is equivalent to RTD on a per module basis. */
5549
5550 static int
5551 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5552 {
5553 unsigned int ccvt;
5554
5555 /* None of the 64-bit ABIs pop arguments. */
5556 if (TARGET_64BIT)
5557 return 0;
5558
5559 ccvt = ix86_get_callcvt (funtype);
5560
5561 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5562 | IX86_CALLCVT_THISCALL)) != 0
5563 && ! stdarg_p (funtype))
5564 return size;
5565
5566 /* Lose any fake structure return argument if it is passed on the stack. */
5567 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5568 && !ix86_keep_aggregate_return_pointer (funtype))
5569 {
5570 int nregs = ix86_function_regparm (funtype, fundecl);
5571 if (nregs == 0)
5572 return GET_MODE_SIZE (Pmode);
5573 }
5574
5575 return 0;
5576 }
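
/* Example (illustrative): in 32-bit code,

     void __attribute__((stdcall)) f (int a, int b);

   is non-variadic, so this hook returns SIZE (8) and the callee pops
   its own arguments with "ret $8"; a stdarg or plain cdecl function
   returns 0 and leaves the popping to the caller.  */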
5577
5578 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5579
5580 static bool
5581 ix86_legitimate_combined_insn (rtx insn)
5582 {
5583 /* Check operand constraints in case hard registers were propagated
5584 into insn pattern. This check prevents combine pass from
5585 generating insn patterns with invalid hard register operands.
5586 These invalid insns can eventually confuse reload to error out
5587 with a spill failure. See also PRs 46829 and 46843. */
5588 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5589 {
5590 int i;
5591
5592 extract_insn (insn);
5593 preprocess_constraints ();
5594
5595 for (i = 0; i < recog_data.n_operands; i++)
5596 {
5597 rtx op = recog_data.operand[i];
5598 enum machine_mode mode = GET_MODE (op);
5599 struct operand_alternative *op_alt;
5600 int offset = 0;
5601 bool win;
5602 int j;
5603
5604 /* A unary operator may be accepted by the predicate, but it
5605 is irrelevant for matching constraints. */
5606 if (UNARY_P (op))
5607 op = XEXP (op, 0);
5608
5609 if (GET_CODE (op) == SUBREG)
5610 {
5611 if (REG_P (SUBREG_REG (op))
5612 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5613 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5614 GET_MODE (SUBREG_REG (op)),
5615 SUBREG_BYTE (op),
5616 GET_MODE (op));
5617 op = SUBREG_REG (op);
5618 }
5619
5620 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5621 continue;
5622
5623 op_alt = recog_op_alt[i];
5624
5625 /* Operand has no constraints, anything is OK. */
5626 win = !recog_data.n_alternatives;
5627
5628 for (j = 0; j < recog_data.n_alternatives; j++)
5629 {
5630 if (op_alt[j].anything_ok
5631 || (op_alt[j].matches != -1
5632 && operands_match_p
5633 (recog_data.operand[i],
5634 recog_data.operand[op_alt[j].matches]))
5635 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5636 {
5637 win = true;
5638 break;
5639 }
5640 }
5641
5642 if (!win)
5643 return false;
5644 }
5645 }
5646
5647 return true;
5648 }
5649 \f
5650 /* Argument support functions. */
5651
5652 /* Return true when register may be used to pass function parameters. */
5653 bool
5654 ix86_function_arg_regno_p (int regno)
5655 {
5656 int i;
5657 const int *parm_regs;
5658
5659 if (!TARGET_64BIT)
5660 {
5661 if (TARGET_MACHO)
5662 return (regno < REGPARM_MAX
5663 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5664 else
5665 return (regno < REGPARM_MAX
5666 || (TARGET_MMX && MMX_REGNO_P (regno)
5667 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5668 || (TARGET_SSE && SSE_REGNO_P (regno)
5669 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5670 }
5671
5672 if (TARGET_MACHO)
5673 {
5674 if (SSE_REGNO_P (regno) && TARGET_SSE)
5675 return true;
5676 }
5677 else
5678 {
5679 if (TARGET_SSE && SSE_REGNO_P (regno)
5680 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5681 return true;
5682 }
5683
5684 /* TODO: The function should depend on current function ABI but
5685 builtins.c would need updating then. Therefore we use the
5686 default ABI. */
5687
5688 /* RAX is used as hidden argument to va_arg functions. */
5689 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5690 return true;
5691
5692 if (ix86_abi == MS_ABI)
5693 parm_regs = x86_64_ms_abi_int_parameter_registers;
5694 else
5695 parm_regs = x86_64_int_parameter_registers;
5696 for (i = 0; i < (ix86_abi == MS_ABI
5697 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5698 if (regno == parm_regs[i])
5699 return true;
5700 return false;
5701 }
5702
5703 /* Return true if we do not know how to pass TYPE solely in registers. */
5704
5705 static bool
5706 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5707 {
5708 if (must_pass_in_stack_var_size_or_pad (mode, type))
5709 return true;
5710
5711 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5712 The layout_type routine is crafty and tries to trick us into passing
5713 currently unsupported vector types on the stack by using TImode. */
5714 return (!TARGET_64BIT && mode == TImode
5715 && type && TREE_CODE (type) != VECTOR_TYPE);
5716 }
5717
5718 /* Return the size, in bytes, of the area reserved for arguments passed
5719 in registers for the function represented by FNDECL, depending on the
5720 ABI used. */
5721 int
5722 ix86_reg_parm_stack_space (const_tree fndecl)
5723 {
5724 enum calling_abi call_abi = SYSV_ABI;
5725 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5726 call_abi = ix86_function_abi (fndecl);
5727 else
5728 call_abi = ix86_function_type_abi (fndecl);
5729 if (TARGET_64BIT && call_abi == MS_ABI)
5730 return 32;
5731 return 0;
5732 }
5733
5734 /* Returns SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5735 call ABI used. */
5736 enum calling_abi
5737 ix86_function_type_abi (const_tree fntype)
5738 {
5739 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5740 {
5741 enum calling_abi abi = ix86_abi;
5742 if (abi == SYSV_ABI)
5743 {
5744 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5745 abi = MS_ABI;
5746 }
5747 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5748 abi = SYSV_ABI;
5749 return abi;
5750 }
5751 return ix86_abi;
5752 }
5753
5754 static bool
5755 ix86_function_ms_hook_prologue (const_tree fn)
5756 {
5757 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5758 {
5759 if (decl_function_context (fn) != NULL_TREE)
5760 error_at (DECL_SOURCE_LOCATION (fn),
5761 "ms_hook_prologue is not compatible with nested function");
5762 else
5763 return true;
5764 }
5765 return false;
5766 }
5767
5768 static enum calling_abi
5769 ix86_function_abi (const_tree fndecl)
5770 {
5771 if (! fndecl)
5772 return ix86_abi;
5773 return ix86_function_type_abi (TREE_TYPE (fndecl));
5774 }
5775
5776 /* Returns SYSV_ABI or MS_ABI, depending on cfun, specifying the
5777 call ABI used. */
5778 enum calling_abi
5779 ix86_cfun_abi (void)
5780 {
5781 if (! cfun)
5782 return ix86_abi;
5783 return cfun->machine->call_abi;
5784 }
5785
5786 /* Write the extra assembler code needed to declare a function properly. */
5787
5788 void
5789 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5790 tree decl)
5791 {
5792 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5793
5794 if (is_ms_hook)
5795 {
5796 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5797 unsigned int filler_cc = 0xcccccccc;
5798
5799 for (i = 0; i < filler_count; i += 4)
5800 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5801 }
5802
5803 #ifdef SUBTARGET_ASM_UNWIND_INIT
5804 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5805 #endif
5806
5807 ASM_OUTPUT_LABEL (asm_out_file, fname);
5808
5809 /* Output magic byte marker, if hot-patch attribute is set. */
5810 if (is_ms_hook)
5811 {
5812 if (TARGET_64BIT)
5813 {
5814 /* leaq [%rsp + 0], %rsp */
5815 asm_fprintf (asm_out_file, ASM_BYTE
5816 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5817 }
5818 else
5819 {
5820 /* movl.s %edi, %edi
5821 push %ebp
5822 movl.s %esp, %ebp */
5823 asm_fprintf (asm_out_file, ASM_BYTE
5824 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5825 }
5826 }
5827 }
5828
5829 /* regclass.c */
5830 extern void init_regs (void);
5831
5832 /* Implementation of the call ABI switching target hook. For FNDECL,
5833 the call register sets specific to its ABI are selected. See also
5834 ix86_conditional_register_usage for more details. */
5835 void
5836 ix86_call_abi_override (const_tree fndecl)
5837 {
5838 if (fndecl == NULL_TREE)
5839 cfun->machine->call_abi = ix86_abi;
5840 else
5841 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5842 }
5843
5844 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5845 Avoid expensive re-initialization of init_regs each time we switch function
5846 context, since this is needed only during RTL expansion. */
5847 static void
5848 ix86_maybe_switch_abi (void)
5849 {
5850 if (TARGET_64BIT &&
5851 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5852 reinit_regs ();
5853 }
5854
5855 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5856 for a call to a function whose data type is FNTYPE.
5857 For a library call, FNTYPE is 0. */
5858
5859 void
5860 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5861 tree fntype, /* tree ptr for function decl */
5862 rtx libname, /* SYMBOL_REF of library name or 0 */
5863 tree fndecl,
5864 int caller)
5865 {
5866 struct cgraph_local_info *i;
5867 tree fnret_type;
5868
5869 memset (cum, 0, sizeof (*cum));
5870
5871 /* Initialize for the current callee. */
5872 if (caller)
5873 {
5874 cfun->machine->callee_pass_avx256_p = false;
5875 cfun->machine->callee_return_avx256_p = false;
5876 }
5877
5878 if (fndecl)
5879 {
5880 i = cgraph_local_info (fndecl);
5881 cum->call_abi = ix86_function_abi (fndecl);
5882 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5883 }
5884 else
5885 {
5886 i = NULL;
5887 cum->call_abi = ix86_function_type_abi (fntype);
5888 if (fntype)
5889 fnret_type = TREE_TYPE (fntype);
5890 else
5891 fnret_type = NULL;
5892 }
5893
5894 if (TARGET_VZEROUPPER && fnret_type)
5895 {
5896 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5897 false);
5898 if (function_pass_avx256_p (fnret_value))
5899 {
5900 /* The return value of this function uses 256bit AVX modes. */
5901 if (caller)
5902 cfun->machine->callee_return_avx256_p = true;
5903 else
5904 cfun->machine->caller_return_avx256_p = true;
5905 }
5906 }
5907
5908 cum->caller = caller;
5909
5910 /* Set up the number of registers to use for passing arguments. */
5911
5912 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5913 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5914 "or subtarget optimization implying it");
5915 cum->nregs = ix86_regparm;
5916 if (TARGET_64BIT)
5917 {
5918 cum->nregs = (cum->call_abi == SYSV_ABI
5919 ? X86_64_REGPARM_MAX
5920 : X86_64_MS_REGPARM_MAX);
5921 }
5922 if (TARGET_SSE)
5923 {
5924 cum->sse_nregs = SSE_REGPARM_MAX;
5925 if (TARGET_64BIT)
5926 {
5927 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5928 ? X86_64_SSE_REGPARM_MAX
5929 : X86_64_MS_SSE_REGPARM_MAX);
5930 }
5931 }
5932 if (TARGET_MMX)
5933 cum->mmx_nregs = MMX_REGPARM_MAX;
5934 cum->warn_avx = true;
5935 cum->warn_sse = true;
5936 cum->warn_mmx = true;
5937
5938 /* Because the type might mismatch between caller and callee, we need to
5939 use the actual type of the function for local calls.
5940 FIXME: cgraph_analyze can be told to actually record if the function uses
5941 va_start, so for local functions maybe_vaarg can be made more aggressive,
5942 helping K&R code.
5943 FIXME: once the type system is fixed, we won't need this code anymore. */
5944 if (i && i->local && i->can_change_signature)
5945 fntype = TREE_TYPE (fndecl);
5946 cum->maybe_vaarg = (fntype
5947 ? (!prototype_p (fntype) || stdarg_p (fntype))
5948 : !libname);
5949
5950 if (!TARGET_64BIT)
5951 {
5952 /* If there are variable arguments, then we won't pass anything
5953 in registers in 32-bit mode. */
5954 if (stdarg_p (fntype))
5955 {
5956 cum->nregs = 0;
5957 cum->sse_nregs = 0;
5958 cum->mmx_nregs = 0;
5959 cum->warn_avx = 0;
5960 cum->warn_sse = 0;
5961 cum->warn_mmx = 0;
5962 return;
5963 }
5964
5965 /* Use ecx and edx registers if function has fastcall attribute,
5966 else look for regparm information. */
5967 if (fntype)
5968 {
5969 unsigned int ccvt = ix86_get_callcvt (fntype);
5970 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5971 {
5972 cum->nregs = 1;
5973 cum->fastcall = 1; /* Same first register as in fastcall. */
5974 }
5975 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5976 {
5977 cum->nregs = 2;
5978 cum->fastcall = 1;
5979 }
5980 else
5981 cum->nregs = ix86_function_regparm (fntype, fndecl);
5982 }
5983
5984 /* Set up the number of SSE registers used for passing SFmode
5985 and DFmode arguments. Warn for mismatching ABI. */
5986 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5987 }
5988 }
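
/* Example (illustrative): for a 32-bit call to

     int __attribute__((fastcall)) f (int a, int b, int c);

   the code above sets cum->nregs to 2 and cum->fastcall to 1, so A and
   B travel in %ecx and %edx while C goes on the stack; a variadic
   prototype would instead zero all the register counts.  */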
5989
5990 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5991 But in the case of vector types, it is some vector mode.
5992
5993 When we have only some of our vector isa extensions enabled, then there
5994 are some modes for which vector_mode_supported_p is false. For these
5995 modes, the generic vector support in gcc will choose some non-vector mode
5996 in order to implement the type. By computing the natural mode, we'll
5997 select the proper ABI location for the operand and not depend on whatever
5998 the middle-end decides to do with these vector types.
5999
6000 The middle-end can't deal with vector types > 16 bytes. In this
6001 case, we return the original mode and warn about the ABI change if
6002 CUM isn't NULL. */
6003
6004 static enum machine_mode
6005 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6006 {
6007 enum machine_mode mode = TYPE_MODE (type);
6008
6009 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6010 {
6011 HOST_WIDE_INT size = int_size_in_bytes (type);
6012 if ((size == 8 || size == 16 || size == 32)
6013 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6014 && TYPE_VECTOR_SUBPARTS (type) > 1)
6015 {
6016 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6017
6018 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6019 mode = MIN_MODE_VECTOR_FLOAT;
6020 else
6021 mode = MIN_MODE_VECTOR_INT;
6022
6023 /* Get the mode which has this inner mode and number of units. */
6024 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6025 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6026 && GET_MODE_INNER (mode) == innermode)
6027 {
6028 if (size == 32 && !TARGET_AVX)
6029 {
6030 static bool warnedavx;
6031
6032 if (cum
6033 && !warnedavx
6034 && cum->warn_avx)
6035 {
6036 warnedavx = true;
6037 warning (0, "AVX vector argument without AVX "
6038 "enabled changes the ABI");
6039 }
6040 return TYPE_MODE (type);
6041 }
6042 else if ((size == 8 || size == 16) && !TARGET_SSE)
6043 {
6044 static bool warnedsse;
6045
6046 if (cum
6047 && !warnedsse
6048 && cum->warn_sse)
6049 {
6050 warnedsse = true;
6051 warning (0, "SSE vector argument without SSE "
6052 "enabled changes the ABI");
6053 }
6054 return mode;
6055 }
6056 else
6057 return mode;
6058 }
6059
6060 gcc_unreachable ();
6061 }
6062 }
6063
6064 return mode;
6065 }
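
/* Example (illustrative): an argument declared as

     typedef int v4si __attribute__((vector_size (16)));

   is not given a vector mode when SSE is disabled; the loop above
   still computes V4SImode as its natural mode, returns it, and emits
   the "SSE vector argument without SSE enabled changes the ABI"
   warning the first time such an argument is seen.  */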
6066
6067 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6068 this may not agree with the mode that the type system has chosen for the
6069 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6070 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6071
6072 static rtx
6073 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6074 unsigned int regno)
6075 {
6076 rtx tmp;
6077
6078 if (orig_mode != BLKmode)
6079 tmp = gen_rtx_REG (orig_mode, regno);
6080 else
6081 {
6082 tmp = gen_rtx_REG (mode, regno);
6083 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6084 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6085 }
6086
6087 return tmp;
6088 }
6089
6090 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
6091 of this code is to classify each 8bytes of incoming argument by the register
6092 class and assign registers accordingly. */
6093
6094 /* Return the union class of CLASS1 and CLASS2.
6095 See the x86-64 PS ABI for details. */
6096
6097 static enum x86_64_reg_class
6098 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6099 {
6100 /* Rule #1: If both classes are equal, this is the resulting class. */
6101 if (class1 == class2)
6102 return class1;
6103
6104 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6105 the other class. */
6106 if (class1 == X86_64_NO_CLASS)
6107 return class2;
6108 if (class2 == X86_64_NO_CLASS)
6109 return class1;
6110
6111 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6112 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6113 return X86_64_MEMORY_CLASS;
6114
6115 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6116 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6117 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6118 return X86_64_INTEGERSI_CLASS;
6119 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6120 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6121 return X86_64_INTEGER_CLASS;
6122
6123 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6124 MEMORY is used. */
6125 if (class1 == X86_64_X87_CLASS
6126 || class1 == X86_64_X87UP_CLASS
6127 || class1 == X86_64_COMPLEX_X87_CLASS
6128 || class2 == X86_64_X87_CLASS
6129 || class2 == X86_64_X87UP_CLASS
6130 || class2 == X86_64_COMPLEX_X87_CLASS)
6131 return X86_64_MEMORY_CLASS;
6132
6133 /* Rule #6: Otherwise class SSE is used. */
6134 return X86_64_SSE_CLASS;
6135 }
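
/* Worked example (illustrative): for struct { int i; float f; } both
   fields land in the same eightbyte; merging X86_64_INTEGERSI_CLASS
   with the float's SSE class yields X86_64_INTEGER_CLASS by Rule #4,
   so the whole struct is passed in a single integer register.  */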
6136
6137 /* Classify the argument of type TYPE and mode MODE.
6138 CLASSES will be filled by the register class used to pass each word
6139 of the operand. The number of words is returned. In case the parameter
6140 should be passed in memory, 0 is returned. As a special case for zero
6141 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6142
6143 BIT_OFFSET is used internally for handling records and specifies the
6144 offset in bits modulo 256 to avoid overflow cases.
6145
6146 See the x86-64 PS ABI for details.
6147 */
6148
6149 static int
6150 classify_argument (enum machine_mode mode, const_tree type,
6151 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6152 {
6153 HOST_WIDE_INT bytes =
6154 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6155 int words
6156 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6157
6158 /* Variable sized entities are always passed/returned in memory. */
6159 if (bytes < 0)
6160 return 0;
6161
6162 if (mode != VOIDmode
6163 && targetm.calls.must_pass_in_stack (mode, type))
6164 return 0;
6165
6166 if (type && AGGREGATE_TYPE_P (type))
6167 {
6168 int i;
6169 tree field;
6170 enum x86_64_reg_class subclasses[MAX_CLASSES];
6171
6172 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6173 if (bytes > 32)
6174 return 0;
6175
6176 for (i = 0; i < words; i++)
6177 classes[i] = X86_64_NO_CLASS;
6178
6179 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6180 signal the memory class, so handle it as a special case. */
6181 if (!words)
6182 {
6183 classes[0] = X86_64_NO_CLASS;
6184 return 1;
6185 }
6186
6187 /* Classify each field of record and merge classes. */
6188 switch (TREE_CODE (type))
6189 {
6190 case RECORD_TYPE:
6191 /* And now merge the fields of structure. */
6192 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6193 {
6194 if (TREE_CODE (field) == FIELD_DECL)
6195 {
6196 int num;
6197
6198 if (TREE_TYPE (field) == error_mark_node)
6199 continue;
6200
6201 /* Bitfields are always classified as integer. Handle them
6202 early, since later code would consider them to be
6203 misaligned integers. */
6204 if (DECL_BIT_FIELD (field))
6205 {
6206 for (i = (int_bit_position (field)
6207 + (bit_offset % 64)) / 8 / 8;
6208 i < ((int_bit_position (field) + (bit_offset % 64))
6209 + tree_low_cst (DECL_SIZE (field), 0)
6210 + 63) / 8 / 8; i++)
6211 classes[i] =
6212 merge_classes (X86_64_INTEGER_CLASS,
6213 classes[i]);
6214 }
6215 else
6216 {
6217 int pos;
6218
6219 type = TREE_TYPE (field);
6220
6221 /* Flexible array member is ignored. */
6222 if (TYPE_MODE (type) == BLKmode
6223 && TREE_CODE (type) == ARRAY_TYPE
6224 && TYPE_SIZE (type) == NULL_TREE
6225 && TYPE_DOMAIN (type) != NULL_TREE
6226 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6227 == NULL_TREE))
6228 {
6229 static bool warned;
6230
6231 if (!warned && warn_psabi)
6232 {
6233 warned = true;
6234 inform (input_location,
6235 "the ABI of passing struct with"
6236 " a flexible array member has"
6237 " changed in GCC 4.4");
6238 }
6239 continue;
6240 }
6241 num = classify_argument (TYPE_MODE (type), type,
6242 subclasses,
6243 (int_bit_position (field)
6244 + bit_offset) % 256);
6245 if (!num)
6246 return 0;
6247 pos = (int_bit_position (field)
6248 + (bit_offset % 64)) / 8 / 8;
6249 for (i = 0; i < num && (i + pos) < words; i++)
6250 classes[i + pos] =
6251 merge_classes (subclasses[i], classes[i + pos]);
6252 }
6253 }
6254 }
6255 break;
6256
6257 case ARRAY_TYPE:
6258 /* Arrays are handled as small records. */
6259 {
6260 int num;
6261 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6262 TREE_TYPE (type), subclasses, bit_offset);
6263 if (!num)
6264 return 0;
6265
6266 /* The partial classes are now full classes. */
6267 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6268 subclasses[0] = X86_64_SSE_CLASS;
6269 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6270 && !((bit_offset % 64) == 0 && bytes == 4))
6271 subclasses[0] = X86_64_INTEGER_CLASS;
6272
6273 for (i = 0; i < words; i++)
6274 classes[i] = subclasses[i % num];
6275
6276 break;
6277 }
6278 case UNION_TYPE:
6279 case QUAL_UNION_TYPE:
6280 /* Unions are similar to RECORD_TYPE but offset is always 0.
6281 */
6282 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6283 {
6284 if (TREE_CODE (field) == FIELD_DECL)
6285 {
6286 int num;
6287
6288 if (TREE_TYPE (field) == error_mark_node)
6289 continue;
6290
6291 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6292 TREE_TYPE (field), subclasses,
6293 bit_offset);
6294 if (!num)
6295 return 0;
6296 for (i = 0; i < num; i++)
6297 classes[i] = merge_classes (subclasses[i], classes[i]);
6298 }
6299 }
6300 break;
6301
6302 default:
6303 gcc_unreachable ();
6304 }
6305
6306 if (words > 2)
6307 {
6308 /* When size > 16 bytes, if the first one isn't
6309 X86_64_SSE_CLASS or any other ones aren't
6310 X86_64_SSEUP_CLASS, everything should be passed in
6311 memory. */
6312 if (classes[0] != X86_64_SSE_CLASS)
6313 return 0;
6314
6315 for (i = 1; i < words; i++)
6316 if (classes[i] != X86_64_SSEUP_CLASS)
6317 return 0;
6318 }
6319
6320 /* Final merger cleanup. */
6321 for (i = 0; i < words; i++)
6322 {
6323 /* If one class is MEMORY, everything should be passed in
6324 memory. */
6325 if (classes[i] == X86_64_MEMORY_CLASS)
6326 return 0;
6327
6328 /* The X86_64_SSEUP_CLASS should always be preceded by
6329 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6330 if (classes[i] == X86_64_SSEUP_CLASS
6331 && classes[i - 1] != X86_64_SSE_CLASS
6332 && classes[i - 1] != X86_64_SSEUP_CLASS)
6333 {
6334 /* The first one should never be X86_64_SSEUP_CLASS. */
6335 gcc_assert (i != 0);
6336 classes[i] = X86_64_SSE_CLASS;
6337 }
6338
6339 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6340 everything should be passed in memory. */
6341 if (classes[i] == X86_64_X87UP_CLASS
6342 && (classes[i - 1] != X86_64_X87_CLASS))
6343 {
6344 static bool warned;
6345
6346 /* The first one should never be X86_64_X87UP_CLASS. */
6347 gcc_assert (i != 0);
6348 if (!warned && warn_psabi)
6349 {
6350 warned = true;
6351 inform (input_location,
6352 "the ABI of passing union with long double"
6353 " has changed in GCC 4.4");
6354 }
6355 return 0;
6356 }
6357 }
6358 return words;
6359 }
6360
6361 /* Compute the alignment needed. We align all types to natural boundaries,
6362 with the exception of XFmode, which is aligned to 64 bits. */
6363 if (mode != VOIDmode && mode != BLKmode)
6364 {
6365 int mode_alignment = GET_MODE_BITSIZE (mode);
6366
6367 if (mode == XFmode)
6368 mode_alignment = 128;
6369 else if (mode == XCmode)
6370 mode_alignment = 256;
6371 if (COMPLEX_MODE_P (mode))
6372 mode_alignment /= 2;
6373 /* Misaligned fields are always returned in memory. */
6374 if (bit_offset % mode_alignment)
6375 return 0;
6376 }
6377
6378 /* for V1xx modes, just use the base mode */
6379 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6380 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6381 mode = GET_MODE_INNER (mode);
6382
6383 /* Classification of atomic types. */
6384 switch (mode)
6385 {
6386 case SDmode:
6387 case DDmode:
6388 classes[0] = X86_64_SSE_CLASS;
6389 return 1;
6390 case TDmode:
6391 classes[0] = X86_64_SSE_CLASS;
6392 classes[1] = X86_64_SSEUP_CLASS;
6393 return 2;
6394 case DImode:
6395 case SImode:
6396 case HImode:
6397 case QImode:
6398 case CSImode:
6399 case CHImode:
6400 case CQImode:
6401 {
6402 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6403
6404 if (size <= 32)
6405 {
6406 classes[0] = X86_64_INTEGERSI_CLASS;
6407 return 1;
6408 }
6409 else if (size <= 64)
6410 {
6411 classes[0] = X86_64_INTEGER_CLASS;
6412 return 1;
6413 }
6414 else if (size <= 64+32)
6415 {
6416 classes[0] = X86_64_INTEGER_CLASS;
6417 classes[1] = X86_64_INTEGERSI_CLASS;
6418 return 2;
6419 }
6420 else if (size <= 64+64)
6421 {
6422 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6423 return 2;
6424 }
6425 else
6426 gcc_unreachable ();
6427 }
6428 case CDImode:
6429 case TImode:
6430 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6431 return 2;
6432 case COImode:
6433 case OImode:
6434 /* OImode shouldn't be used directly. */
6435 gcc_unreachable ();
6436 case CTImode:
6437 return 0;
6438 case SFmode:
6439 if (!(bit_offset % 64))
6440 classes[0] = X86_64_SSESF_CLASS;
6441 else
6442 classes[0] = X86_64_SSE_CLASS;
6443 return 1;
6444 case DFmode:
6445 classes[0] = X86_64_SSEDF_CLASS;
6446 return 1;
6447 case XFmode:
6448 classes[0] = X86_64_X87_CLASS;
6449 classes[1] = X86_64_X87UP_CLASS;
6450 return 2;
6451 case TFmode:
6452 classes[0] = X86_64_SSE_CLASS;
6453 classes[1] = X86_64_SSEUP_CLASS;
6454 return 2;
6455 case SCmode:
6456 classes[0] = X86_64_SSE_CLASS;
6457 if (!(bit_offset % 64))
6458 return 1;
6459 else
6460 {
6461 static bool warned;
6462
6463 if (!warned && warn_psabi)
6464 {
6465 warned = true;
6466 inform (input_location,
6467 "the ABI of passing structure with complex float"
6468 " member has changed in GCC 4.4");
6469 }
6470 classes[1] = X86_64_SSESF_CLASS;
6471 return 2;
6472 }
6473 case DCmode:
6474 classes[0] = X86_64_SSEDF_CLASS;
6475 classes[1] = X86_64_SSEDF_CLASS;
6476 return 2;
6477 case XCmode:
6478 classes[0] = X86_64_COMPLEX_X87_CLASS;
6479 return 1;
6480 case TCmode:
6481 /* This mode is larger than 16 bytes. */
6482 return 0;
6483 case V8SFmode:
6484 case V8SImode:
6485 case V32QImode:
6486 case V16HImode:
6487 case V4DFmode:
6488 case V4DImode:
6489 classes[0] = X86_64_SSE_CLASS;
6490 classes[1] = X86_64_SSEUP_CLASS;
6491 classes[2] = X86_64_SSEUP_CLASS;
6492 classes[3] = X86_64_SSEUP_CLASS;
6493 return 4;
6494 case V4SFmode:
6495 case V4SImode:
6496 case V16QImode:
6497 case V8HImode:
6498 case V2DFmode:
6499 case V2DImode:
6500 classes[0] = X86_64_SSE_CLASS;
6501 classes[1] = X86_64_SSEUP_CLASS;
6502 return 2;
6503 case V1TImode:
6504 case V1DImode:
6505 case V2SFmode:
6506 case V2SImode:
6507 case V4HImode:
6508 case V8QImode:
6509 classes[0] = X86_64_SSE_CLASS;
6510 return 1;
6511 case BLKmode:
6512 case VOIDmode:
6513 return 0;
6514 default:
6515 gcc_assert (VECTOR_MODE_P (mode));
6516
6517 if (bytes > 16)
6518 return 0;
6519
6520 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6521
6522 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6523 classes[0] = X86_64_INTEGERSI_CLASS;
6524 else
6525 classes[0] = X86_64_INTEGER_CLASS;
6526 classes[1] = X86_64_INTEGER_CLASS;
6527 return 1 + (bytes > 8);
6528 }
6529 }
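
/* Worked examples (illustrative): struct { double d; int i; }
   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS } and is
   passed in one SSE and one integer register, whereas
   struct { long double ld; } classifies as { X86_64_X87_CLASS,
   X86_64_X87UP_CLASS } and is therefore passed in memory
   (examine_argument below returns 0 for it when not a return value).  */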
6530
6531 /* Examine the argument and set the number of registers required in each
6532 class. Return 0 iff the parameter should be passed in memory. */
6533 static int
6534 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6535 int *int_nregs, int *sse_nregs)
6536 {
6537 enum x86_64_reg_class regclass[MAX_CLASSES];
6538 int n = classify_argument (mode, type, regclass, 0);
6539
6540 *int_nregs = 0;
6541 *sse_nregs = 0;
6542 if (!n)
6543 return 0;
6544 for (n--; n >= 0; n--)
6545 switch (regclass[n])
6546 {
6547 case X86_64_INTEGER_CLASS:
6548 case X86_64_INTEGERSI_CLASS:
6549 (*int_nregs)++;
6550 break;
6551 case X86_64_SSE_CLASS:
6552 case X86_64_SSESF_CLASS:
6553 case X86_64_SSEDF_CLASS:
6554 (*sse_nregs)++;
6555 break;
6556 case X86_64_NO_CLASS:
6557 case X86_64_SSEUP_CLASS:
6558 break;
6559 case X86_64_X87_CLASS:
6560 case X86_64_X87UP_CLASS:
6561 if (!in_return)
6562 return 0;
6563 break;
6564 case X86_64_COMPLEX_X87_CLASS:
6565 return in_return ? 2 : 0;
6566 case X86_64_MEMORY_CLASS:
6567 gcc_unreachable ();
6568 }
6569 return 1;
6570 }
6571
6572 /* Construct container for the argument used by GCC interface. See
6573 FUNCTION_ARG for the detailed description. */
6574
6575 static rtx
6576 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6577 const_tree type, int in_return, int nintregs, int nsseregs,
6578 const int *intreg, int sse_regno)
6579 {
6580 /* The following variables hold the static issued_error state. */
6581 static bool issued_sse_arg_error;
6582 static bool issued_sse_ret_error;
6583 static bool issued_x87_ret_error;
6584
6585 enum machine_mode tmpmode;
6586 int bytes =
6587 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6588 enum x86_64_reg_class regclass[MAX_CLASSES];
6589 int n;
6590 int i;
6591 int nexps = 0;
6592 int needed_sseregs, needed_intregs;
6593 rtx exp[MAX_CLASSES];
6594 rtx ret;
6595
6596 n = classify_argument (mode, type, regclass, 0);
6597 if (!n)
6598 return NULL;
6599 if (!examine_argument (mode, type, in_return, &needed_intregs,
6600 &needed_sseregs))
6601 return NULL;
6602 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6603 return NULL;
6604
6605 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6606 some less clueful developer tries to use floating-point anyway. */
6607 if (needed_sseregs && !TARGET_SSE)
6608 {
6609 if (in_return)
6610 {
6611 if (!issued_sse_ret_error)
6612 {
6613 error ("SSE register return with SSE disabled");
6614 issued_sse_ret_error = true;
6615 }
6616 }
6617 else if (!issued_sse_arg_error)
6618 {
6619 error ("SSE register argument with SSE disabled");
6620 issued_sse_arg_error = true;
6621 }
6622 return NULL;
6623 }
6624
6625 /* Likewise, error if the ABI requires us to return values in the
6626 x87 registers and the user specified -mno-80387. */
6627 if (!TARGET_80387 && in_return)
6628 for (i = 0; i < n; i++)
6629 if (regclass[i] == X86_64_X87_CLASS
6630 || regclass[i] == X86_64_X87UP_CLASS
6631 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6632 {
6633 if (!issued_x87_ret_error)
6634 {
6635 error ("x87 register return with x87 disabled");
6636 issued_x87_ret_error = true;
6637 }
6638 return NULL;
6639 }
6640
6641 /* First construct simple cases. Avoid SCmode, since we want to use a
6642 single register to pass this type. */
6643 if (n == 1 && mode != SCmode)
6644 switch (regclass[0])
6645 {
6646 case X86_64_INTEGER_CLASS:
6647 case X86_64_INTEGERSI_CLASS:
6648 return gen_rtx_REG (mode, intreg[0]);
6649 case X86_64_SSE_CLASS:
6650 case X86_64_SSESF_CLASS:
6651 case X86_64_SSEDF_CLASS:
6652 if (mode != BLKmode)
6653 return gen_reg_or_parallel (mode, orig_mode,
6654 SSE_REGNO (sse_regno));
6655 break;
6656 case X86_64_X87_CLASS:
6657 case X86_64_COMPLEX_X87_CLASS:
6658 return gen_rtx_REG (mode, FIRST_STACK_REG);
6659 case X86_64_NO_CLASS:
6660 /* Zero sized array, struct or class. */
6661 return NULL;
6662 default:
6663 gcc_unreachable ();
6664 }
6665 if (n == 2
6666 && regclass[0] == X86_64_SSE_CLASS
6667 && regclass[1] == X86_64_SSEUP_CLASS
6668 && mode != BLKmode)
6669 return gen_reg_or_parallel (mode, orig_mode,
6670 SSE_REGNO (sse_regno));
6671 if (n == 4
6672 && regclass[0] == X86_64_SSE_CLASS
6673 && regclass[1] == X86_64_SSEUP_CLASS
6674 && regclass[2] == X86_64_SSEUP_CLASS
6675 && regclass[3] == X86_64_SSEUP_CLASS
6676 && mode != BLKmode)
6677 return gen_reg_or_parallel (mode, orig_mode,
6678 SSE_REGNO (sse_regno));
6679 if (n == 2
6680 && regclass[0] == X86_64_X87_CLASS
6681 && regclass[1] == X86_64_X87UP_CLASS)
6682 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6683
6684 if (n == 2
6685 && regclass[0] == X86_64_INTEGER_CLASS
6686 && regclass[1] == X86_64_INTEGER_CLASS
6687 && (mode == CDImode || mode == TImode || mode == TFmode)
6688 && intreg[0] + 1 == intreg[1])
6689 return gen_rtx_REG (mode, intreg[0]);
6690
6691 /* Otherwise figure out the entries of the PARALLEL. */
6692 for (i = 0; i < n; i++)
6693 {
6694 int pos;
6695
6696 switch (regclass[i])
6697 {
6698 case X86_64_NO_CLASS:
6699 break;
6700 case X86_64_INTEGER_CLASS:
6701 case X86_64_INTEGERSI_CLASS:
6702 /* Merge TImodes on aligned occasions here too. */
6703 if (i * 8 + 8 > bytes)
6704 tmpmode
6705 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6706 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6707 tmpmode = SImode;
6708 else
6709 tmpmode = DImode;
6710 /* We've been asked for a chunk whose size has no
6711 corresponding integer mode. Fall back to DImode. */
6712 if (tmpmode == BLKmode)
6713 tmpmode = DImode;
6714 exp [nexps++]
6715 = gen_rtx_EXPR_LIST (VOIDmode,
6716 gen_rtx_REG (tmpmode, *intreg),
6717 GEN_INT (i*8));
6718 intreg++;
6719 break;
6720 case X86_64_SSESF_CLASS:
6721 exp [nexps++]
6722 = gen_rtx_EXPR_LIST (VOIDmode,
6723 gen_rtx_REG (SFmode,
6724 SSE_REGNO (sse_regno)),
6725 GEN_INT (i*8));
6726 sse_regno++;
6727 break;
6728 case X86_64_SSEDF_CLASS:
6729 exp [nexps++]
6730 = gen_rtx_EXPR_LIST (VOIDmode,
6731 gen_rtx_REG (DFmode,
6732 SSE_REGNO (sse_regno)),
6733 GEN_INT (i*8));
6734 sse_regno++;
6735 break;
6736 case X86_64_SSE_CLASS:
6737 pos = i;
6738 switch (n)
6739 {
6740 case 1:
6741 tmpmode = DImode;
6742 break;
6743 case 2:
6744 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6745 {
6746 tmpmode = TImode;
6747 i++;
6748 }
6749 else
6750 tmpmode = DImode;
6751 break;
6752 case 4:
6753 gcc_assert (i == 0
6754 && regclass[1] == X86_64_SSEUP_CLASS
6755 && regclass[2] == X86_64_SSEUP_CLASS
6756 && regclass[3] == X86_64_SSEUP_CLASS);
6757 tmpmode = OImode;
6758 i += 3;
6759 break;
6760 default:
6761 gcc_unreachable ();
6762 }
6763 exp [nexps++]
6764 = gen_rtx_EXPR_LIST (VOIDmode,
6765 gen_rtx_REG (tmpmode,
6766 SSE_REGNO (sse_regno)),
6767 GEN_INT (pos*8));
6768 sse_regno++;
6769 break;
6770 default:
6771 gcc_unreachable ();
6772 }
6773 }
6774
6775 /* Empty aligned struct, union or class. */
6776 if (nexps == 0)
6777 return NULL;
6778
6779 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6780 for (i = 0; i < nexps; i++)
6781 XVECEXP (ret, 0, i) = exp [i];
6782 return ret;
6783 }
6784
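/* An illustrative sketch (editorial addition, not part of the original
   source): for the struct s example above, construct_container falls
   through to the PARALLEL loop and returns roughly

	(parallel:BLK [(expr_list (reg:DF xmm0) (const_int 0))
		       (expr_list (reg:DI di) (const_int 8))])

   i.e. one EXPR_LIST per eightbyte, pairing the register that carries
   each piece with that piece's byte offset inside the argument.  */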
6785 /* Update the data in CUM to advance over an argument of mode MODE
6786 and data type TYPE. (TYPE is null for libcalls where that information
6787 may not be available.) */
6788
6789 static void
6790 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6791 const_tree type, HOST_WIDE_INT bytes,
6792 HOST_WIDE_INT words)
6793 {
6794 switch (mode)
6795 {
6796 default:
6797 break;
6798
6799 case BLKmode:
6800 if (bytes < 0)
6801 break;
6802 /* FALLTHRU */
6803
6804 case DImode:
6805 case SImode:
6806 case HImode:
6807 case QImode:
6808 cum->words += words;
6809 cum->nregs -= words;
6810 cum->regno += words;
6811
6812 if (cum->nregs <= 0)
6813 {
6814 cum->nregs = 0;
6815 cum->regno = 0;
6816 }
6817 break;
6818
6819 case OImode:
6820 /* OImode shouldn't be used directly. */
6821 gcc_unreachable ();
6822
6823 case DFmode:
6824 if (cum->float_in_sse < 2)
6825 break;
6826 case SFmode:
6827 if (cum->float_in_sse < 1)
6828 break;
6829 /* FALLTHRU */
6830
6831 case V8SFmode:
6832 case V8SImode:
6833 case V32QImode:
6834 case V16HImode:
6835 case V4DFmode:
6836 case V4DImode:
6837 case TImode:
6838 case V16QImode:
6839 case V8HImode:
6840 case V4SImode:
6841 case V2DImode:
6842 case V4SFmode:
6843 case V2DFmode:
6844 if (!type || !AGGREGATE_TYPE_P (type))
6845 {
6846 cum->sse_words += words;
6847 cum->sse_nregs -= 1;
6848 cum->sse_regno += 1;
6849 if (cum->sse_nregs <= 0)
6850 {
6851 cum->sse_nregs = 0;
6852 cum->sse_regno = 0;
6853 }
6854 }
6855 break;
6856
6857 case V8QImode:
6858 case V4HImode:
6859 case V2SImode:
6860 case V2SFmode:
6861 case V1TImode:
6862 case V1DImode:
6863 if (!type || !AGGREGATE_TYPE_P (type))
6864 {
6865 cum->mmx_words += words;
6866 cum->mmx_nregs -= 1;
6867 cum->mmx_regno += 1;
6868 if (cum->mmx_nregs <= 0)
6869 {
6870 cum->mmx_nregs = 0;
6871 cum->mmx_regno = 0;
6872 }
6873 }
6874 break;
6875 }
6876 }
6877
6878 static void
6879 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6880 const_tree type, HOST_WIDE_INT words, bool named)
6881 {
6882 int int_nregs, sse_nregs;
6883
6884 /* Unnamed 256bit vector mode parameters are passed on stack. */
6885 if (!named && VALID_AVX256_REG_MODE (mode))
6886 return;
6887
6888 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6889 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6890 {
6891 cum->nregs -= int_nregs;
6892 cum->sse_nregs -= sse_nregs;
6893 cum->regno += int_nregs;
6894 cum->sse_regno += sse_nregs;
6895 }
6896 else
6897 {
6898 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6899 cum->words = (cum->words + align - 1) & ~(align - 1);
6900 cum->words += words;
6901 }
6902 }
6903
6904 static void
6905 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6906 HOST_WIDE_INT words)
6907 {
6908 /* Otherwise, this should be passed indirect. */
6909 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6910
6911 cum->words += words;
6912 if (cum->nregs > 0)
6913 {
6914 cum->nregs -= 1;
6915 cum->regno += 1;
6916 }
6917 }
6918
6919 /* Update the data in CUM to advance over an argument of mode MODE and
6920 data type TYPE. (TYPE is null for libcalls where that information
6921 may not be available.) */
6922
6923 static void
6924 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6925 const_tree type, bool named)
6926 {
6927 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6928 HOST_WIDE_INT bytes, words;
6929
6930 if (mode == BLKmode)
6931 bytes = int_size_in_bytes (type);
6932 else
6933 bytes = GET_MODE_SIZE (mode);
6934 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6935
6936 if (type)
6937 mode = type_natural_mode (type, NULL);
6938
6939 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6940 function_arg_advance_ms_64 (cum, bytes, words);
6941 else if (TARGET_64BIT)
6942 function_arg_advance_64 (cum, mode, type, words, named);
6943 else
6944 function_arg_advance_32 (cum, mode, type, bytes, words);
6945 }
6946
6947 /* Define where to put the arguments to a function.
6948 Value is zero to push the argument on the stack,
6949 or a hard register in which to store the argument.
6950
6951 MODE is the argument's machine mode.
6952 TYPE is the data type of the argument (as a tree).
6953 This is null for libcalls where that information may
6954 not be available.
6955 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6956 the preceding args and about the function being called.
6957 NAMED is nonzero if this argument is a named parameter
6958 (otherwise it is an extra parameter matching an ellipsis). */
6959
6960 static rtx
6961 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6962 enum machine_mode orig_mode, const_tree type,
6963 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6964 {
6965 static bool warnedsse, warnedmmx;
6966
6967 /* Avoid the AL settings for the Unix64 ABI. */
6968 if (mode == VOIDmode)
6969 return constm1_rtx;
6970
6971 switch (mode)
6972 {
6973 default:
6974 break;
6975
6976 case BLKmode:
6977 if (bytes < 0)
6978 break;
6979 /* FALLTHRU */
6980 case DImode:
6981 case SImode:
6982 case HImode:
6983 case QImode:
6984 if (words <= cum->nregs)
6985 {
6986 int regno = cum->regno;
6987
6988 /* Fastcall allocates the first two DWORD (SImode) or
6989 smaller arguments to ECX and EDX, provided the argument
6990 is not an aggregate type. */
6991 if (cum->fastcall)
6992 {
6993 if (mode == BLKmode
6994 || mode == DImode
6995 || (type && AGGREGATE_TYPE_P (type)))
6996 break;
6997
6998 /* ECX, not EAX, is the first allocated register. */
6999 if (regno == AX_REG)
7000 regno = CX_REG;
7001 }
7002 return gen_rtx_REG (mode, regno);
7003 }
7004 break;
7005
7006 case DFmode:
7007 if (cum->float_in_sse < 2)
7008 break;
7009 case SFmode:
7010 if (cum->float_in_sse < 1)
7011 break;
7012 /* FALLTHRU */
7013 case TImode:
7014 /* In 32bit, we pass TImode in xmm registers. */
7015 case V16QImode:
7016 case V8HImode:
7017 case V4SImode:
7018 case V2DImode:
7019 case V4SFmode:
7020 case V2DFmode:
7021 if (!type || !AGGREGATE_TYPE_P (type))
7022 {
7023 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7024 {
7025 warnedsse = true;
7026 warning (0, "SSE vector argument without SSE enabled "
7027 "changes the ABI");
7028 }
7029 if (cum->sse_nregs)
7030 return gen_reg_or_parallel (mode, orig_mode,
7031 cum->sse_regno + FIRST_SSE_REG);
7032 }
7033 break;
7034
7035 case OImode:
7036 /* OImode shouldn't be used directly. */
7037 gcc_unreachable ();
7038
7039 case V8SFmode:
7040 case V8SImode:
7041 case V32QImode:
7042 case V16HImode:
7043 case V4DFmode:
7044 case V4DImode:
7045 if (!type || !AGGREGATE_TYPE_P (type))
7046 {
7047 if (cum->sse_nregs)
7048 return gen_reg_or_parallel (mode, orig_mode,
7049 cum->sse_regno + FIRST_SSE_REG);
7050 }
7051 break;
7052
7053 case V8QImode:
7054 case V4HImode:
7055 case V2SImode:
7056 case V2SFmode:
7057 case V1TImode:
7058 case V1DImode:
7059 if (!type || !AGGREGATE_TYPE_P (type))
7060 {
7061 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7062 {
7063 warnedmmx = true;
7064 warning (0, "MMX vector argument without MMX enabled "
7065 "changes the ABI");
7066 }
7067 if (cum->mmx_nregs)
7068 return gen_reg_or_parallel (mode, orig_mode,
7069 cum->mmx_regno + FIRST_MMX_REG);
7070 }
7071 break;
7072 }
7073
7074 return NULL_RTX;
7075 }
7076
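/* An illustrative sketch (editorial addition, not part of the original
   source): the fastcall handling above corresponds to a declaration
   such as

	int __attribute__((fastcall)) f (int a, int b, int c);

   where A arrives in %ecx, B in %edx and C on the stack; note how the
   code swaps EAX for ECX so that ECX, not EAX, is the first register
   handed out.  */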
7077 static rtx
7078 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7079 enum machine_mode orig_mode, const_tree type, bool named)
7080 {
7081 /* Handle a hidden AL argument containing number of registers
7082 for varargs x86-64 functions. */
7083 if (mode == VOIDmode)
7084 return GEN_INT (cum->maybe_vaarg
7085 ? (cum->sse_nregs < 0
7086 ? X86_64_SSE_REGPARM_MAX
7087 : cum->sse_regno)
7088 : -1);
7089
7090 switch (mode)
7091 {
7092 default:
7093 break;
7094
7095 case V8SFmode:
7096 case V8SImode:
7097 case V32QImode:
7098 case V16HImode:
7099 case V4DFmode:
7100 case V4DImode:
7101 /* Unnamed 256bit vector mode parameters are passed on stack. */
7102 if (!named)
7103 return NULL;
7104 break;
7105 }
7106
7107 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7108 cum->sse_nregs,
7109 &x86_64_int_parameter_registers [cum->regno],
7110 cum->sse_regno);
7111 }
7112
7113 static rtx
7114 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7115 enum machine_mode orig_mode, bool named,
7116 HOST_WIDE_INT bytes)
7117 {
7118 unsigned int regno;
7119
7120 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7121 We use the value -2 to specify that the current function call uses the MS ABI. */
7122 if (mode == VOIDmode)
7123 return GEN_INT (-2);
7124
7125 /* If we've run out of registers, it goes on the stack. */
7126 if (cum->nregs == 0)
7127 return NULL_RTX;
7128
7129 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7130
7131 /* Only floating point modes are passed in anything but integer regs. */
7132 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7133 {
7134 if (named)
7135 regno = cum->regno + FIRST_SSE_REG;
7136 else
7137 {
7138 rtx t1, t2;
7139
7140 /* Unnamed floating parameters are passed in both the
7141 SSE and integer registers. */
7142 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7143 t2 = gen_rtx_REG (mode, regno);
7144 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7145 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7146 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7147 }
7148 }
7149 /* Handle aggregate types passed in a register. */
7150 if (orig_mode == BLKmode)
7151 {
7152 if (bytes > 0 && bytes <= 8)
7153 mode = (bytes > 4 ? DImode : SImode);
7154 if (mode == BLKmode)
7155 mode = DImode;
7156 }
7157
7158 return gen_reg_or_parallel (mode, orig_mode, regno);
7159 }
7160
7161 /* Return where to put the arguments to a function.
7162 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7163
7164 MODE is the argument's machine mode. TYPE is the data type of the
7165 argument. It is null for libcalls where that information may not be
7166 available. CUM gives information about the preceding args and about
7167 the function being called. NAMED is nonzero if this argument is a
7168 named parameter (otherwise it is an extra parameter matching an
7169 ellipsis). */
7170
7171 static rtx
7172 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7173 const_tree type, bool named)
7174 {
7175 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7176 enum machine_mode mode = omode;
7177 HOST_WIDE_INT bytes, words;
7178 rtx arg;
7179
7180 if (mode == BLKmode)
7181 bytes = int_size_in_bytes (type);
7182 else
7183 bytes = GET_MODE_SIZE (mode);
7184 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7185
7186 /* To simplify the code below, represent vector types with a vector mode
7187 even if MMX/SSE are not active. */
7188 if (type && TREE_CODE (type) == VECTOR_TYPE)
7189 mode = type_natural_mode (type, cum);
7190
7191 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7192 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7193 else if (TARGET_64BIT)
7194 arg = function_arg_64 (cum, mode, omode, type, named);
7195 else
7196 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7197
7198 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7199 {
7200 /* This argument uses 256bit AVX modes. */
7201 if (cum->caller)
7202 cfun->machine->callee_pass_avx256_p = true;
7203 else
7204 cfun->machine->caller_pass_avx256_p = true;
7205 }
7206
7207 return arg;
7208 }
7209
7210 /* A C expression that indicates when an argument must be passed by
7211 reference. If nonzero for an argument, a copy of that argument is
7212 made in memory and a pointer to the argument is passed instead of
7213 the argument itself. The pointer is passed in whatever way is
7214 appropriate for passing a pointer to that type. */
7215
7216 static bool
7217 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7218 enum machine_mode mode ATTRIBUTE_UNUSED,
7219 const_tree type, bool named ATTRIBUTE_UNUSED)
7220 {
7221 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7222
7223 /* See Windows x64 Software Convention. */
7224 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7225 {
7226 int msize = (int) GET_MODE_SIZE (mode);
7227 if (type)
7228 {
7229 /* Arrays are passed by reference. */
7230 if (TREE_CODE (type) == ARRAY_TYPE)
7231 return true;
7232
7233 if (AGGREGATE_TYPE_P (type))
7234 {
7235 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7236 are passed by reference. */
7237 msize = int_size_in_bytes (type);
7238 }
7239 }
7240
7241 /* __m128 is passed by reference. */
7242 switch (msize) {
7243 case 1: case 2: case 4: case 8:
7244 break;
7245 default:
7246 return true;
7247 }
7248 }
7249 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7250 return 1;
7251
7252 return 0;
7253 }
7254
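/* An illustrative sketch (editorial addition, not part of the original
   source): under the Windows x64 convention checked above, only
   aggregates of exactly 1, 2, 4 or 8 bytes are passed by value, e.g.

	struct small { int a, b; };	    8 bytes  - passed by value
	struct big   { int a, b, c; };	   12 bytes  - passed by reference

   so a call copies the second struct into caller-owned memory and
   passes a pointer to that copy instead.  */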
7255 /* Return true when TYPE should be 128bit aligned for 32bit argument
7256 passing ABI. XXX: This function is obsolete and is only used for
7257 checking psABI compatibility with previous versions of GCC. */
7258
7259 static bool
7260 ix86_compat_aligned_value_p (const_tree type)
7261 {
7262 enum machine_mode mode = TYPE_MODE (type);
7263 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7264 || mode == TDmode
7265 || mode == TFmode
7266 || mode == TCmode)
7267 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7268 return true;
7269 if (TYPE_ALIGN (type) < 128)
7270 return false;
7271
7272 if (AGGREGATE_TYPE_P (type))
7273 {
7274 /* Walk the aggregates recursively. */
7275 switch (TREE_CODE (type))
7276 {
7277 case RECORD_TYPE:
7278 case UNION_TYPE:
7279 case QUAL_UNION_TYPE:
7280 {
7281 tree field;
7282
7283 /* Walk all the structure fields. */
7284 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7285 {
7286 if (TREE_CODE (field) == FIELD_DECL
7287 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7288 return true;
7289 }
7290 break;
7291 }
7292
7293 case ARRAY_TYPE:
7294 /* Just for use if some languages pass arrays by value. */
7295 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7296 return true;
7297 break;
7298
7299 default:
7300 gcc_unreachable ();
7301 }
7302 }
7303 return false;
7304 }
7305
7306 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7307 XXX: This function is obsolete and is only used for checking psABI
7308 compatibility with previous versions of GCC. */
7309
7310 static unsigned int
7311 ix86_compat_function_arg_boundary (enum machine_mode mode,
7312 const_tree type, unsigned int align)
7313 {
7314 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7315 natural boundaries. */
7316 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7317 {
7318 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7319 make an exception for SSE modes since these require 128bit
7320 alignment.
7321
7322 The handling here differs from field_alignment. ICC aligns MMX
7323 arguments to 4 byte boundaries, while structure fields are aligned
7324 to 8 byte boundaries. */
7325 if (!type)
7326 {
7327 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7328 align = PARM_BOUNDARY;
7329 }
7330 else
7331 {
7332 if (!ix86_compat_aligned_value_p (type))
7333 align = PARM_BOUNDARY;
7334 }
7335 }
7336 if (align > BIGGEST_ALIGNMENT)
7337 align = BIGGEST_ALIGNMENT;
7338 return align;
7339 }
7340
7341 /* Return true when TYPE should be 128bit aligned for 32bit argument
7342 passing ABI. */
7343
7344 static bool
7345 ix86_contains_aligned_value_p (const_tree type)
7346 {
7347 enum machine_mode mode = TYPE_MODE (type);
7348
7349 if (mode == XFmode || mode == XCmode)
7350 return false;
7351
7352 if (TYPE_ALIGN (type) < 128)
7353 return false;
7354
7355 if (AGGREGATE_TYPE_P (type))
7356 {
7357 /* Walk the aggregates recursively. */
7358 switch (TREE_CODE (type))
7359 {
7360 case RECORD_TYPE:
7361 case UNION_TYPE:
7362 case QUAL_UNION_TYPE:
7363 {
7364 tree field;
7365
7366 /* Walk all the structure fields. */
7367 for (field = TYPE_FIELDS (type);
7368 field;
7369 field = DECL_CHAIN (field))
7370 {
7371 if (TREE_CODE (field) == FIELD_DECL
7372 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7373 return true;
7374 }
7375 break;
7376 }
7377
7378 case ARRAY_TYPE:
7379 /* Just for use if some languages pass arrays by value. */
7380 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7381 return true;
7382 break;
7383
7384 default:
7385 gcc_unreachable ();
7386 }
7387 }
7388 else
7389 return TYPE_ALIGN (type) >= 128;
7390
7391 return false;
7392 }
7393
7394 /* Gives the alignment boundary, in bits, of an argument with the
7395 specified mode and type. */
7396
7397 static unsigned int
7398 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7399 {
7400 unsigned int align;
7401 if (type)
7402 {
7403 /* Since the main variant type is what is used for the call,
7404 convert TYPE to its main variant. */
7405 type = TYPE_MAIN_VARIANT (type);
7406 align = TYPE_ALIGN (type);
7407 }
7408 else
7409 align = GET_MODE_ALIGNMENT (mode);
7410 if (align < PARM_BOUNDARY)
7411 align = PARM_BOUNDARY;
7412 else
7413 {
7414 static bool warned;
7415 unsigned int saved_align = align;
7416
7417 if (!TARGET_64BIT)
7418 {
7419 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7420 if (!type)
7421 {
7422 if (mode == XFmode || mode == XCmode)
7423 align = PARM_BOUNDARY;
7424 }
7425 else if (!ix86_contains_aligned_value_p (type))
7426 align = PARM_BOUNDARY;
7427
7428 if (align < 128)
7429 align = PARM_BOUNDARY;
7430 }
7431
7432 if (warn_psabi
7433 && !warned
7434 && align != ix86_compat_function_arg_boundary (mode, type,
7435 saved_align))
7436 {
7437 warned = true;
7438 inform (input_location,
7439 "The ABI for passing parameters with %d-byte"
7440 " alignment has changed in GCC 4.6",
7441 align / BITS_PER_UNIT);
7442 }
7443 }
7444
7445 return align;
7446 }
7447
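/* An illustrative sketch (editorial addition, not part of the original
   source): for a 32-bit declaration such as

	#include <xmmintrin.h>
	void f (int i, __m128 v);

   the code above keeps I at the 32-bit PARM_BOUNDARY but raises V to a
   128-bit boundary; whenever that answer differs from what the pre-4.6
   logic in ix86_compat_function_arg_boundary would have given, the
   psABI note above is emitted, and only once per compilation.  */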
7448 /* Return true if N is a possible register number of function value. */
7449
7450 static bool
7451 ix86_function_value_regno_p (const unsigned int regno)
7452 {
7453 switch (regno)
7454 {
7455 case AX_REG:
7456 return true;
7457
7458 case FIRST_FLOAT_REG:
7459 /* TODO: The function should depend on current function ABI but
7460 builtins.c would need updating then. Therefore we use the
7461 default ABI. */
7462 if (TARGET_64BIT && ix86_abi == MS_ABI)
7463 return false;
7464 return TARGET_FLOAT_RETURNS_IN_80387;
7465
7466 case FIRST_SSE_REG:
7467 return TARGET_SSE;
7468
7469 case FIRST_MMX_REG:
7470 if (TARGET_MACHO || TARGET_64BIT)
7471 return false;
7472 return TARGET_MMX;
7473 }
7474
7475 return false;
7476 }
7477
7478 /* Define how to find the value returned by a function.
7479 VALTYPE is the data type of the value (as a tree).
7480 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7481 otherwise, FUNC is 0. */
7482
7483 static rtx
7484 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7485 const_tree fntype, const_tree fn)
7486 {
7487 unsigned int regno;
7488
7489 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7490 we normally prevent this case when mmx is not available. However
7491 some ABIs may require the result to be returned like DImode. */
7492 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7493 regno = FIRST_MMX_REG;
7494
7495 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7496 we prevent this case when sse is not available. However some ABIs
7497 may require the result to be returned like integer TImode. */
7498 else if (mode == TImode
7499 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7500 regno = FIRST_SSE_REG;
7501
7502 /* 32-byte vector modes in %ymm0. */
7503 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7504 regno = FIRST_SSE_REG;
7505
7506 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7507 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7508 regno = FIRST_FLOAT_REG;
7509 else
7510 /* Most things go in %eax. */
7511 regno = AX_REG;
7512
7513 /* Override FP return register with %xmm0 for local functions when
7514 SSE math is enabled or for functions with sseregparm attribute. */
7515 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7516 {
7517 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7518 if ((sse_level >= 1 && mode == SFmode)
7519 || (sse_level == 2 && mode == DFmode))
7520 regno = FIRST_SSE_REG;
7521 }
7522
7523 /* OImode shouldn't be used directly. */
7524 gcc_assert (mode != OImode);
7525
7526 return gen_rtx_REG (orig_mode, regno);
7527 }
7528
7529 static rtx
7530 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7531 const_tree valtype)
7532 {
7533 rtx ret;
7534
7535 /* Handle libcalls, which don't provide a type node. */
7536 if (valtype == NULL)
7537 {
7538 unsigned int regno;
7539
7540 switch (mode)
7541 {
7542 case SFmode:
7543 case SCmode:
7544 case DFmode:
7545 case DCmode:
7546 case TFmode:
7547 case SDmode:
7548 case DDmode:
7549 case TDmode:
7550 regno = FIRST_SSE_REG;
7551 break;
7552 case XFmode:
7553 case XCmode:
7554 regno = FIRST_FLOAT_REG;
7555 break;
7556 case TCmode:
7557 return NULL;
7558 default:
7559 regno = AX_REG;
7560 }
7561
7562 return gen_rtx_REG (mode, regno);
7563 }
7564 else if (POINTER_TYPE_P (valtype))
7565 {
7566 /* Pointers are always returned in word_mode. */
7567 mode = word_mode;
7568 }
7569
7570 ret = construct_container (mode, orig_mode, valtype, 1,
7571 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7572 x86_64_int_return_registers, 0);
7573
7574 /* For zero-sized structures, construct_container returns NULL, but we
7575 need to keep the rest of the compiler happy by returning a meaningful value. */
7576 if (!ret)
7577 ret = gen_rtx_REG (orig_mode, AX_REG);
7578
7579 return ret;
7580 }
7581
7582 static rtx
7583 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7584 {
7585 unsigned int regno = AX_REG;
7586
7587 if (TARGET_SSE)
7588 {
7589 switch (GET_MODE_SIZE (mode))
7590 {
7591 case 16:
7592 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7593 && !COMPLEX_MODE_P (mode))
7594 regno = FIRST_SSE_REG;
7595 break;
7596 case 8:
7597 case 4:
7598 if (mode == SFmode || mode == DFmode)
7599 regno = FIRST_SSE_REG;
7600 break;
7601 default:
7602 break;
7603 }
7604 }
7605 return gen_rtx_REG (orig_mode, regno);
7606 }
7607
7608 static rtx
7609 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7610 enum machine_mode orig_mode, enum machine_mode mode)
7611 {
7612 const_tree fn, fntype;
7613
7614 fn = NULL_TREE;
7615 if (fntype_or_decl && DECL_P (fntype_or_decl))
7616 fn = fntype_or_decl;
7617 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7618
7619 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7620 return function_value_ms_64 (orig_mode, mode);
7621 else if (TARGET_64BIT)
7622 return function_value_64 (orig_mode, mode, valtype);
7623 else
7624 return function_value_32 (orig_mode, mode, fntype, fn);
7625 }
7626
7627 static rtx
7628 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7629 bool outgoing ATTRIBUTE_UNUSED)
7630 {
7631 enum machine_mode mode, orig_mode;
7632
7633 orig_mode = TYPE_MODE (valtype);
7634 mode = type_natural_mode (valtype, NULL);
7635 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7636 }
7637
7638 /* Pointer function arguments and return values are promoted to
7639 word_mode. */
7640
7641 static enum machine_mode
7642 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7643 int *punsignedp, const_tree fntype,
7644 int for_return)
7645 {
7646 if (type != NULL_TREE && POINTER_TYPE_P (type))
7647 {
7648 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7649 return word_mode;
7650 }
7651 return default_promote_function_mode (type, mode, punsignedp, fntype,
7652 for_return);
7653 }
7654
7655 /* Return true if a structure, union or array with MODE containing FIELD
7656 should be accessed using BLKmode. */
7657
7658 static bool
7659 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7660 {
7661 /* Union with XFmode must be in BLKmode. */
7662 return (mode == XFmode
7663 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7664 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7665 }
7666
7667 rtx
7668 ix86_libcall_value (enum machine_mode mode)
7669 {
7670 return ix86_function_value_1 (NULL, NULL, mode, mode);
7671 }
7672
7673 /* Return true iff type is returned in memory. */
7674
7675 static bool ATTRIBUTE_UNUSED
7676 return_in_memory_32 (const_tree type, enum machine_mode mode)
7677 {
7678 HOST_WIDE_INT size;
7679
7680 if (mode == BLKmode)
7681 return true;
7682
7683 size = int_size_in_bytes (type);
7684
7685 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7686 return false;
7687
7688 if (VECTOR_MODE_P (mode) || mode == TImode)
7689 {
7690 /* User-created vectors small enough to fit in EAX. */
7691 if (size < 8)
7692 return false;
7693
7694 /* MMX/3dNow values are returned in MM0,
7695 except when MMX doesn't exist or the ABI prescribes otherwise. */
7696 if (size == 8)
7697 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7698
7699 /* SSE values are returned in XMM0, except when it doesn't exist. */
7700 if (size == 16)
7701 return !TARGET_SSE;
7702
7703 /* AVX values are returned in YMM0, except when it doesn't exist. */
7704 if (size == 32)
7705 return !TARGET_AVX;
7706 }
7707
7708 if (mode == XFmode)
7709 return false;
7710
7711 if (size > 12)
7712 return true;
7713
7714 /* OImode shouldn't be used directly. */
7715 gcc_assert (mode != OImode);
7716
7717 return false;
7718 }
7719
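/* An illustrative sketch (editorial addition, not part of the original
   source): a 16-byte vector return such as

	#include <xmmintrin.h>
	__m128 f (void);

   comes back in %xmm0 when SSE is enabled but degrades to a
   hidden-pointer memory return under -mno-sse, which is the
   "size == 16 => !TARGET_SSE" case above and the situation
   ix86_struct_value_rtx warns about further down.  */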
7720 static bool ATTRIBUTE_UNUSED
7721 return_in_memory_64 (const_tree type, enum machine_mode mode)
7722 {
7723 int needed_intregs, needed_sseregs;
7724 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7725 }
7726
7727 static bool ATTRIBUTE_UNUSED
7728 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7729 {
7730 HOST_WIDE_INT size = int_size_in_bytes (type);
7731
7732 /* __m128 is returned in xmm0. */
7733 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7734 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7735 return false;
7736
7737 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7738 return size != 1 && size != 2 && size != 4 && size != 8;
7739 }
7740
7741 static bool
7742 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7743 {
7744 #ifdef SUBTARGET_RETURN_IN_MEMORY
7745 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7746 #else
7747 const enum machine_mode mode = type_natural_mode (type, NULL);
7748
7749 if (TARGET_64BIT)
7750 {
7751 if (ix86_function_type_abi (fntype) == MS_ABI)
7752 return return_in_memory_ms_64 (type, mode);
7753 else
7754 return return_in_memory_64 (type, mode);
7755 }
7756 else
7757 return return_in_memory_32 (type, mode);
7758 #endif
7759 }
7760
7761 /* When returning SSE vector types, we have a choice of either
7762 (1) being ABI incompatible with a -march switch, or
7763 (2) generating an error.
7764 Given no good solution, I think the safest thing is one warning.
7765 The user won't be able to use -Werror, but....
7766
7767 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7768 called in response to actually generating a caller or callee that
7769 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7770 via aggregate_value_p for general type probing from tree-ssa. */
7771
7772 static rtx
7773 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7774 {
7775 static bool warnedsse, warnedmmx;
7776
7777 if (!TARGET_64BIT && type)
7778 {
7779 /* Look at the return type of the function, not the function type. */
7780 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7781
7782 if (!TARGET_SSE && !warnedsse)
7783 {
7784 if (mode == TImode
7785 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7786 {
7787 warnedsse = true;
7788 warning (0, "SSE vector return without SSE enabled "
7789 "changes the ABI");
7790 }
7791 }
7792
7793 if (!TARGET_MMX && !warnedmmx)
7794 {
7795 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7796 {
7797 warnedmmx = true;
7798 warning (0, "MMX vector return without MMX enabled "
7799 "changes the ABI");
7800 }
7801 }
7802 }
7803
7804 return NULL;
7805 }
7806
7807 \f
7808 /* Create the va_list data type. */
7809
7810 /* Returns the calling convention specific va_list data type.
7811 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7812
7813 static tree
7814 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7815 {
7816 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7817
7818 /* For i386 we use plain pointer to argument area. */
7819 if (!TARGET_64BIT || abi == MS_ABI)
7820 return build_pointer_type (char_type_node);
7821
7822 record = lang_hooks.types.make_type (RECORD_TYPE);
7823 type_decl = build_decl (BUILTINS_LOCATION,
7824 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7825
7826 f_gpr = build_decl (BUILTINS_LOCATION,
7827 FIELD_DECL, get_identifier ("gp_offset"),
7828 unsigned_type_node);
7829 f_fpr = build_decl (BUILTINS_LOCATION,
7830 FIELD_DECL, get_identifier ("fp_offset"),
7831 unsigned_type_node);
7832 f_ovf = build_decl (BUILTINS_LOCATION,
7833 FIELD_DECL, get_identifier ("overflow_arg_area"),
7834 ptr_type_node);
7835 f_sav = build_decl (BUILTINS_LOCATION,
7836 FIELD_DECL, get_identifier ("reg_save_area"),
7837 ptr_type_node);
7838
7839 va_list_gpr_counter_field = f_gpr;
7840 va_list_fpr_counter_field = f_fpr;
7841
7842 DECL_FIELD_CONTEXT (f_gpr) = record;
7843 DECL_FIELD_CONTEXT (f_fpr) = record;
7844 DECL_FIELD_CONTEXT (f_ovf) = record;
7845 DECL_FIELD_CONTEXT (f_sav) = record;
7846
7847 TYPE_STUB_DECL (record) = type_decl;
7848 TYPE_NAME (record) = type_decl;
7849 TYPE_FIELDS (record) = f_gpr;
7850 DECL_CHAIN (f_gpr) = f_fpr;
7851 DECL_CHAIN (f_fpr) = f_ovf;
7852 DECL_CHAIN (f_ovf) = f_sav;
7853
7854 layout_type (record);
7855
7856 /* The correct type is an array type of one element. */
7857 return build_array_type (record, build_index_type (size_zero_node));
7858 }
7859
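/* An illustrative sketch (editorial addition, not part of the original
   source): the record built above is the type the psABI describes as

	typedef struct __va_list_tag {
	  unsigned int gp_offset;
	  unsigned int fp_offset;
	  void *overflow_arg_area;
	  void *reg_save_area;
	} __builtin_va_list[1];

   i.e. a one-element array of a four-field record, which is why the
   function finishes with build_array_type of the laid-out record.  */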
7860 /* Set up the builtin va_list data type and, for 64-bit, the additional
7861 calling convention specific va_list data types. */
7862
7863 static tree
7864 ix86_build_builtin_va_list (void)
7865 {
7866 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7867
7868 /* Initialize abi specific va_list builtin types. */
7869 if (TARGET_64BIT)
7870 {
7871 tree t;
7872 if (ix86_abi == MS_ABI)
7873 {
7874 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7875 if (TREE_CODE (t) != RECORD_TYPE)
7876 t = build_variant_type_copy (t);
7877 sysv_va_list_type_node = t;
7878 }
7879 else
7880 {
7881 t = ret;
7882 if (TREE_CODE (t) != RECORD_TYPE)
7883 t = build_variant_type_copy (t);
7884 sysv_va_list_type_node = t;
7885 }
7886 if (ix86_abi != MS_ABI)
7887 {
7888 t = ix86_build_builtin_va_list_abi (MS_ABI);
7889 if (TREE_CODE (t) != RECORD_TYPE)
7890 t = build_variant_type_copy (t);
7891 ms_va_list_type_node = t;
7892 }
7893 else
7894 {
7895 t = ret;
7896 if (TREE_CODE (t) != RECORD_TYPE)
7897 t = build_variant_type_copy (t);
7898 ms_va_list_type_node = t;
7899 }
7900 }
7901
7902 return ret;
7903 }
7904
7905 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7906
7907 static void
7908 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7909 {
7910 rtx save_area, mem;
7911 alias_set_type set;
7912 int i, max;
7913
7914 /* GPR size of varargs save area. */
7915 if (cfun->va_list_gpr_size)
7916 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7917 else
7918 ix86_varargs_gpr_size = 0;
7919
7920 /* FPR size of varargs save area. We don't need it if we don't pass
7921 anything in SSE registers. */
7922 if (TARGET_SSE && cfun->va_list_fpr_size)
7923 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7924 else
7925 ix86_varargs_fpr_size = 0;
7926
7927 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7928 return;
7929
7930 save_area = frame_pointer_rtx;
7931 set = get_varargs_alias_set ();
7932
7933 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7934 if (max > X86_64_REGPARM_MAX)
7935 max = X86_64_REGPARM_MAX;
7936
7937 for (i = cum->regno; i < max; i++)
7938 {
7939 mem = gen_rtx_MEM (word_mode,
7940 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7941 MEM_NOTRAP_P (mem) = 1;
7942 set_mem_alias_set (mem, set);
7943 emit_move_insn (mem,
7944 gen_rtx_REG (word_mode,
7945 x86_64_int_parameter_registers[i]));
7946 }
7947
7948 if (ix86_varargs_fpr_size)
7949 {
7950 enum machine_mode smode;
7951 rtx label, test;
7952
7953 /* Now emit code to save SSE registers. The AX parameter contains the
7954 number of SSE parameter registers used to call this function, though all
7955 we actually check here is the zero/non-zero status. */
7956
7957 label = gen_label_rtx ();
7958 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7959 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7960 label));
7961
7962 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7963 we used movdqa (i.e. TImode) instead? Perhaps even better would
7964 be if we could determine the real mode of the data, via a hook
7965 into pass_stdarg. Ignore all that for now. */
7966 smode = V4SFmode;
7967 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7968 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7969
7970 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7971 if (max > X86_64_SSE_REGPARM_MAX)
7972 max = X86_64_SSE_REGPARM_MAX;
7973
7974 for (i = cum->sse_regno; i < max; ++i)
7975 {
7976 mem = plus_constant (Pmode, save_area,
7977 i * 16 + ix86_varargs_gpr_size);
7978 mem = gen_rtx_MEM (smode, mem);
7979 MEM_NOTRAP_P (mem) = 1;
7980 set_mem_alias_set (mem, set);
7981 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7982
7983 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7984 }
7985
7986 emit_label (label);
7987 }
7988 }
7989
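/* An illustrative sketch (editorial addition, not part of the original
   source): relative to the reg_save_area pointer later stored by
   va_start, the block saved above is laid out as

	bytes   0 ..  47   %rdi %rsi %rdx %rcx %r8 %r9    (8 bytes each)
	bytes  48 .. 175   %xmm0 .. %xmm7                 (16 bytes each)

   and the SSE half is written only when %al was non-zero at the call,
   which is what the cbranchqi4 test on AX_REG skips past.  */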
7990 static void
7991 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7992 {
7993 alias_set_type set = get_varargs_alias_set ();
7994 int i;
7995
7996 /* Reset to zero, as a SysV va_arg may have been used
7997 before. */
7998 ix86_varargs_gpr_size = 0;
7999 ix86_varargs_fpr_size = 0;
8000
8001 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8002 {
8003 rtx reg, mem;
8004
8005 mem = gen_rtx_MEM (Pmode,
8006 plus_constant (Pmode, virtual_incoming_args_rtx,
8007 i * UNITS_PER_WORD));
8008 MEM_NOTRAP_P (mem) = 1;
8009 set_mem_alias_set (mem, set);
8010
8011 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8012 emit_move_insn (mem, reg);
8013 }
8014 }
8015
8016 static void
8017 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8018 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8019 int no_rtl)
8020 {
8021 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8022 CUMULATIVE_ARGS next_cum;
8023 tree fntype;
8024
8025 /* This argument doesn't appear to be used anymore. Which is good,
8026 because the old code here didn't suppress rtl generation. */
8027 gcc_assert (!no_rtl);
8028
8029 if (!TARGET_64BIT)
8030 return;
8031
8032 fntype = TREE_TYPE (current_function_decl);
8033
8034 /* For varargs, we do not want to skip the dummy va_dcl argument.
8035 For stdargs, we do want to skip the last named argument. */
8036 next_cum = *cum;
8037 if (stdarg_p (fntype))
8038 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8039 true);
8040
8041 if (cum->call_abi == MS_ABI)
8042 setup_incoming_varargs_ms_64 (&next_cum);
8043 else
8044 setup_incoming_varargs_64 (&next_cum);
8045 }
8046
8047 /* Checks if TYPE is of kind va_list char *. */
8048
8049 static bool
8050 is_va_list_char_pointer (tree type)
8051 {
8052 tree canonic;
8053
8054 /* For 32-bit it is always true. */
8055 if (!TARGET_64BIT)
8056 return true;
8057 canonic = ix86_canonical_va_list_type (type);
8058 return (canonic == ms_va_list_type_node
8059 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8060 }
8061
8062 /* Implement va_start. */
8063
8064 static void
8065 ix86_va_start (tree valist, rtx nextarg)
8066 {
8067 HOST_WIDE_INT words, n_gpr, n_fpr;
8068 tree f_gpr, f_fpr, f_ovf, f_sav;
8069 tree gpr, fpr, ovf, sav, t;
8070 tree type;
8071 rtx ovf_rtx;
8072
8073 if (flag_split_stack
8074 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8075 {
8076 unsigned int scratch_regno;
8077
8078 /* When we are splitting the stack, we can't refer to the stack
8079 arguments using internal_arg_pointer, because they may be on
8080 the old stack. The split stack prologue will arrange to
8081 leave a pointer to the old stack arguments in a scratch
8082 register, which we here copy to a pseudo-register. The split
8083 stack prologue can't set the pseudo-register directly because
8084 it (the prologue) runs before any registers have been saved. */
8085
8086 scratch_regno = split_stack_prologue_scratch_regno ();
8087 if (scratch_regno != INVALID_REGNUM)
8088 {
8089 rtx reg, seq;
8090
8091 reg = gen_reg_rtx (Pmode);
8092 cfun->machine->split_stack_varargs_pointer = reg;
8093
8094 start_sequence ();
8095 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8096 seq = get_insns ();
8097 end_sequence ();
8098
8099 push_topmost_sequence ();
8100 emit_insn_after (seq, entry_of_function ());
8101 pop_topmost_sequence ();
8102 }
8103 }
8104
8105 /* Only 64bit target needs something special. */
8106 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8107 {
8108 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8109 std_expand_builtin_va_start (valist, nextarg);
8110 else
8111 {
8112 rtx va_r, next;
8113
8114 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8115 next = expand_binop (ptr_mode, add_optab,
8116 cfun->machine->split_stack_varargs_pointer,
8117 crtl->args.arg_offset_rtx,
8118 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8119 convert_move (va_r, next, 0);
8120 }
8121 return;
8122 }
8123
8124 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8125 f_fpr = DECL_CHAIN (f_gpr);
8126 f_ovf = DECL_CHAIN (f_fpr);
8127 f_sav = DECL_CHAIN (f_ovf);
8128
8129 valist = build_simple_mem_ref (valist);
8130 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8131 /* The following should be folded into the MEM_REF offset. */
8132 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8133 f_gpr, NULL_TREE);
8134 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8135 f_fpr, NULL_TREE);
8136 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8137 f_ovf, NULL_TREE);
8138 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8139 f_sav, NULL_TREE);
8140
8141 /* Count number of gp and fp argument registers used. */
8142 words = crtl->args.info.words;
8143 n_gpr = crtl->args.info.regno;
8144 n_fpr = crtl->args.info.sse_regno;
8145
8146 if (cfun->va_list_gpr_size)
8147 {
8148 type = TREE_TYPE (gpr);
8149 t = build2 (MODIFY_EXPR, type,
8150 gpr, build_int_cst (type, n_gpr * 8));
8151 TREE_SIDE_EFFECTS (t) = 1;
8152 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8153 }
8154
8155 if (TARGET_SSE && cfun->va_list_fpr_size)
8156 {
8157 type = TREE_TYPE (fpr);
8158 t = build2 (MODIFY_EXPR, type, fpr,
8159 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8160 TREE_SIDE_EFFECTS (t) = 1;
8161 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8162 }
8163
8164 /* Find the overflow area. */
8165 type = TREE_TYPE (ovf);
8166 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8167 ovf_rtx = crtl->args.internal_arg_pointer;
8168 else
8169 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8170 t = make_tree (type, ovf_rtx);
8171 if (words != 0)
8172 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8173 t = build2 (MODIFY_EXPR, type, ovf, t);
8174 TREE_SIDE_EFFECTS (t) = 1;
8175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8176
8177 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8178 {
8179 /* Find the register save area.
8180 The function prologue saves it right above the stack frame. */
8181 type = TREE_TYPE (sav);
8182 t = make_tree (type, frame_pointer_rtx);
8183 if (!ix86_varargs_gpr_size)
8184 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8185 t = build2 (MODIFY_EXPR, type, sav, t);
8186 TREE_SIDE_EFFECTS (t) = 1;
8187 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8188 }
8189 }
8190
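/* An illustrative sketch (editorial addition, not part of the original
   source): for

	#include <stdarg.h>
	int sum (int n, ...)
	{
	  va_list ap;
	  int s = 0;
	  va_start (ap, n);
	  while (n-- > 0)
	    s += va_arg (ap, int);
	  va_end (ap);
	  return s;
	}

   the expansion above initializes gp_offset to 8 (one GPR already taken
   by N), fp_offset to 48 (straight past the six GPR slots, no SSE
   registers used), overflow_arg_area to the first stack-passed argument
   and reg_save_area to the block stored by the prologue.  */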
8191 /* Implement va_arg. */
8192
8193 static tree
8194 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8195 gimple_seq *post_p)
8196 {
8197 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8198 tree f_gpr, f_fpr, f_ovf, f_sav;
8199 tree gpr, fpr, ovf, sav, t;
8200 int size, rsize;
8201 tree lab_false, lab_over = NULL_TREE;
8202 tree addr, t2;
8203 rtx container;
8204 int indirect_p = 0;
8205 tree ptrtype;
8206 enum machine_mode nat_mode;
8207 unsigned int arg_boundary;
8208
8209 /* Only 64bit target needs something special. */
8210 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8211 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8212
8213 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8214 f_fpr = DECL_CHAIN (f_gpr);
8215 f_ovf = DECL_CHAIN (f_fpr);
8216 f_sav = DECL_CHAIN (f_ovf);
8217
8218 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8219 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8220 valist = build_va_arg_indirect_ref (valist);
8221 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8222 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8223 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8224
8225 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8226 if (indirect_p)
8227 type = build_pointer_type (type);
8228 size = int_size_in_bytes (type);
8229 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8230
8231 nat_mode = type_natural_mode (type, NULL);
8232 switch (nat_mode)
8233 {
8234 case V8SFmode:
8235 case V8SImode:
8236 case V32QImode:
8237 case V16HImode:
8238 case V4DFmode:
8239 case V4DImode:
8240 /* Unnamed 256bit vector mode parameters are passed on stack. */
8241 if (!TARGET_64BIT_MS_ABI)
8242 {
8243 container = NULL;
8244 break;
8245 }
8246
8247 default:
8248 container = construct_container (nat_mode, TYPE_MODE (type),
8249 type, 0, X86_64_REGPARM_MAX,
8250 X86_64_SSE_REGPARM_MAX, intreg,
8251 0);
8252 break;
8253 }
8254
8255 /* Pull the value out of the saved registers. */
8256
8257 addr = create_tmp_var (ptr_type_node, "addr");
8258
8259 if (container)
8260 {
8261 int needed_intregs, needed_sseregs;
8262 bool need_temp;
8263 tree int_addr, sse_addr;
8264
8265 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8266 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8267
8268 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8269
8270 need_temp = (!REG_P (container)
8271 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8272 || TYPE_ALIGN (type) > 128));
8273
8274 /* When passing a structure, verify that it forms a consecutive block
8275 in the register save area. If not, we need to do moves. */
8276 if (!need_temp && !REG_P (container))
8277 {
8278 /* Verify that all registers are strictly consecutive */
8279 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8280 {
8281 int i;
8282
8283 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8284 {
8285 rtx slot = XVECEXP (container, 0, i);
8286 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8287 || INTVAL (XEXP (slot, 1)) != i * 16)
8288 need_temp = 1;
8289 }
8290 }
8291 else
8292 {
8293 int i;
8294
8295 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8296 {
8297 rtx slot = XVECEXP (container, 0, i);
8298 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8299 || INTVAL (XEXP (slot, 1)) != i * 8)
8300 need_temp = 1;
8301 }
8302 }
8303 }
8304 if (!need_temp)
8305 {
8306 int_addr = addr;
8307 sse_addr = addr;
8308 }
8309 else
8310 {
8311 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8312 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8313 }
8314
8315 /* First ensure that we fit completely in registers. */
8316 if (needed_intregs)
8317 {
8318 t = build_int_cst (TREE_TYPE (gpr),
8319 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8320 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8321 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8322 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8323 gimplify_and_add (t, pre_p);
8324 }
8325 if (needed_sseregs)
8326 {
8327 t = build_int_cst (TREE_TYPE (fpr),
8328 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8329 + X86_64_REGPARM_MAX * 8);
8330 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8331 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8332 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8333 gimplify_and_add (t, pre_p);
8334 }
8335
8336 /* Compute index to start of area used for integer regs. */
8337 if (needed_intregs)
8338 {
8339 /* int_addr = gpr + sav; */
8340 t = fold_build_pointer_plus (sav, gpr);
8341 gimplify_assign (int_addr, t, pre_p);
8342 }
8343 if (needed_sseregs)
8344 {
8345 /* sse_addr = fpr + sav; */
8346 t = fold_build_pointer_plus (sav, fpr);
8347 gimplify_assign (sse_addr, t, pre_p);
8348 }
8349 if (need_temp)
8350 {
8351 int i, prev_size = 0;
8352 tree temp = create_tmp_var (type, "va_arg_tmp");
8353
8354 /* addr = &temp; */
8355 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8356 gimplify_assign (addr, t, pre_p);
8357
8358 for (i = 0; i < XVECLEN (container, 0); i++)
8359 {
8360 rtx slot = XVECEXP (container, 0, i);
8361 rtx reg = XEXP (slot, 0);
8362 enum machine_mode mode = GET_MODE (reg);
8363 tree piece_type;
8364 tree addr_type;
8365 tree daddr_type;
8366 tree src_addr, src;
8367 int src_offset;
8368 tree dest_addr, dest;
8369 int cur_size = GET_MODE_SIZE (mode);
8370
8371 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8372 prev_size = INTVAL (XEXP (slot, 1));
8373 if (prev_size + cur_size > size)
8374 {
8375 cur_size = size - prev_size;
8376 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8377 if (mode == BLKmode)
8378 mode = QImode;
8379 }
8380 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8381 if (mode == GET_MODE (reg))
8382 addr_type = build_pointer_type (piece_type);
8383 else
8384 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8385 true);
8386 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8387 true);
8388
8389 if (SSE_REGNO_P (REGNO (reg)))
8390 {
8391 src_addr = sse_addr;
8392 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8393 }
8394 else
8395 {
8396 src_addr = int_addr;
8397 src_offset = REGNO (reg) * 8;
8398 }
8399 src_addr = fold_convert (addr_type, src_addr);
8400 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8401
8402 dest_addr = fold_convert (daddr_type, addr);
8403 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8404 if (cur_size == GET_MODE_SIZE (mode))
8405 {
8406 src = build_va_arg_indirect_ref (src_addr);
8407 dest = build_va_arg_indirect_ref (dest_addr);
8408
8409 gimplify_assign (dest, src, pre_p);
8410 }
8411 else
8412 {
8413 tree copy
8414 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8415 3, dest_addr, src_addr,
8416 size_int (cur_size));
8417 gimplify_and_add (copy, pre_p);
8418 }
8419 prev_size += cur_size;
8420 }
8421 }
8422
8423 if (needed_intregs)
8424 {
8425 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8426 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8427 gimplify_assign (gpr, t, pre_p);
8428 }
8429
8430 if (needed_sseregs)
8431 {
8432 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8433 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8434 gimplify_assign (fpr, t, pre_p);
8435 }
8436
8437 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8438
8439 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8440 }
8441
8442 /* ... otherwise out of the overflow area. */
8443
8444 /* When the caller aligns a parameter on the stack, any alignment
8445 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8446 MAX_SUPPORTED_STACK_ALIGNMENT. Match the caller's behavior here
8447 in the callee. */
8448 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8449 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8450 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8451
8452 /* Care for on-stack alignment if needed. */
8453 if (arg_boundary <= 64 || size == 0)
8454 t = ovf;
8455 else
8456 {
8457 HOST_WIDE_INT align = arg_boundary / 8;
8458 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8459 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8460 build_int_cst (TREE_TYPE (t), -align));
8461 }
8462
8463 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8464 gimplify_assign (addr, t, pre_p);
8465
8466 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8467 gimplify_assign (unshare_expr (ovf), t, pre_p);
8468
8469 if (container)
8470 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8471
8472 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8473 addr = fold_convert (ptrtype, addr);
8474
8475 if (indirect_p)
8476 addr = build_va_arg_indirect_ref (addr);
8477 return build_va_arg_indirect_ref (addr);
8478 }
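/* An illustrative sketch (editorial addition, not part of the original
   source): for va_arg (ap, double) the gimplification above produces,
   in pseudo C, roughly

	if (ap->fp_offset >= 48 + 8 * 16)
	  goto stack;
	addr = ap->reg_save_area + ap->fp_offset;
	ap->fp_offset += 16;
	goto done;
      stack:
	addr = ap->overflow_arg_area;
	ap->overflow_arg_area += 8;
      done:
	result = *(double *) addr;

   matching the needed_sseregs bound of
   (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
   + X86_64_REGPARM_MAX * 8 computed above.  */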
8479 \f
8480 /* Return true if OPNUM's MEM should be matched
8481 in movabs* patterns. */
8482
8483 bool
8484 ix86_check_movabs (rtx insn, int opnum)
8485 {
8486 rtx set, mem;
8487
8488 set = PATTERN (insn);
8489 if (GET_CODE (set) == PARALLEL)
8490 set = XVECEXP (set, 0, 0);
8491 gcc_assert (GET_CODE (set) == SET);
8492 mem = XEXP (set, opnum);
8493 while (GET_CODE (mem) == SUBREG)
8494 mem = SUBREG_REG (mem);
8495 gcc_assert (MEM_P (mem));
8496 return volatile_ok || !MEM_VOLATILE_P (mem);
8497 }
8498 \f
8499 /* Initialize the table of extra 80387 mathematical constants. */
8500
8501 static void
8502 init_ext_80387_constants (void)
8503 {
8504 static const char * cst[5] =
8505 {
8506 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8507 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8508 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8509 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8510 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8511 };
8512 int i;
8513
8514 for (i = 0; i < 5; i++)
8515 {
8516 real_from_string (&ext_80387_constants_table[i], cst[i]);
8517 /* Ensure each constant is rounded to XFmode precision. */
8518 real_convert (&ext_80387_constants_table[i],
8519 XFmode, &ext_80387_constants_table[i]);
8520 }
8521
8522 ext_80387_constants_init = 1;
8523 }
8524
8525 /* Return non-zero if the constant is something that
8526 can be loaded with a special instruction. */
8527
8528 int
8529 standard_80387_constant_p (rtx x)
8530 {
8531 enum machine_mode mode = GET_MODE (x);
8532
8533 REAL_VALUE_TYPE r;
8534
8535 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8536 return -1;
8537
8538 if (x == CONST0_RTX (mode))
8539 return 1;
8540 if (x == CONST1_RTX (mode))
8541 return 2;
8542
8543 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8544
8545 /* For XFmode constants, try to find a special 80387 instruction when
8546 optimizing for size or on those CPUs that benefit from them. */
8547 if (mode == XFmode
8548 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8549 {
8550 int i;
8551
8552 if (! ext_80387_constants_init)
8553 init_ext_80387_constants ();
8554
8555 for (i = 0; i < 5; i++)
8556 if (real_identical (&r, &ext_80387_constants_table[i]))
8557 return i + 3;
8558 }
8559
8560 /* A load of the constant -0.0 or -1.0 will be split into an
8561 fldz;fchs or fld1;fchs sequence. */
8562 if (real_isnegzero (&r))
8563 return 8;
8564 if (real_identical (&r, &dconstm1))
8565 return 9;
8566
8567 return 0;
8568 }
8569
8570 /* Return the opcode of the special instruction to be used to load
8571 the constant X. */
8572
8573 const char *
8574 standard_80387_constant_opcode (rtx x)
8575 {
8576 switch (standard_80387_constant_p (x))
8577 {
8578 case 1:
8579 return "fldz";
8580 case 2:
8581 return "fld1";
8582 case 3:
8583 return "fldlg2";
8584 case 4:
8585 return "fldln2";
8586 case 5:
8587 return "fldl2e";
8588 case 6:
8589 return "fldl2t";
8590 case 7:
8591 return "fldpi";
8592 case 8:
8593 case 9:
8594 return "#";
8595 default:
8596 gcc_unreachable ();
8597 }
8598 }
8599
8600 /* Return the CONST_DOUBLE representing the 80387 constant that is
8601 loaded by the specified special instruction. The argument IDX
8602 matches the return value from standard_80387_constant_p. */
8603
8604 rtx
8605 standard_80387_constant_rtx (int idx)
8606 {
8607 int i;
8608
8609 if (! ext_80387_constants_init)
8610 init_ext_80387_constants ();
8611
8612 switch (idx)
8613 {
8614 case 3:
8615 case 4:
8616 case 5:
8617 case 6:
8618 case 7:
8619 i = idx - 3;
8620 break;
8621
8622 default:
8623 gcc_unreachable ();
8624 }
8625
8626 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8627 XFmode);
8628 }
8629
8630 /* Return 1 if X is all 0s and 2 if X is all 1s
8631 in a supported SSE/AVX vector mode. */
8632
8633 int
8634 standard_sse_constant_p (rtx x)
8635 {
8636 enum machine_mode mode = GET_MODE (x);
8637
8638 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8639 return 1;
8640 if (vector_all_ones_operand (x, mode))
8641 switch (mode)
8642 {
8643 case V16QImode:
8644 case V8HImode:
8645 case V4SImode:
8646 case V2DImode:
8647 if (TARGET_SSE2)
8648 return 2;
8649 case V32QImode:
8650 case V16HImode:
8651 case V8SImode:
8652 case V4DImode:
8653 if (TARGET_AVX2)
8654 return 2;
8655 default:
8656 break;
8657 }
8658
8659 return 0;
8660 }
8661
8662 /* Return the opcode of the special instruction to be used to load
8663 the constant X. */
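/* As an illustrative sketch (assuming the usual meaning of the %v and %d
   operand modifiers, which add the AVX 'v' prefix and a duplicated source
   operand), the "%vpxor\t%0, %d0" template for an all-zeros TImode operand
   in %xmm0 comes out as "pxor %xmm0, %xmm0" without AVX and as
   "vpxor %xmm0, %xmm0, %xmm0" with it.  */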
8664
8665 const char *
8666 standard_sse_constant_opcode (rtx insn, rtx x)
8667 {
8668 switch (standard_sse_constant_p (x))
8669 {
8670 case 1:
8671 switch (get_attr_mode (insn))
8672 {
8673 case MODE_TI:
8674 return "%vpxor\t%0, %d0";
8675 case MODE_V2DF:
8676 return "%vxorpd\t%0, %d0";
8677 case MODE_V4SF:
8678 return "%vxorps\t%0, %d0";
8679
8680 case MODE_OI:
8681 return "vpxor\t%x0, %x0, %x0";
8682 case MODE_V4DF:
8683 return "vxorpd\t%x0, %x0, %x0";
8684 case MODE_V8SF:
8685 return "vxorps\t%x0, %x0, %x0";
8686
8687 default:
8688 break;
8689 }
8690
8691 case 2:
8692 if (TARGET_AVX)
8693 return "vpcmpeqd\t%0, %0, %0";
8694 else
8695 return "pcmpeqd\t%0, %0";
8696
8697 default:
8698 break;
8699 }
8700 gcc_unreachable ();
8701 }
8702
8703 /* Return true if OP contains a symbol reference. */
8704
8705 bool
8706 symbolic_reference_mentioned_p (rtx op)
8707 {
8708 const char *fmt;
8709 int i;
8710
8711 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8712 return true;
8713
8714 fmt = GET_RTX_FORMAT (GET_CODE (op));
8715 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8716 {
8717 if (fmt[i] == 'E')
8718 {
8719 int j;
8720
8721 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8722 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8723 return true;
8724 }
8725
8726 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8727 return true;
8728 }
8729
8730 return false;
8731 }
8732
8733 /* Return true if it is appropriate to emit `ret' instructions in the
8734 body of a function. Do this only if the epilogue is simple, needing a
8735 couple of insns. Prior to reloading, we can't tell how many registers
8736 must be saved, so return false then. Return false if there is no frame
8737 marker to de-allocate. */
8738
8739 bool
8740 ix86_can_use_return_insn_p (void)
8741 {
8742 struct ix86_frame frame;
8743
8744 if (! reload_completed || frame_pointer_needed)
8745 return 0;
8746
8747 /* Don't allow more than 32k pop, since that's all we can do
8748 with one instruction. */
8749 if (crtl->args.pops_args && crtl->args.size >= 32768)
8750 return 0;
8751
8752 ix86_compute_frame_layout (&frame);
8753 return (frame.stack_pointer_offset == UNITS_PER_WORD
8754 && (frame.nregs + frame.nsseregs) == 0);
8755 }
8756 \f
8757 /* Value should be nonzero if functions must have frame pointers.
8758 Zero means the frame pointer need not be set up (and parms may
8759 be accessed via the stack pointer) in functions that seem suitable. */
8760
8761 static bool
8762 ix86_frame_pointer_required (void)
8763 {
8764 /* If we accessed previous frames, then the generated code expects
8765 to be able to access the saved ebp value in our frame. */
8766 if (cfun->machine->accesses_prev_frame)
8767 return true;
8768
8769 /* Several x86 os'es need a frame pointer for other reasons,
8770 usually pertaining to setjmp. */
8771 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8772 return true;
8773
8774 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8775 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8776 return true;
8777
8778 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8779 stack allocation is 4GB. */
8780 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8781 return true;
8782
8783 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8784 turns off the frame pointer by default. Turn it back on now if
8785 we've not got a leaf function. */
8786 if (TARGET_OMIT_LEAF_FRAME_POINTER
8787 && (!crtl->is_leaf
8788 || ix86_current_function_calls_tls_descriptor))
8789 return true;
8790
8791 if (crtl->profile && !flag_fentry)
8792 return true;
8793
8794 return false;
8795 }
8796
8797 /* Record that the current function accesses previous call frames. */
8798
8799 void
8800 ix86_setup_frame_addresses (void)
8801 {
8802 cfun->machine->accesses_prev_frame = 1;
8803 }
8804 \f
8805 #ifndef USE_HIDDEN_LINKONCE
8806 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8807 # define USE_HIDDEN_LINKONCE 1
8808 # else
8809 # define USE_HIDDEN_LINKONCE 0
8810 # endif
8811 #endif
8812
8813 static int pic_labels_used;
8814
8815 /* Fills in the label name that should be used for a pc thunk for
8816 the given register. */
8817
8818 static void
8819 get_pc_thunk_name (char name[32], unsigned int regno)
8820 {
8821 gcc_assert (!TARGET_64BIT);
8822
8823 if (USE_HIDDEN_LINKONCE)
8824 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8825 else
8826 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8827 }
8828
8829
8830 /* This function generates the -fpic pc thunks: each one loads its
8831 designated register with the return address of the caller and returns. */
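/* For instance, assuming the thunk for %ebx is requested, the emitted
   code is essentially:

	__x86.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret

   which leaves the address following the caller's call insn in %ebx.  */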
8832
8833 static void
8834 ix86_code_end (void)
8835 {
8836 rtx xops[2];
8837 int regno;
8838
8839 for (regno = AX_REG; regno <= SP_REG; regno++)
8840 {
8841 char name[32];
8842 tree decl;
8843
8844 if (!(pic_labels_used & (1 << regno)))
8845 continue;
8846
8847 get_pc_thunk_name (name, regno);
8848
8849 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8850 get_identifier (name),
8851 build_function_type_list (void_type_node, NULL_TREE));
8852 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8853 NULL_TREE, void_type_node);
8854 TREE_PUBLIC (decl) = 1;
8855 TREE_STATIC (decl) = 1;
8856 DECL_IGNORED_P (decl) = 1;
8857
8858 #if TARGET_MACHO
8859 if (TARGET_MACHO)
8860 {
8861 switch_to_section (darwin_sections[text_coal_section]);
8862 fputs ("\t.weak_definition\t", asm_out_file);
8863 assemble_name (asm_out_file, name);
8864 fputs ("\n\t.private_extern\t", asm_out_file);
8865 assemble_name (asm_out_file, name);
8866 putc ('\n', asm_out_file);
8867 ASM_OUTPUT_LABEL (asm_out_file, name);
8868 DECL_WEAK (decl) = 1;
8869 }
8870 else
8871 #endif
8872 if (USE_HIDDEN_LINKONCE)
8873 {
8874 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8875
8876 targetm.asm_out.unique_section (decl, 0);
8877 switch_to_section (get_named_section (decl, NULL, 0));
8878
8879 targetm.asm_out.globalize_label (asm_out_file, name);
8880 fputs ("\t.hidden\t", asm_out_file);
8881 assemble_name (asm_out_file, name);
8882 putc ('\n', asm_out_file);
8883 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8884 }
8885 else
8886 {
8887 switch_to_section (text_section);
8888 ASM_OUTPUT_LABEL (asm_out_file, name);
8889 }
8890
8891 DECL_INITIAL (decl) = make_node (BLOCK);
8892 current_function_decl = decl;
8893 init_function_start (decl);
8894 first_function_block_is_cold = false;
8895 /* Make sure unwind info is emitted for the thunk if needed. */
8896 final_start_function (emit_barrier (), asm_out_file, 1);
8897
8898 /* Pad stack IP move with 4 instructions (two NOPs count
8899 as one instruction). */
8900 if (TARGET_PAD_SHORT_FUNCTION)
8901 {
8902 int i = 8;
8903
8904 while (i--)
8905 fputs ("\tnop\n", asm_out_file);
8906 }
8907
8908 xops[0] = gen_rtx_REG (Pmode, regno);
8909 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8910 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8911 fputs ("\tret\n", asm_out_file);
8912 final_end_function ();
8913 init_insn_lengths ();
8914 free_after_compilation (cfun);
8915 set_cfun (NULL);
8916 current_function_decl = NULL;
8917 }
8918
8919 if (flag_split_stack)
8920 file_end_indicate_split_stack ();
8921 }
8922
8923 /* Emit code for the SET_GOT patterns. */
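/* As a sketch, for 32-bit -fpic code the usual output is a call to the
   pc thunk followed by an add of the GOT symbol, e.g. when DEST is %ebx:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   (the exact symbol printed depends on GOT_SYMBOL_NAME, and the VxWorks
   RTP and Mach-O paths below differ).  */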
8924
8925 const char *
8926 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8927 {
8928 rtx xops[3];
8929
8930 xops[0] = dest;
8931
8932 if (TARGET_VXWORKS_RTP && flag_pic)
8933 {
8934 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8935 xops[2] = gen_rtx_MEM (Pmode,
8936 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8937 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8938
8939 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8940 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8941 an unadorned address. */
8942 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8943 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8944 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8945 return "";
8946 }
8947
8948 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8949
8950 if (!flag_pic)
8951 {
8952 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8953
8954 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8955
8956 #if TARGET_MACHO
8957 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8958 is what will be referenced by the Mach-O PIC subsystem. */
8959 if (!label)
8960 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8961 #endif
8962
8963 targetm.asm_out.internal_label (asm_out_file, "L",
8964 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8965 }
8966 else
8967 {
8968 char name[32];
8969 get_pc_thunk_name (name, REGNO (dest));
8970 pic_labels_used |= 1 << REGNO (dest);
8971
8972 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8973 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8974 output_asm_insn ("call\t%X2", xops);
8975 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8976 is what will be referenced by the Mach-O PIC subsystem. */
8977 #if TARGET_MACHO
8978 if (!label)
8979 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8980 else
8981 targetm.asm_out.internal_label (asm_out_file, "L",
8982 CODE_LABEL_NUMBER (label));
8983 #endif
8984 }
8985
8986 if (!TARGET_MACHO)
8987 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8988
8989 return "";
8990 }
8991
8992 /* Generate a "push" pattern for input ARG. */
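/* A rough sketch of the RTL produced, assuming a 32-bit target where both
   word_mode and Pmode are SImode and ARG is the %eax register:

	(set (mem:SI (pre_dec:SI (reg:SI sp)))
	     (reg:SI ax))

   On 64-bit targets the mem and the pushed register are DImode instead.  */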
8993
8994 static rtx
8995 gen_push (rtx arg)
8996 {
8997 struct machine_function *m = cfun->machine;
8998
8999 if (m->fs.cfa_reg == stack_pointer_rtx)
9000 m->fs.cfa_offset += UNITS_PER_WORD;
9001 m->fs.sp_offset += UNITS_PER_WORD;
9002
9003 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9004 arg = gen_rtx_REG (word_mode, REGNO (arg));
9005
9006 return gen_rtx_SET (VOIDmode,
9007 gen_rtx_MEM (word_mode,
9008 gen_rtx_PRE_DEC (Pmode,
9009 stack_pointer_rtx)),
9010 arg);
9011 }
9012
9013 /* Generate a "pop" pattern for input ARG. */
9014
9015 static rtx
9016 gen_pop (rtx arg)
9017 {
9018 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9019 arg = gen_rtx_REG (word_mode, REGNO (arg));
9020
9021 return gen_rtx_SET (VOIDmode,
9022 arg,
9023 gen_rtx_MEM (word_mode,
9024 gen_rtx_POST_INC (Pmode,
9025 stack_pointer_rtx)));
9026 }
9027
9028 /* Return the number of an unused call-clobbered register that is available
9029 for the entire function as a PIC register, or INVALID_REGNUM if none. */
9030
9031 static unsigned int
9032 ix86_select_alt_pic_regnum (void)
9033 {
9034 if (crtl->is_leaf
9035 && !crtl->profile
9036 && !ix86_current_function_calls_tls_descriptor)
9037 {
9038 int i, drap;
9039 /* Can't use the same register for both PIC and DRAP. */
9040 if (crtl->drap_reg)
9041 drap = REGNO (crtl->drap_reg);
9042 else
9043 drap = -1;
9044 for (i = 2; i >= 0; --i)
9045 if (i != drap && !df_regs_ever_live_p (i))
9046 return i;
9047 }
9048
9049 return INVALID_REGNUM;
9050 }
9051
9052 /* Return TRUE if we need to save REGNO. */
9053
9054 static bool
9055 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9056 {
9057 if (pic_offset_table_rtx
9058 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9059 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9060 || crtl->profile
9061 || crtl->calls_eh_return
9062 || crtl->uses_const_pool))
9063 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9064
9065 if (crtl->calls_eh_return && maybe_eh_return)
9066 {
9067 unsigned i;
9068 for (i = 0; ; i++)
9069 {
9070 unsigned test = EH_RETURN_DATA_REGNO (i);
9071 if (test == INVALID_REGNUM)
9072 break;
9073 if (test == regno)
9074 return true;
9075 }
9076 }
9077
9078 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9079 return true;
9080
9081 return (df_regs_ever_live_p (regno)
9082 && !call_used_regs[regno]
9083 && !fixed_regs[regno]
9084 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9085 }
9086
9087 /* Return the number of saved general purpose registers. */
9088
9089 static int
9090 ix86_nsaved_regs (void)
9091 {
9092 int nregs = 0;
9093 int regno;
9094
9095 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9096 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9097 nregs ++;
9098 return nregs;
9099 }
9100
9101 /* Return the number of saved SSE registers. */
9102
9103 static int
9104 ix86_nsaved_sseregs (void)
9105 {
9106 int nregs = 0;
9107 int regno;
9108
9109 if (!TARGET_64BIT_MS_ABI)
9110 return 0;
9111 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9112 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9113 nregs ++;
9114 return nregs;
9115 }
9116
9117 /* Given FROM and TO register numbers, say whether this elimination is
9118 allowed. If stack alignment is needed, we can only replace argument
9119 pointer with hard frame pointer, or replace frame pointer with stack
9120 pointer. Otherwise, frame pointer elimination is automatically
9121 handled and all other eliminations are valid. */
9122
9123 static bool
9124 ix86_can_eliminate (const int from, const int to)
9125 {
9126 if (stack_realign_fp)
9127 return ((from == ARG_POINTER_REGNUM
9128 && to == HARD_FRAME_POINTER_REGNUM)
9129 || (from == FRAME_POINTER_REGNUM
9130 && to == STACK_POINTER_REGNUM));
9131 else
9132 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9133 }
9134
9135 /* Return the offset between two registers, one to be eliminated, and the other
9136 its replacement, at the start of a routine. */
9137
9138 HOST_WIDE_INT
9139 ix86_initial_elimination_offset (int from, int to)
9140 {
9141 struct ix86_frame frame;
9142 ix86_compute_frame_layout (&frame);
9143
9144 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9145 return frame.hard_frame_pointer_offset;
9146 else if (from == FRAME_POINTER_REGNUM
9147 && to == HARD_FRAME_POINTER_REGNUM)
9148 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9149 else
9150 {
9151 gcc_assert (to == STACK_POINTER_REGNUM);
9152
9153 if (from == ARG_POINTER_REGNUM)
9154 return frame.stack_pointer_offset;
9155
9156 gcc_assert (from == FRAME_POINTER_REGNUM);
9157 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9158 }
9159 }
9160
9161 /* In a dynamically-aligned function, we can't know the offset from
9162 stack pointer to frame pointer, so we must ensure that setjmp
9163 eliminates fp against the hard fp (%ebp) rather than trying to
9164 index from %esp up to the top of the frame across a gap that is
9165 of unknown (at compile-time) size. */
9166 static rtx
9167 ix86_builtin_setjmp_frame_value (void)
9168 {
9169 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9170 }
9171
9172 /* When using -fsplit-stack, the allocation routines set a field in
9173 the TCB to the bottom of the stack plus this much space, measured
9174 in bytes. */
9175
9176 #define SPLIT_STACK_AVAILABLE 256
9177
9178 /* Fill structure ix86_frame describing the frame of the current function. */
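/* A rough sketch of the layout computed below, from the top of the frame
   (highest address) downwards:

	return address
	[pushed static chain]
	[saved frame pointer]		<- hard_frame_pointer_offset (non-SEH)
	GP register save area		<- reg_save_offset
	[SSE register save area]	<- sse_reg_save_offset (16-byte aligned)
	[va-arg register save area]
	local variables			<- frame_pointer_offset
	[outgoing arguments area]
	end of frame			<- stack_pointer_offset
	[red zone, 64-bit leaf functions only]

   Bracketed items are present only when needed; the named offsets are the
   corresponding fields of struct ix86_frame.  */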
9179
9180 static void
9181 ix86_compute_frame_layout (struct ix86_frame *frame)
9182 {
9183 unsigned HOST_WIDE_INT stack_alignment_needed;
9184 HOST_WIDE_INT offset;
9185 unsigned HOST_WIDE_INT preferred_alignment;
9186 HOST_WIDE_INT size = get_frame_size ();
9187 HOST_WIDE_INT to_allocate;
9188
9189 frame->nregs = ix86_nsaved_regs ();
9190 frame->nsseregs = ix86_nsaved_sseregs ();
9191
9192 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9193 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9194
9195 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9196 except for function prologues and leaf functions. */
9197 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9198 && (!crtl->is_leaf || cfun->calls_alloca != 0
9199 || ix86_current_function_calls_tls_descriptor))
9200 {
9201 preferred_alignment = 16;
9202 stack_alignment_needed = 16;
9203 crtl->preferred_stack_boundary = 128;
9204 crtl->stack_alignment_needed = 128;
9205 }
9206
9207 gcc_assert (!size || stack_alignment_needed);
9208 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9209 gcc_assert (preferred_alignment <= stack_alignment_needed);
9210
9211 /* For SEH we have to limit the amount of code movement into the prologue.
9212 At present we do this via a BLOCKAGE, at which point there's very little
9213 scheduling that can be done, which means that there's very little point
9214 in doing anything except PUSHs. */
9215 if (TARGET_SEH)
9216 cfun->machine->use_fast_prologue_epilogue = false;
9217
9218 /* During reload iterations the number of registers saved can change.
9219 Recompute the value as needed. Do not recompute when the number of
9220 registers didn't change, as reload makes multiple calls to this function
9221 and does not expect the decision to change within a single iteration. */
9222 else if (!optimize_function_for_size_p (cfun)
9223 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9224 {
9225 int count = frame->nregs;
9226 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9227
9228 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9229
9230 /* The fast prologue uses move instead of push to save registers. This
9231 is significantly longer, but also executes faster as modern hardware
9232 can execute the moves in parallel, but can't do that for push/pop.
9233
9234 Be careful about choosing which prologue to emit: when the function
9235 takes many instructions to execute we may use the slow version, as well
9236 as when the function is known to be outside a hot spot (this is known
9237 with feedback only). Weight the size of the function by the number of
9238 registers to save, as it is cheap to use one or two push instructions
9239 but very slow to use many of them. */
9240 if (count)
9241 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9242 if (node->frequency < NODE_FREQUENCY_NORMAL
9243 || (flag_branch_probabilities
9244 && node->frequency < NODE_FREQUENCY_HOT))
9245 cfun->machine->use_fast_prologue_epilogue = false;
9246 else
9247 cfun->machine->use_fast_prologue_epilogue
9248 = !expensive_function_p (count);
9249 }
9250
9251 frame->save_regs_using_mov
9252 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9253 /* If static stack checking is enabled and done with probes,
9254 the registers need to be saved before allocating the frame. */
9255 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9256
9257 /* Skip return address. */
9258 offset = UNITS_PER_WORD;
9259
9260 /* Skip pushed static chain. */
9261 if (ix86_static_chain_on_stack)
9262 offset += UNITS_PER_WORD;
9263
9264 /* Skip saved base pointer. */
9265 if (frame_pointer_needed)
9266 offset += UNITS_PER_WORD;
9267 frame->hfp_save_offset = offset;
9268
9269 /* The traditional frame pointer location is at the top of the frame. */
9270 frame->hard_frame_pointer_offset = offset;
9271
9272 /* Register save area */
9273 offset += frame->nregs * UNITS_PER_WORD;
9274 frame->reg_save_offset = offset;
9275
9276 /* On SEH target, registers are pushed just before the frame pointer
9277 location. */
9278 if (TARGET_SEH)
9279 frame->hard_frame_pointer_offset = offset;
9280
9281 /* Align and set SSE register save area. */
9282 if (frame->nsseregs)
9283 {
9284 /* The only ABI that has saved SSE registers (Win64) also has a
9285 16-byte aligned default stack, and thus we don't need to be
9286 within the re-aligned local stack frame to save them. */
9287 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9288 offset = (offset + 16 - 1) & -16;
9289 offset += frame->nsseregs * 16;
9290 }
9291 frame->sse_reg_save_offset = offset;
9292
9293 /* The re-aligned stack starts here. Values before this point are not
9294 directly comparable with values below this point. In order to make
9295 sure that no value happens to be the same before and after, force
9296 the alignment computation below to add a non-zero value. */
9297 if (stack_realign_fp)
9298 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9299
9300 /* Va-arg area */
9301 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9302 offset += frame->va_arg_size;
9303
9304 /* Align start of frame for local function. */
9305 if (stack_realign_fp
9306 || offset != frame->sse_reg_save_offset
9307 || size != 0
9308 || !crtl->is_leaf
9309 || cfun->calls_alloca
9310 || ix86_current_function_calls_tls_descriptor)
9311 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9312
9313 /* Frame pointer points here. */
9314 frame->frame_pointer_offset = offset;
9315
9316 offset += size;
9317
9318 /* Add outgoing arguments area. Can be skipped if we eliminated
9319 all the function calls as dead code.
9320 Skipping is however impossible when function calls alloca. Alloca
9321 expander assumes that last crtl->outgoing_args_size
9322 of stack frame are unused. */
9323 if (ACCUMULATE_OUTGOING_ARGS
9324 && (!crtl->is_leaf || cfun->calls_alloca
9325 || ix86_current_function_calls_tls_descriptor))
9326 {
9327 offset += crtl->outgoing_args_size;
9328 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9329 }
9330 else
9331 frame->outgoing_arguments_size = 0;
9332
9333 /* Align stack boundary. Only needed if we're calling another function
9334 or using alloca. */
9335 if (!crtl->is_leaf || cfun->calls_alloca
9336 || ix86_current_function_calls_tls_descriptor)
9337 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9338
9339 /* We've reached end of stack frame. */
9340 frame->stack_pointer_offset = offset;
9341
9342 /* Size prologue needs to allocate. */
9343 to_allocate = offset - frame->sse_reg_save_offset;
9344
9345 if ((!to_allocate && frame->nregs <= 1)
9346 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9347 frame->save_regs_using_mov = false;
9348
9349 if (ix86_using_red_zone ()
9350 && crtl->sp_is_unchanging
9351 && crtl->is_leaf
9352 && !ix86_current_function_calls_tls_descriptor)
9353 {
9354 frame->red_zone_size = to_allocate;
9355 if (frame->save_regs_using_mov)
9356 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9357 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9358 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9359 }
9360 else
9361 frame->red_zone_size = 0;
9362 frame->stack_pointer_offset -= frame->red_zone_size;
9363
9364 /* The SEH frame pointer location is near the bottom of the frame.
9365 This is enforced by the fact that the difference between the
9366 stack pointer and the frame pointer is limited to 240 bytes in
9367 the unwind data structure. */
9368 if (TARGET_SEH)
9369 {
9370 HOST_WIDE_INT diff;
9371
9372 /* If we can leave the frame pointer where it is, do so. Also, returns
9373 the establisher frame for __builtin_frame_address (0). */
9374 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9375 if (diff <= SEH_MAX_FRAME_SIZE
9376 && (diff > 240 || (diff & 15) != 0)
9377 && !crtl->accesses_prior_frames)
9378 {
9379 /* Ideally we'd determine what portion of the local stack frame
9380 (within the constraint of the lowest 240) is most heavily used.
9381 But without that complication, simply bias the frame pointer
9382 by 128 bytes so as to maximize the amount of the local stack
9383 frame that is addressable with 8-bit offsets. */
9384 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9385 }
9386 }
9387 }
9388
9389 /* This is semi-inlined memory_address_length, but simplified
9390 since we know that we're always dealing with reg+offset, and
9391 to avoid having to create and discard all that rtl. */
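/* For example (illustrative register choices only): 0(%bp) and 0(%r13)
   still need a one-byte displacement, so the length is 1; 8(%sp) needs a
   one-byte displacement plus a SIB byte, so the length is 2; and
   0x1000(%ax) needs a four-byte displacement, so the length is 4.  */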
9392
9393 static inline int
9394 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9395 {
9396 int len = 4;
9397
9398 if (offset == 0)
9399 {
9400 /* EBP and R13 cannot be encoded without an offset. */
9401 len = (regno == BP_REG || regno == R13_REG);
9402 }
9403 else if (IN_RANGE (offset, -128, 127))
9404 len = 1;
9405
9406 /* ESP and R12 must be encoded with a SIB byte. */
9407 if (regno == SP_REG || regno == R12_REG)
9408 len++;
9409
9410 return len;
9411 }
9412
9413 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9414 The valid base registers are taken from CFUN->MACHINE->FS. */
9415
9416 static rtx
9417 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9418 {
9419 const struct machine_function *m = cfun->machine;
9420 rtx base_reg = NULL;
9421 HOST_WIDE_INT base_offset = 0;
9422
9423 if (m->use_fast_prologue_epilogue)
9424 {
9425 /* Choose the base register most likely to allow the most scheduling
9426 opportunities. Generally FP is valid throughout the function,
9427 while DRAP must be reloaded within the epilogue. But choose either
9428 over the SP due to increased encoding size. */
9429
9430 if (m->fs.fp_valid)
9431 {
9432 base_reg = hard_frame_pointer_rtx;
9433 base_offset = m->fs.fp_offset - cfa_offset;
9434 }
9435 else if (m->fs.drap_valid)
9436 {
9437 base_reg = crtl->drap_reg;
9438 base_offset = 0 - cfa_offset;
9439 }
9440 else if (m->fs.sp_valid)
9441 {
9442 base_reg = stack_pointer_rtx;
9443 base_offset = m->fs.sp_offset - cfa_offset;
9444 }
9445 }
9446 else
9447 {
9448 HOST_WIDE_INT toffset;
9449 int len = 16, tlen;
9450
9451 /* Choose the base register with the smallest address encoding.
9452 With a tie, choose FP > DRAP > SP. */
9453 if (m->fs.sp_valid)
9454 {
9455 base_reg = stack_pointer_rtx;
9456 base_offset = m->fs.sp_offset - cfa_offset;
9457 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9458 }
9459 if (m->fs.drap_valid)
9460 {
9461 toffset = 0 - cfa_offset;
9462 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9463 if (tlen <= len)
9464 {
9465 base_reg = crtl->drap_reg;
9466 base_offset = toffset;
9467 len = tlen;
9468 }
9469 }
9470 if (m->fs.fp_valid)
9471 {
9472 toffset = m->fs.fp_offset - cfa_offset;
9473 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9474 if (tlen <= len)
9475 {
9476 base_reg = hard_frame_pointer_rtx;
9477 base_offset = toffset;
9478 len = tlen;
9479 }
9480 }
9481 }
9482 gcc_assert (base_reg != NULL);
9483
9484 return plus_constant (Pmode, base_reg, base_offset);
9485 }
9486
9487 /* Emit code to save registers in the prologue. */
9488
9489 static void
9490 ix86_emit_save_regs (void)
9491 {
9492 unsigned int regno;
9493 rtx insn;
9494
9495 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9496 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9497 {
9498 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9499 RTX_FRAME_RELATED_P (insn) = 1;
9500 }
9501 }
9502
9503 /* Emit a single register save at CFA - CFA_OFFSET. */
9504
9505 static void
9506 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9507 HOST_WIDE_INT cfa_offset)
9508 {
9509 struct machine_function *m = cfun->machine;
9510 rtx reg = gen_rtx_REG (mode, regno);
9511 rtx mem, addr, base, insn;
9512
9513 addr = choose_baseaddr (cfa_offset);
9514 mem = gen_frame_mem (mode, addr);
9515
9516 /* For SSE saves, we need to indicate the 128-bit alignment. */
9517 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9518
9519 insn = emit_move_insn (mem, reg);
9520 RTX_FRAME_RELATED_P (insn) = 1;
9521
9522 base = addr;
9523 if (GET_CODE (base) == PLUS)
9524 base = XEXP (base, 0);
9525 gcc_checking_assert (REG_P (base));
9526
9527 /* When saving registers into a re-aligned local stack frame, avoid
9528 any tricky guessing by dwarf2out. */
9529 if (m->fs.realigned)
9530 {
9531 gcc_checking_assert (stack_realign_drap);
9532
9533 if (regno == REGNO (crtl->drap_reg))
9534 {
9535 /* A bit of a hack. We force the DRAP register to be saved in
9536 the re-aligned stack frame, which provides us with a copy
9537 of the CFA that will last past the prologue. Install it. */
9538 gcc_checking_assert (cfun->machine->fs.fp_valid);
9539 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9540 cfun->machine->fs.fp_offset - cfa_offset);
9541 mem = gen_rtx_MEM (mode, addr);
9542 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9543 }
9544 else
9545 {
9546 /* The frame pointer is a stable reference within the
9547 aligned frame. Use it. */
9548 gcc_checking_assert (cfun->machine->fs.fp_valid);
9549 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9550 cfun->machine->fs.fp_offset - cfa_offset);
9551 mem = gen_rtx_MEM (mode, addr);
9552 add_reg_note (insn, REG_CFA_EXPRESSION,
9553 gen_rtx_SET (VOIDmode, mem, reg));
9554 }
9555 }
9556
9557 /* The memory may not be relative to the current CFA register,
9558 which means that we may need to generate a new pattern for
9559 use by the unwind info. */
9560 else if (base != m->fs.cfa_reg)
9561 {
9562 addr = plus_constant (Pmode, m->fs.cfa_reg,
9563 m->fs.cfa_offset - cfa_offset);
9564 mem = gen_rtx_MEM (mode, addr);
9565 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9566 }
9567 }
9568
9569 /* Emit code to save registers using MOV insns.
9570 First register is stored at CFA - CFA_OFFSET. */
9571 static void
9572 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9573 {
9574 unsigned int regno;
9575
9576 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9577 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9578 {
9579 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9580 cfa_offset -= UNITS_PER_WORD;
9581 }
9582 }
9583
9584 /* Emit code to save SSE registers using MOV insns.
9585 First register is stored at CFA - CFA_OFFSET. */
9586 static void
9587 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9588 {
9589 unsigned int regno;
9590
9591 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9592 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9593 {
9594 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9595 cfa_offset -= 16;
9596 }
9597 }
9598
9599 static GTY(()) rtx queued_cfa_restores;
9600
9601 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9602 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9603 Don't add the note if the previously saved value will be left untouched
9604 within the stack red zone until return, as unwinders can find the same
9605 value in the register and on the stack. */
9606
9607 static void
9608 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9609 {
9610 if (!crtl->shrink_wrapped
9611 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9612 return;
9613
9614 if (insn)
9615 {
9616 add_reg_note (insn, REG_CFA_RESTORE, reg);
9617 RTX_FRAME_RELATED_P (insn) = 1;
9618 }
9619 else
9620 queued_cfa_restores
9621 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9622 }
9623
9624 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9625
9626 static void
9627 ix86_add_queued_cfa_restore_notes (rtx insn)
9628 {
9629 rtx last;
9630 if (!queued_cfa_restores)
9631 return;
9632 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9633 ;
9634 XEXP (last, 1) = REG_NOTES (insn);
9635 REG_NOTES (insn) = queued_cfa_restores;
9636 queued_cfa_restores = NULL_RTX;
9637 RTX_FRAME_RELATED_P (insn) = 1;
9638 }
9639
9640 /* Expand prologue or epilogue stack adjustment.
9641 The pattern exists to put a dependency on all ebp-based memory accesses.
9642 STYLE should be negative if instructions should be marked as frame related,
9643 zero if %r11 register is live and cannot be freely used and positive
9644 otherwise. */
9645
9646 static void
9647 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9648 int style, bool set_cfa)
9649 {
9650 struct machine_function *m = cfun->machine;
9651 rtx insn;
9652 bool add_frame_related_expr = false;
9653
9654 if (Pmode == SImode)
9655 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9656 else if (x86_64_immediate_operand (offset, DImode))
9657 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9658 else
9659 {
9660 rtx tmp;
9661 /* r11 is used by indirect sibcall return as well, set before the
9662 epilogue and used after the epilogue. */
9663 if (style)
9664 tmp = gen_rtx_REG (DImode, R11_REG);
9665 else
9666 {
9667 gcc_assert (src != hard_frame_pointer_rtx
9668 && dest != hard_frame_pointer_rtx);
9669 tmp = hard_frame_pointer_rtx;
9670 }
9671 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9672 if (style < 0)
9673 add_frame_related_expr = true;
9674
9675 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9676 }
9677
9678 insn = emit_insn (insn);
9679 if (style >= 0)
9680 ix86_add_queued_cfa_restore_notes (insn);
9681
9682 if (set_cfa)
9683 {
9684 rtx r;
9685
9686 gcc_assert (m->fs.cfa_reg == src);
9687 m->fs.cfa_offset += INTVAL (offset);
9688 m->fs.cfa_reg = dest;
9689
9690 r = gen_rtx_PLUS (Pmode, src, offset);
9691 r = gen_rtx_SET (VOIDmode, dest, r);
9692 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9693 RTX_FRAME_RELATED_P (insn) = 1;
9694 }
9695 else if (style < 0)
9696 {
9697 RTX_FRAME_RELATED_P (insn) = 1;
9698 if (add_frame_related_expr)
9699 {
9700 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9701 r = gen_rtx_SET (VOIDmode, dest, r);
9702 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9703 }
9704 }
9705
9706 if (dest == stack_pointer_rtx)
9707 {
9708 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9709 bool valid = m->fs.sp_valid;
9710
9711 if (src == hard_frame_pointer_rtx)
9712 {
9713 valid = m->fs.fp_valid;
9714 ooffset = m->fs.fp_offset;
9715 }
9716 else if (src == crtl->drap_reg)
9717 {
9718 valid = m->fs.drap_valid;
9719 ooffset = 0;
9720 }
9721 else
9722 {
9723 /* Else there are two possibilities: SP itself, which we set
9724 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9725 taken care of by hand along the eh_return path. */
9726 gcc_checking_assert (src == stack_pointer_rtx
9727 || offset == const0_rtx);
9728 }
9729
9730 m->fs.sp_offset = ooffset - INTVAL (offset);
9731 m->fs.sp_valid = valid;
9732 }
9733 }
9734
9735 /* Find an available register to be used as the dynamic realign argument
9736 pointer register. Such a register will be written in the prologue and
9737 used at the beginning of the body, so it must not be
9738 1. parameter passing register.
9739 2. GOT pointer.
9740 We reuse static-chain register if it is available. Otherwise, we
9741 use DI for i386 and R13 for x86-64. We chose R13 since it has
9742 shorter encoding.
9743
9744 Return: the regno of chosen register. */
9745
9746 static unsigned int
9747 find_drap_reg (void)
9748 {
9749 tree decl = cfun->decl;
9750
9751 if (TARGET_64BIT)
9752 {
9753 /* Use R13 for nested functions or functions that need a static chain.
9754 Since a function with a tail call may use any caller-saved
9755 register in the epilogue, DRAP must not use a caller-saved
9756 register in such a case. */
9757 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9758 return R13_REG;
9759
9760 return R10_REG;
9761 }
9762 else
9763 {
9764 /* Use DI for nested functions or functions that need a static chain.
9765 Since a function with a tail call may use any caller-saved
9766 register in the epilogue, DRAP must not use a caller-saved
9767 register in such a case. */
9768 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9769 return DI_REG;
9770
9771 /* Reuse static chain register if it isn't used for parameter
9772 passing. */
9773 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9774 {
9775 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9776 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9777 return CX_REG;
9778 }
9779 return DI_REG;
9780 }
9781 }
9782
9783 /* Return minimum incoming stack alignment. */
9784
9785 static unsigned int
9786 ix86_minimum_incoming_stack_boundary (bool sibcall)
9787 {
9788 unsigned int incoming_stack_boundary;
9789
9790 /* Prefer the one specified at command line. */
9791 if (ix86_user_incoming_stack_boundary)
9792 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9793 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9794 if -mstackrealign is used, this is not a sibcall check, and the
9795 estimated stack alignment is 128 bits. */
9796 else if (!sibcall
9797 && !TARGET_64BIT
9798 && ix86_force_align_arg_pointer
9799 && crtl->stack_alignment_estimated == 128)
9800 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9801 else
9802 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9803
9804 /* Incoming stack alignment can be changed on individual functions
9805 via force_align_arg_pointer attribute. We use the smallest
9806 incoming stack boundary. */
9807 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9808 && lookup_attribute (ix86_force_align_arg_pointer_string,
9809 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9810 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9811
9812 /* The incoming stack frame has to be aligned at least at
9813 parm_stack_boundary. */
9814 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9815 incoming_stack_boundary = crtl->parm_stack_boundary;
9816
9817 /* The stack at the entrance of main is aligned by the runtime. We use
9818 the smallest incoming stack boundary. */
9819 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9820 && DECL_NAME (current_function_decl)
9821 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9822 && DECL_FILE_SCOPE_P (current_function_decl))
9823 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9824
9825 return incoming_stack_boundary;
9826 }
9827
9828 /* Update incoming stack boundary and estimated stack alignment. */
9829
9830 static void
9831 ix86_update_stack_boundary (void)
9832 {
9833 ix86_incoming_stack_boundary
9834 = ix86_minimum_incoming_stack_boundary (false);
9835
9836 /* x86_64 varargs need 16-byte stack alignment for the register save
9837 area. */
9838 if (TARGET_64BIT
9839 && cfun->stdarg
9840 && crtl->stack_alignment_estimated < 128)
9841 crtl->stack_alignment_estimated = 128;
9842 }
9843
9844 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9845 needed or an rtx for DRAP otherwise. */
9846
9847 static rtx
9848 ix86_get_drap_rtx (void)
9849 {
9850 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9851 crtl->need_drap = true;
9852
9853 if (stack_realign_drap)
9854 {
9855 /* Assign DRAP to vDRAP and return vDRAP. */
9856 unsigned int regno = find_drap_reg ();
9857 rtx drap_vreg;
9858 rtx arg_ptr;
9859 rtx seq, insn;
9860
9861 arg_ptr = gen_rtx_REG (Pmode, regno);
9862 crtl->drap_reg = arg_ptr;
9863
9864 start_sequence ();
9865 drap_vreg = copy_to_reg (arg_ptr);
9866 seq = get_insns ();
9867 end_sequence ();
9868
9869 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9870 if (!optimize)
9871 {
9872 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9873 RTX_FRAME_RELATED_P (insn) = 1;
9874 }
9875 return drap_vreg;
9876 }
9877 else
9878 return NULL;
9879 }
9880
9881 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9882
9883 static rtx
9884 ix86_internal_arg_pointer (void)
9885 {
9886 return virtual_incoming_args_rtx;
9887 }
9888
9889 struct scratch_reg {
9890 rtx reg;
9891 bool saved;
9892 };
9893
9894 /* Return a short-lived scratch register for use on function entry.
9895 In 32-bit mode, it is valid only after the registers are saved
9896 in the prologue. This register must be released by means of
9897 release_scratch_register_on_entry once it is dead. */
9898
9899 static void
9900 get_scratch_register_on_entry (struct scratch_reg *sr)
9901 {
9902 int regno;
9903
9904 sr->saved = false;
9905
9906 if (TARGET_64BIT)
9907 {
9908 /* We always use R11 in 64-bit mode. */
9909 regno = R11_REG;
9910 }
9911 else
9912 {
9913 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9914 bool fastcall_p
9915 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9916 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9917 int regparm = ix86_function_regparm (fntype, decl);
9918 int drap_regno
9919 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9920
9921 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9922 for the static chain register. */
9923 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9924 && drap_regno != AX_REG)
9925 regno = AX_REG;
9926 else if (regparm < 2 && drap_regno != DX_REG)
9927 regno = DX_REG;
9928 /* ecx is the static chain register. */
9929 else if (regparm < 3 && !fastcall_p && !static_chain_p
9930 && drap_regno != CX_REG)
9931 regno = CX_REG;
9932 else if (ix86_save_reg (BX_REG, true))
9933 regno = BX_REG;
9934 /* esi is the static chain register. */
9935 else if (!(regparm == 3 && static_chain_p)
9936 && ix86_save_reg (SI_REG, true))
9937 regno = SI_REG;
9938 else if (ix86_save_reg (DI_REG, true))
9939 regno = DI_REG;
9940 else
9941 {
9942 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9943 sr->saved = true;
9944 }
9945 }
9946
9947 sr->reg = gen_rtx_REG (Pmode, regno);
9948 if (sr->saved)
9949 {
9950 rtx insn = emit_insn (gen_push (sr->reg));
9951 RTX_FRAME_RELATED_P (insn) = 1;
9952 }
9953 }
9954
9955 /* Release a scratch register obtained from the preceding function. */
9956
9957 static void
9958 release_scratch_register_on_entry (struct scratch_reg *sr)
9959 {
9960 if (sr->saved)
9961 {
9962 rtx x, insn = emit_insn (gen_pop (sr->reg));
9963
9964 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9965 RTX_FRAME_RELATED_P (insn) = 1;
9966 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9967 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9968 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9969 }
9970 }
9971
9972 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9973
9974 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
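/* As a worked example, assume PROBE_INTERVAL is 4096, UNITS_PER_WORD is 8
   (so the dope below is 32 bytes) and SIZE is 10000. The unrolled path
   first drops SP by 2*4096+32 bytes and probes, then by 4096 and probes,
   then by 10000+4096-12288 = 1808 and probes, and finally adjusts SP back
   up by 4096+32, for a net adjustment of exactly -10000 bytes.  */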
9975
9976 static void
9977 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9978 {
9979 /* We skip the probe for the first interval + a small dope of 4 words and
9980 probe that many bytes past the specified size to maintain a protection
9981 area at the bottom of the stack. */
9982 const int dope = 4 * UNITS_PER_WORD;
9983 rtx size_rtx = GEN_INT (size), last;
9984
9985 /* See if we have a constant small number of probes to generate. If so,
9986 that's the easy case. The run-time loop is made up of 11 insns in the
9987 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9988 for n # of intervals. */
9989 if (size <= 5 * PROBE_INTERVAL)
9990 {
9991 HOST_WIDE_INT i, adjust;
9992 bool first_probe = true;
9993
9994 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9995 values of N from 1 until it exceeds SIZE. If only one probe is
9996 needed, this will not generate any code. Then adjust and probe
9997 to PROBE_INTERVAL + SIZE. */
9998 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9999 {
10000 if (first_probe)
10001 {
10002 adjust = 2 * PROBE_INTERVAL + dope;
10003 first_probe = false;
10004 }
10005 else
10006 adjust = PROBE_INTERVAL;
10007
10008 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10009 plus_constant (Pmode, stack_pointer_rtx,
10010 -adjust)));
10011 emit_stack_probe (stack_pointer_rtx);
10012 }
10013
10014 if (first_probe)
10015 adjust = size + PROBE_INTERVAL + dope;
10016 else
10017 adjust = size + PROBE_INTERVAL - i;
10018
10019 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10020 plus_constant (Pmode, stack_pointer_rtx,
10021 -adjust)));
10022 emit_stack_probe (stack_pointer_rtx);
10023
10024 /* Adjust back to account for the additional first interval. */
10025 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10026 plus_constant (Pmode, stack_pointer_rtx,
10027 PROBE_INTERVAL + dope)));
10028 }
10029
10030 /* Otherwise, do the same as above, but in a loop. Note that we must be
10031 extra careful with variables wrapping around because we might be at
10032 the very top (or the very bottom) of the address space and we have
10033 to be able to handle this case properly; in particular, we use an
10034 equality test for the loop condition. */
10035 else
10036 {
10037 HOST_WIDE_INT rounded_size;
10038 struct scratch_reg sr;
10039
10040 get_scratch_register_on_entry (&sr);
10041
10042
10043 /* Step 1: round SIZE to the previous multiple of the interval. */
10044
10045 rounded_size = size & -PROBE_INTERVAL;
10046
10047
10048 /* Step 2: compute initial and final value of the loop counter. */
10049
10050 /* SP = SP_0 + PROBE_INTERVAL. */
10051 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10052 plus_constant (Pmode, stack_pointer_rtx,
10053 - (PROBE_INTERVAL + dope))));
10054
10055 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10056 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10057 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10058 gen_rtx_PLUS (Pmode, sr.reg,
10059 stack_pointer_rtx)));
10060
10061
10062 /* Step 3: the loop
10063
10064 while (SP != LAST_ADDR)
10065 {
10066 SP = SP + PROBE_INTERVAL
10067 probe at SP
10068 }
10069
10070 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10071 values of N from 1 until it is equal to ROUNDED_SIZE. */
10072
10073 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10074
10075
10076 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10077 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10078
10079 if (size != rounded_size)
10080 {
10081 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10082 plus_constant (Pmode, stack_pointer_rtx,
10083 rounded_size - size)));
10084 emit_stack_probe (stack_pointer_rtx);
10085 }
10086
10087 /* Adjust back to account for the additional first interval. */
10088 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10089 plus_constant (Pmode, stack_pointer_rtx,
10090 PROBE_INTERVAL + dope)));
10091
10092 release_scratch_register_on_entry (&sr);
10093 }
10094
10095 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10096
10097 /* Even if the stack pointer isn't the CFA register, we need to correctly
10098 describe the adjustments made to it, in particular differentiate the
10099 frame-related ones from the frame-unrelated ones. */
10100 if (size > 0)
10101 {
10102 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10103 XVECEXP (expr, 0, 0)
10104 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10105 plus_constant (Pmode, stack_pointer_rtx, -size));
10106 XVECEXP (expr, 0, 1)
10107 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10108 plus_constant (Pmode, stack_pointer_rtx,
10109 PROBE_INTERVAL + dope + size));
10110 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10111 RTX_FRAME_RELATED_P (last) = 1;
10112
10113 cfun->machine->fs.sp_offset += size;
10114 }
10115
10116 /* Make sure nothing is scheduled before we are done. */
10117 emit_insn (gen_blockage ());
10118 }
10119
10120 /* Adjust the stack pointer up to REG while probing it. */
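/* A sketch of the emitted loop, in AT&T syntax and assuming a
   PROBE_INTERVAL of 4096 on a 32-bit target:

	.LPSRL0:
		cmpl	%reg, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:

   The actual label names and operand sizes depend on the target.  */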
10121
10122 const char *
10123 output_adjust_stack_and_probe (rtx reg)
10124 {
10125 static int labelno = 0;
10126 char loop_lab[32], end_lab[32];
10127 rtx xops[2];
10128
10129 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10130 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10131
10132 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10133
10134 /* Jump to END_LAB if SP == LAST_ADDR. */
10135 xops[0] = stack_pointer_rtx;
10136 xops[1] = reg;
10137 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10138 fputs ("\tje\t", asm_out_file);
10139 assemble_name_raw (asm_out_file, end_lab);
10140 fputc ('\n', asm_out_file);
10141
10142 /* SP = SP + PROBE_INTERVAL. */
10143 xops[1] = GEN_INT (PROBE_INTERVAL);
10144 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10145
10146 /* Probe at SP. */
10147 xops[1] = const0_rtx;
10148 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10149
10150 fprintf (asm_out_file, "\tjmp\t");
10151 assemble_name_raw (asm_out_file, loop_lab);
10152 fputc ('\n', asm_out_file);
10153
10154 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10155
10156 return "";
10157 }
10158
10159 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10160 inclusive. These are offsets from the current stack pointer. */
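/* For example, with FIRST = 0, SIZE = 10000 and an assumed PROBE_INTERVAL
   of 4096, the unrolled path below probes the words at sp - 4096,
   sp - 8192 and finally sp - 10000.  */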
10161
10162 static void
10163 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10164 {
10165 /* See if we have a constant small number of probes to generate. If so,
10166 that's the easy case. The run-time loop is made up of 7 insns in the
10167 generic case while the compile-time loop is made up of n insns for n #
10168 of intervals. */
10169 if (size <= 7 * PROBE_INTERVAL)
10170 {
10171 HOST_WIDE_INT i;
10172
10173 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10174 it exceeds SIZE. If only one probe is needed, this will not
10175 generate any code. Then probe at FIRST + SIZE. */
10176 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10177 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10178 -(first + i)));
10179
10180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10181 -(first + size)));
10182 }
10183
10184 /* Otherwise, do the same as above, but in a loop. Note that we must be
10185 extra careful with variables wrapping around because we might be at
10186 the very top (or the very bottom) of the address space and we have
10187 to be able to handle this case properly; in particular, we use an
10188 equality test for the loop condition. */
10189 else
10190 {
10191 HOST_WIDE_INT rounded_size, last;
10192 struct scratch_reg sr;
10193
10194 get_scratch_register_on_entry (&sr);
10195
10196
10197 /* Step 1: round SIZE to the previous multiple of the interval. */
10198
10199 rounded_size = size & -PROBE_INTERVAL;
10200
10201
10202 /* Step 2: compute initial and final value of the loop counter. */
10203
10204 /* TEST_OFFSET = FIRST. */
10205 emit_move_insn (sr.reg, GEN_INT (-first));
10206
10207 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10208 last = first + rounded_size;
10209
10210
10211 /* Step 3: the loop
10212
10213 while (TEST_ADDR != LAST_ADDR)
10214 {
10215 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10216 probe at TEST_ADDR
10217 }
10218
10219 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10220 until it is equal to ROUNDED_SIZE. */
10221
10222 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10223
10224
10225 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10226 that SIZE is equal to ROUNDED_SIZE. */
10227
10228 if (size != rounded_size)
10229 emit_stack_probe (plus_constant (Pmode,
10230 gen_rtx_PLUS (Pmode,
10231 stack_pointer_rtx,
10232 sr.reg),
10233 rounded_size - size));
10234
10235 release_scratch_register_on_entry (&sr);
10236 }
10237
10238 /* Make sure nothing is scheduled before we are done. */
10239 emit_insn (gen_blockage ());
10240 }
10241
10242 /* Probe a range of stack addresses from REG to END, inclusive. These are
10243 offsets from the current stack pointer. */
10244
10245 const char *
10246 output_probe_stack_range (rtx reg, rtx end)
10247 {
10248 static int labelno = 0;
10249 char loop_lab[32], end_lab[32];
10250 rtx xops[3];
10251
10252 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10253 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10254
10255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10256
10257 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10258 xops[0] = reg;
10259 xops[1] = end;
10260 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10261 fputs ("\tje\t", asm_out_file);
10262 assemble_name_raw (asm_out_file, end_lab);
10263 fputc ('\n', asm_out_file);
10264
10265 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10266 xops[1] = GEN_INT (PROBE_INTERVAL);
10267 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10268
10269 /* Probe at TEST_ADDR. */
10270 xops[0] = stack_pointer_rtx;
10271 xops[1] = reg;
10272 xops[2] = const0_rtx;
10273 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10274
10275 fprintf (asm_out_file, "\tjmp\t");
10276 assemble_name_raw (asm_out_file, loop_lab);
10277 fputc ('\n', asm_out_file);
10278
10279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10280
10281 return "";
10282 }
10283
10284 /* Finalize the stack_realign_needed flag, which guides the prologue and
10285 epilogue to be generated in the correct form. */
10286 static void
10287 ix86_finalize_stack_realign_flags (void)
10288 {
10289 /* Check if stack realignment is really needed after reload, and
10290 store the result in cfun. */
10291 unsigned int incoming_stack_boundary
10292 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10293 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10294 unsigned int stack_realign = (incoming_stack_boundary
10295 < (crtl->is_leaf
10296 ? crtl->max_used_stack_slot_alignment
10297 : crtl->stack_alignment_needed));
10298
10299 if (crtl->stack_realign_finalized)
10300 {
10301 /* After stack_realign_needed is finalized, we can no longer
10302 change it. */
10303 gcc_assert (crtl->stack_realign_needed == stack_realign);
10304 return;
10305 }
10306
10307 /* If the only reason for frame_pointer_needed is that we conservatively
10308 assumed stack realignment might be needed, but in the end nothing that
10309 needed the stack alignment had been spilled, clear frame_pointer_needed
10310 and say we don't need stack realignment. */
10311 if (stack_realign
10312 && !crtl->need_drap
10313 && frame_pointer_needed
10314 && crtl->is_leaf
10315 && flag_omit_frame_pointer
10316 && crtl->sp_is_unchanging
10317 && !ix86_current_function_calls_tls_descriptor
10318 && !crtl->accesses_prior_frames
10319 && !cfun->calls_alloca
10320 && !crtl->calls_eh_return
10321 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10322 && !ix86_frame_pointer_required ()
10323 && get_frame_size () == 0
10324 && ix86_nsaved_sseregs () == 0
10325 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10326 {
10327 HARD_REG_SET set_up_by_prologue, prologue_used;
10328 basic_block bb;
10329
10330 CLEAR_HARD_REG_SET (prologue_used);
10331 CLEAR_HARD_REG_SET (set_up_by_prologue);
10332 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10333 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10334 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10335 HARD_FRAME_POINTER_REGNUM);
10336 FOR_EACH_BB (bb)
10337 {
10338 rtx insn;
10339 FOR_BB_INSNS (bb, insn)
10340 if (NONDEBUG_INSN_P (insn)
10341 && requires_stack_frame_p (insn, prologue_used,
10342 set_up_by_prologue))
10343 {
10344 crtl->stack_realign_needed = stack_realign;
10345 crtl->stack_realign_finalized = true;
10346 return;
10347 }
10348 }
10349
10350 frame_pointer_needed = false;
10351 stack_realign = false;
10352 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10353 crtl->stack_alignment_needed = incoming_stack_boundary;
10354 crtl->stack_alignment_estimated = incoming_stack_boundary;
10355 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10356 crtl->preferred_stack_boundary = incoming_stack_boundary;
10357 df_finish_pass (true);
10358 df_scan_alloc (NULL);
10359 df_scan_blocks ();
10360 df_compute_regs_ever_live (true);
10361 df_analyze ();
10362 }
10363
10364 crtl->stack_realign_needed = stack_realign;
10365 crtl->stack_realign_finalized = true;
10366 }
10367
10368 /* Expand the prologue into a bunch of separate insns. */
10369
10370 void
10371 ix86_expand_prologue (void)
10372 {
10373 struct machine_function *m = cfun->machine;
10374 rtx insn, t;
10375 bool pic_reg_used;
10376 struct ix86_frame frame;
10377 HOST_WIDE_INT allocate;
10378 bool int_registers_saved;
10379 bool sse_registers_saved;
10380
10381 ix86_finalize_stack_realign_flags ();
10382
10383 /* DRAP should not coexist with stack_realign_fp */
10384 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10385
10386 memset (&m->fs, 0, sizeof (m->fs));
10387
10388 /* Initialize CFA state for before the prologue. */
10389 m->fs.cfa_reg = stack_pointer_rtx;
10390 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10391
10392 /* Track SP offset to the CFA. We continue tracking this after we've
10393 swapped the CFA register away from SP. In the case of re-alignment
10394 this is fudged; we're interested in offsets within the local frame. */
10395 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10396 m->fs.sp_valid = true;
10397
10398 ix86_compute_frame_layout (&frame);
10399
10400 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10401 {
10402 /* We should have already generated an error for any use of
10403 ms_hook on a nested function. */
10404 gcc_checking_assert (!ix86_static_chain_on_stack);
10405
10406 /* Check if profiling is active and we shall use the profiling-before-
10407 prologue variant. If so, issue a sorry. */
10408 if (crtl->profile && flag_fentry != 0)
10409 sorry ("ms_hook_prologue attribute isn%'t compatible "
10410 "with -mfentry for 32-bit");
10411
10412 /* In ix86_asm_output_function_label we emitted:
10413 8b ff movl.s %edi,%edi
10414 55 push %ebp
10415 8b ec movl.s %esp,%ebp
10416
10417 This matches the hookable function prologue in Win32 API
10418 functions in Microsoft Windows XP Service Pack 2 and newer.
10419 Wine uses this to enable Windows apps to hook the Win32 API
10420 functions provided by Wine.
10421
10422 What that means is that we've already set up the frame pointer. */
10423
10424 if (frame_pointer_needed
10425 && !(crtl->drap_reg && crtl->stack_realign_needed))
10426 {
10427 rtx push, mov;
10428
10429 /* We've decided to use the frame pointer already set up.
10430 Describe this to the unwinder by pretending that both
10431 push and mov insns happen right here.
10432
10433 Putting the unwind info here at the end of the ms_hook
10434 is done so that we can make absolutely certain we get
10435 the required byte sequence at the start of the function,
10436 rather than relying on an assembler that can produce
10437 the exact encoding required.
10438
10439 	     However, it does mean (in the unpatched case) that we have
10440 	     a one-insn window where the asynchronous unwind info is
10441 	     incorrect.  On the other hand, if we placed the unwind info at
10442 	     its correct location we would have incorrect unwind info
10443 	     in the patched case.  This is probably all moot, since I
10444 	     don't expect Wine to generate dwarf2 unwind info for the
10445 	     system libraries that use this feature.  */
10446
10447 insn = emit_insn (gen_blockage ());
10448
10449 push = gen_push (hard_frame_pointer_rtx);
10450 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10451 stack_pointer_rtx);
10452 RTX_FRAME_RELATED_P (push) = 1;
10453 RTX_FRAME_RELATED_P (mov) = 1;
10454
10455 RTX_FRAME_RELATED_P (insn) = 1;
10456 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10457 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10458
10459 /* Note that gen_push incremented m->fs.cfa_offset, even
10460 though we didn't emit the push insn here. */
10461 m->fs.cfa_reg = hard_frame_pointer_rtx;
10462 m->fs.fp_offset = m->fs.cfa_offset;
10463 m->fs.fp_valid = true;
10464 }
10465 else
10466 {
10467 /* The frame pointer is not needed so pop %ebp again.
10468 This leaves us with a pristine state. */
10469 emit_insn (gen_pop (hard_frame_pointer_rtx));
10470 }
10471 }
10472
10473 /* The first insn of a function that accepts its static chain on the
10474 stack is to push the register that would be filled in by a direct
10475 call. This insn will be skipped by the trampoline. */
10476 else if (ix86_static_chain_on_stack)
10477 {
10478 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10479 emit_insn (gen_blockage ());
10480
10481 /* We don't want to interpret this push insn as a register save,
10482 only as a stack adjustment. The real copy of the register as
10483 a save will be done later, if needed. */
10484 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10485 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10488 }
10489
10490   /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10491      DRAP is needed and stack realignment is really needed after reload.  */
10492 if (stack_realign_drap)
10493 {
10494 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10495
10496 /* Only need to push parameter pointer reg if it is caller saved. */
10497 if (!call_used_regs[REGNO (crtl->drap_reg)])
10498 {
10499 /* Push arg pointer reg */
10500 insn = emit_insn (gen_push (crtl->drap_reg));
10501 RTX_FRAME_RELATED_P (insn) = 1;
10502 }
10503
10504 /* Grab the argument pointer. */
10505 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10506 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10507 RTX_FRAME_RELATED_P (insn) = 1;
10508 m->fs.cfa_reg = crtl->drap_reg;
10509 m->fs.cfa_offset = 0;
10510
10511 /* Align the stack. */
10512 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10513 stack_pointer_rtx,
10514 GEN_INT (-align_bytes)));
10515 RTX_FRAME_RELATED_P (insn) = 1;
10516
10517       /* Replicate the return address on the stack so that the return
10518 	 address can be reached via the (argp - 1) slot.  This is needed
10519 	 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10520 	 expand_builtin_return_addr, etc.  */
10521 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10522 t = gen_frame_mem (word_mode, t);
10523 insn = emit_insn (gen_push (t));
10524 RTX_FRAME_RELATED_P (insn) = 1;
10525
10526 /* For the purposes of frame and register save area addressing,
10527 we've started over with a new frame. */
10528 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10529 m->fs.realigned = true;
10530 }
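  /* For illustration only (not what this function literally emits): with
     -m32, %ecx as the DRAP register and a 16-byte alignment requirement,
     the sequence built above corresponds roughly to

	leal	4(%esp), %ecx		# DRAP = incoming argument pointer
	andl	$-16, %esp		# align the stack
	pushl	-4(%ecx)		# replicate the return address

     so the return address remains reachable at -4(%ecx) after the
     realignment.  The exact register and constants depend on the target
     and on the required alignment.  */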
10531
10532 int_registers_saved = (frame.nregs == 0);
10533 sse_registers_saved = (frame.nsseregs == 0);
10534
10535 if (frame_pointer_needed && !m->fs.fp_valid)
10536 {
10537 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10538 slower on all targets. Also sdb doesn't like it. */
10539 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10540 RTX_FRAME_RELATED_P (insn) = 1;
10541
10542 /* Push registers now, before setting the frame pointer
10543 on SEH target. */
10544 if (!int_registers_saved
10545 && TARGET_SEH
10546 && !frame.save_regs_using_mov)
10547 {
10548 ix86_emit_save_regs ();
10549 int_registers_saved = true;
10550 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10551 }
10552
10553 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10554 {
10555 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557
10558 if (m->fs.cfa_reg == stack_pointer_rtx)
10559 m->fs.cfa_reg = hard_frame_pointer_rtx;
10560 m->fs.fp_offset = m->fs.sp_offset;
10561 m->fs.fp_valid = true;
10562 }
10563 }
10564
10565 if (!int_registers_saved)
10566 {
10567 /* If saving registers via PUSH, do so now. */
10568 if (!frame.save_regs_using_mov)
10569 {
10570 ix86_emit_save_regs ();
10571 int_registers_saved = true;
10572 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10573 }
10574
10575       /* When using the red zone we may start register saving before allocating
10576 	 the stack frame, saving one cycle of the prologue.  However, avoid
10577 	 doing this if we have to probe the stack; at least on x86_64 the
10578 	 stack probe can turn into a call that clobbers a red zone location.  */
10579 else if (ix86_using_red_zone ()
10580 && (! TARGET_STACK_PROBE
10581 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10582 {
10583 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10584 int_registers_saved = true;
10585 }
10586 }
10587
10588 if (stack_realign_fp)
10589 {
10590 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10591 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10592
10593 /* The computation of the size of the re-aligned stack frame means
10594 that we must allocate the size of the register save area before
10595 performing the actual alignment. Otherwise we cannot guarantee
10596 that there's enough storage above the realignment point. */
10597 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10598 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10599 GEN_INT (m->fs.sp_offset
10600 - frame.sse_reg_save_offset),
10601 -1, false);
10602
10603 /* Align the stack. */
10604 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10605 stack_pointer_rtx,
10606 GEN_INT (-align_bytes)));
10607
10608 /* For the purposes of register save area addressing, the stack
10609 pointer is no longer valid. As for the value of sp_offset,
10610 see ix86_compute_frame_layout, which we need to match in order
10611 to pass verification of stack_pointer_offset at the end. */
10612 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10613 m->fs.sp_valid = false;
10614 }
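  /* A worked example of the adjustment above: with align_bytes == 32 and
     an sp_offset of 44, (44 + 32) & -32 == 64, which matches the
     conservative estimate made in ix86_compute_frame_layout.  */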
10615
10616 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10617
10618 if (flag_stack_usage_info)
10619 {
10620 /* We start to count from ARG_POINTER. */
10621 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10622
10623 /* If it was realigned, take into account the fake frame. */
10624 if (stack_realign_drap)
10625 {
10626 if (ix86_static_chain_on_stack)
10627 stack_size += UNITS_PER_WORD;
10628
10629 if (!call_used_regs[REGNO (crtl->drap_reg)])
10630 stack_size += UNITS_PER_WORD;
10631
10632 /* This over-estimates by 1 minimal-stack-alignment-unit but
10633 mitigates that by counting in the new return address slot. */
10634 current_function_dynamic_stack_size
10635 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10636 }
10637
10638 current_function_static_stack_size = stack_size;
10639 }
10640
10641 /* On SEH target with very large frame size, allocate an area to save
10642 SSE registers (as the very large allocation won't be described). */
10643 if (TARGET_SEH
10644 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10645 && !sse_registers_saved)
10646 {
10647 HOST_WIDE_INT sse_size =
10648 frame.sse_reg_save_offset - frame.reg_save_offset;
10649
10650 gcc_assert (int_registers_saved);
10651
10652 /* No need to do stack checking as the area will be immediately
10653 written. */
10654 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10655 GEN_INT (-sse_size), -1,
10656 m->fs.cfa_reg == stack_pointer_rtx);
10657 allocate -= sse_size;
10658 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10659 sse_registers_saved = true;
10660 }
10661
10662 /* The stack has already been decremented by the instruction calling us
10663 so probe if the size is non-negative to preserve the protection area. */
10664 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10665 {
10666 /* We expect the registers to be saved when probes are used. */
10667 gcc_assert (int_registers_saved);
10668
10669 if (STACK_CHECK_MOVING_SP)
10670 {
10671 ix86_adjust_stack_and_probe (allocate);
10672 allocate = 0;
10673 }
10674 else
10675 {
10676 HOST_WIDE_INT size = allocate;
10677
10678 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10679 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10680
10681 if (TARGET_STACK_PROBE)
10682 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10683 else
10684 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10685 }
10686 }
10687
10688 if (allocate == 0)
10689 ;
10690 else if (!ix86_target_stack_probe ()
10691 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10692 {
10693 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10694 GEN_INT (-allocate), -1,
10695 m->fs.cfa_reg == stack_pointer_rtx);
10696 }
10697 else
10698 {
10699 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10700 rtx r10 = NULL;
10701 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10702
10703 bool eax_live = false;
10704 bool r10_live = false;
10705
10706 if (TARGET_64BIT)
10707 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10708 if (!TARGET_64BIT_MS_ABI)
10709 eax_live = ix86_eax_live_at_start_p ();
10710
10711 if (eax_live)
10712 {
10713 emit_insn (gen_push (eax));
10714 allocate -= UNITS_PER_WORD;
10715 }
10716 if (r10_live)
10717 {
10718 r10 = gen_rtx_REG (Pmode, R10_REG);
10719 emit_insn (gen_push (r10));
10720 allocate -= UNITS_PER_WORD;
10721 }
10722
10723 emit_move_insn (eax, GEN_INT (allocate));
10724 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10725
10726 /* Use the fact that AX still contains ALLOCATE. */
10727 adjust_stack_insn = (Pmode == DImode
10728 ? gen_pro_epilogue_adjust_stack_di_sub
10729 : gen_pro_epilogue_adjust_stack_si_sub);
10730
10731 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10732 stack_pointer_rtx, eax));
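      /* A sketch of what the worker call plus the adjustment above amount
	 to on a 64-bit Windows-style target (assuming the mingw-w64 probe
	 helper ___chkstk_ms, which preserves %rax):

		movq	$ALLOCATE, %rax
		call	___chkstk_ms
		subq	%rax, %rsp  */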
10733
10734 /* Note that SEH directives need to continue tracking the stack
10735 pointer even after the frame pointer has been set up. */
10736 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10737 {
10738 if (m->fs.cfa_reg == stack_pointer_rtx)
10739 m->fs.cfa_offset += allocate;
10740
10741 RTX_FRAME_RELATED_P (insn) = 1;
10742 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10743 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10744 plus_constant (Pmode, stack_pointer_rtx,
10745 -allocate)));
10746 }
10747 m->fs.sp_offset += allocate;
10748
10749 if (r10_live && eax_live)
10750 {
10751 t = choose_baseaddr (m->fs.sp_offset - allocate);
10752 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10753 gen_frame_mem (word_mode, t));
10754 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10755 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10756 gen_frame_mem (word_mode, t));
10757 }
10758 else if (eax_live || r10_live)
10759 {
10760 t = choose_baseaddr (m->fs.sp_offset - allocate);
10761 emit_move_insn (gen_rtx_REG (word_mode,
10762 (eax_live ? AX_REG : R10_REG)),
10763 gen_frame_mem (word_mode, t));
10764 }
10765 }
10766 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10767
10768   /* If we haven't already set up the frame pointer, do so now.  */
10769 if (frame_pointer_needed && !m->fs.fp_valid)
10770 {
10771 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10772 GEN_INT (frame.stack_pointer_offset
10773 - frame.hard_frame_pointer_offset));
10774 insn = emit_insn (insn);
10775 RTX_FRAME_RELATED_P (insn) = 1;
10776 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10777
10778 if (m->fs.cfa_reg == stack_pointer_rtx)
10779 m->fs.cfa_reg = hard_frame_pointer_rtx;
10780 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10781 m->fs.fp_valid = true;
10782 }
10783
10784 if (!int_registers_saved)
10785 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10786 if (!sse_registers_saved)
10787 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10788
10789 pic_reg_used = false;
10790 if (pic_offset_table_rtx
10791 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10792 || crtl->profile))
10793 {
10794 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10795
10796 if (alt_pic_reg_used != INVALID_REGNUM)
10797 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10798
10799 pic_reg_used = true;
10800 }
10801
10802 if (pic_reg_used)
10803 {
10804 if (TARGET_64BIT)
10805 {
10806 if (ix86_cmodel == CM_LARGE_PIC)
10807 {
10808 rtx label, tmp_reg;
10809
10810 gcc_assert (Pmode == DImode);
10811 label = gen_label_rtx ();
10812 emit_label (label);
10813 LABEL_PRESERVE_P (label) = 1;
10814 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10815 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10816 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10817 label));
10818 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10819 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10820 pic_offset_table_rtx, tmp_reg));
10821 }
10822 else
10823 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10824 }
10825 else
10826 {
10827 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10828 RTX_FRAME_RELATED_P (insn) = 1;
10829 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10830 }
10831 }
10832
10833 /* In the pic_reg_used case, make sure that the got load isn't deleted
10834 when mcount needs it. Blockage to avoid call movement across mcount
10835 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10836 note. */
10837 if (crtl->profile && !flag_fentry && pic_reg_used)
10838 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10839
10840 if (crtl->drap_reg && !crtl->stack_realign_needed)
10841 {
10842       /* vDRAP is set up, but after reload it turns out stack realignment
10843 	 isn't necessary; here we emit the prologue to set up DRAP
10844 	 without the stack realignment adjustment.  */
10845 t = choose_baseaddr (0);
10846 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10847 }
10848
10849   /* Prevent instructions from being scheduled into the register save push
10850      sequence when access to the red zone area is done through the frame
10851      pointer.  The offset between the frame pointer and the stack pointer is
10852      calculated relative to the value of the stack pointer at the end of the
10853      function prologue, and moving instructions that access the red zone area
10854      via the frame pointer inside the push sequence violates this assumption.  */
10855 if (frame_pointer_needed && frame.red_zone_size)
10856 emit_insn (gen_memory_blockage ());
10857
10858 /* Emit cld instruction if stringops are used in the function. */
10859 if (TARGET_CLD && ix86_current_function_needs_cld)
10860 emit_insn (gen_cld ());
10861
10862 /* SEH requires that the prologue end within 256 bytes of the start of
10863 the function. Prevent instruction schedules that would extend that.
10864 Further, prevent alloca modifications to the stack pointer from being
10865 combined with prologue modifications. */
10866 if (TARGET_SEH)
10867 emit_insn (gen_prologue_use (stack_pointer_rtx));
10868 }
10869
10870 /* Emit code to restore REG using a POP insn. */
10871
10872 static void
10873 ix86_emit_restore_reg_using_pop (rtx reg)
10874 {
10875 struct machine_function *m = cfun->machine;
10876 rtx insn = emit_insn (gen_pop (reg));
10877
10878 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10879 m->fs.sp_offset -= UNITS_PER_WORD;
10880
10881 if (m->fs.cfa_reg == crtl->drap_reg
10882 && REGNO (reg) == REGNO (crtl->drap_reg))
10883 {
10884 /* Previously we'd represented the CFA as an expression
10885 like *(%ebp - 8). We've just popped that value from
10886 the stack, which means we need to reset the CFA to
10887 the drap register. This will remain until we restore
10888 the stack pointer. */
10889 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10890 RTX_FRAME_RELATED_P (insn) = 1;
10891
10892 /* This means that the DRAP register is valid for addressing too. */
10893 m->fs.drap_valid = true;
10894 return;
10895 }
10896
10897 if (m->fs.cfa_reg == stack_pointer_rtx)
10898 {
10899 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10900 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10901 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10902 RTX_FRAME_RELATED_P (insn) = 1;
10903
10904 m->fs.cfa_offset -= UNITS_PER_WORD;
10905 }
10906
10907 /* When the frame pointer is the CFA, and we pop it, we are
10908 swapping back to the stack pointer as the CFA. This happens
10909 for stack frames that don't allocate other data, so we assume
10910 the stack pointer is now pointing at the return address, i.e.
10911 the function entry state, which makes the offset be 1 word. */
10912 if (reg == hard_frame_pointer_rtx)
10913 {
10914 m->fs.fp_valid = false;
10915 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10916 {
10917 m->fs.cfa_reg = stack_pointer_rtx;
10918 m->fs.cfa_offset -= UNITS_PER_WORD;
10919
10920 add_reg_note (insn, REG_CFA_DEF_CFA,
10921 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10922 GEN_INT (m->fs.cfa_offset)));
10923 RTX_FRAME_RELATED_P (insn) = 1;
10924 }
10925 }
10926 }
10927
10928 /* Emit code to restore saved registers using POP insns. */
10929
10930 static void
10931 ix86_emit_restore_regs_using_pop (void)
10932 {
10933 unsigned int regno;
10934
10935 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10936 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10937 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10938 }
10939
10940 /* Emit code and notes for the LEAVE instruction. */
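/* (For reference: "leave" is equivalent to "movl %ebp, %esp" followed by
   "popl %ebp", so afterwards the stack pointer is valid again, one word
   above the slot where the frame pointer was saved; that is exactly the
   state update recorded below.)  */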
10941
10942 static void
10943 ix86_emit_leave (void)
10944 {
10945 struct machine_function *m = cfun->machine;
10946 rtx insn = emit_insn (ix86_gen_leave ());
10947
10948 ix86_add_queued_cfa_restore_notes (insn);
10949
10950 gcc_assert (m->fs.fp_valid);
10951 m->fs.sp_valid = true;
10952 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10953 m->fs.fp_valid = false;
10954
10955 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10956 {
10957 m->fs.cfa_reg = stack_pointer_rtx;
10958 m->fs.cfa_offset = m->fs.sp_offset;
10959
10960 add_reg_note (insn, REG_CFA_DEF_CFA,
10961 plus_constant (Pmode, stack_pointer_rtx,
10962 m->fs.sp_offset));
10963 RTX_FRAME_RELATED_P (insn) = 1;
10964 }
10965 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10966 m->fs.fp_offset);
10967 }
10968
10969 /* Emit code to restore saved registers using MOV insns.
10970 First register is restored from CFA - CFA_OFFSET. */
10971 static void
10972 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10973 bool maybe_eh_return)
10974 {
10975 struct machine_function *m = cfun->machine;
10976 unsigned int regno;
10977
10978 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10979 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10980 {
10981 rtx reg = gen_rtx_REG (word_mode, regno);
10982 rtx insn, mem;
10983
10984 mem = choose_baseaddr (cfa_offset);
10985 mem = gen_frame_mem (word_mode, mem);
10986 insn = emit_move_insn (reg, mem);
10987
10988 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10989 {
10990 	  /* Previously we'd represented the CFA as an expression
10991 	     like *(%ebp - 8).  We've just reloaded that value from
10992 	     the stack, which means we need to reset the CFA to
10993 	     the drap register.  This will remain until we restore
10994 	     the stack pointer.  */
10995 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10996 RTX_FRAME_RELATED_P (insn) = 1;
10997
10998 /* This means that the DRAP register is valid for addressing. */
10999 m->fs.drap_valid = true;
11000 }
11001 else
11002 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11003
11004 cfa_offset -= UNITS_PER_WORD;
11005 }
11006 }
11007
11008 /* Emit code to restore saved SSE registers using MOV insns.
11009 First register is restored from CFA - CFA_OFFSET. */
11010 static void
11011 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11012 bool maybe_eh_return)
11013 {
11014 unsigned int regno;
11015
11016 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11017 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11018 {
11019 rtx reg = gen_rtx_REG (V4SFmode, regno);
11020 rtx mem;
11021
11022 mem = choose_baseaddr (cfa_offset);
11023 mem = gen_rtx_MEM (V4SFmode, mem);
11024 set_mem_align (mem, 128);
11025 emit_move_insn (reg, mem);
11026
11027 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11028
11029 cfa_offset -= 16;
11030 }
11031 }
11032
11033 /* Emit vzeroupper if needed. */
11034
11035 void
11036 ix86_maybe_emit_epilogue_vzeroupper (void)
11037 {
11038 if (TARGET_VZEROUPPER
11039 && !TREE_THIS_VOLATILE (cfun->decl)
11040 && !cfun->machine->caller_return_avx256_p)
11041 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
11042 }
11043
11044 /* Restore function stack, frame, and registers. */
11045
11046 void
11047 ix86_expand_epilogue (int style)
11048 {
11049 struct machine_function *m = cfun->machine;
11050 struct machine_frame_state frame_state_save = m->fs;
11051 struct ix86_frame frame;
11052 bool restore_regs_via_mov;
11053 bool using_drap;
11054
11055 ix86_finalize_stack_realign_flags ();
11056 ix86_compute_frame_layout (&frame);
11057
11058 m->fs.sp_valid = (!frame_pointer_needed
11059 || (crtl->sp_is_unchanging
11060 && !stack_realign_fp));
11061 gcc_assert (!m->fs.sp_valid
11062 || m->fs.sp_offset == frame.stack_pointer_offset);
11063
11064 /* The FP must be valid if the frame pointer is present. */
11065 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11066 gcc_assert (!m->fs.fp_valid
11067 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11068
11069 /* We must have *some* valid pointer to the stack frame. */
11070 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11071
11072 /* The DRAP is never valid at this point. */
11073 gcc_assert (!m->fs.drap_valid);
11074
11075 /* See the comment about red zone and frame
11076 pointer usage in ix86_expand_prologue. */
11077 if (frame_pointer_needed && frame.red_zone_size)
11078 emit_insn (gen_memory_blockage ());
11079
11080 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11081 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11082
11083 /* Determine the CFA offset of the end of the red-zone. */
11084 m->fs.red_zone_offset = 0;
11085 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11086 {
11087 /* The red-zone begins below the return address. */
11088 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11089
11090 /* When the register save area is in the aligned portion of
11091 the stack, determine the maximum runtime displacement that
11092 matches up with the aligned frame. */
11093 if (stack_realign_drap)
11094 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11095 + UNITS_PER_WORD);
11096 }
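  /* For example, with the usual x86-64 values (RED_ZONE_SIZE == 128,
     UNITS_PER_WORD == 8) and no DRAP, red_zone_offset is 136: the
     return-address slot plus the 128-byte red zone below the CFA.  */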
11097
11098 /* Special care must be taken for the normal return case of a function
11099 using eh_return: the eax and edx registers are marked as saved, but
11100 not restored along this path. Adjust the save location to match. */
11101 if (crtl->calls_eh_return && style != 2)
11102 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11103
11104 /* EH_RETURN requires the use of moves to function properly. */
11105 if (crtl->calls_eh_return)
11106 restore_regs_via_mov = true;
11107 /* SEH requires the use of pops to identify the epilogue. */
11108 else if (TARGET_SEH)
11109 restore_regs_via_mov = false;
11110   /* If we're only restoring one register and sp is not valid, then
11111      use a move instruction to restore the register, since it's
11112      less work than reloading sp and popping the register.  */
11113 else if (!m->fs.sp_valid && frame.nregs <= 1)
11114 restore_regs_via_mov = true;
11115 else if (TARGET_EPILOGUE_USING_MOVE
11116 && cfun->machine->use_fast_prologue_epilogue
11117 && (frame.nregs > 1
11118 || m->fs.sp_offset != frame.reg_save_offset))
11119 restore_regs_via_mov = true;
11120 else if (frame_pointer_needed
11121 && !frame.nregs
11122 && m->fs.sp_offset != frame.reg_save_offset)
11123 restore_regs_via_mov = true;
11124 else if (frame_pointer_needed
11125 && TARGET_USE_LEAVE
11126 && cfun->machine->use_fast_prologue_epilogue
11127 && frame.nregs == 1)
11128 restore_regs_via_mov = true;
11129 else
11130 restore_regs_via_mov = false;
11131
11132 if (restore_regs_via_mov || frame.nsseregs)
11133 {
11134 /* Ensure that the entire register save area is addressable via
11135 the stack pointer, if we will restore via sp. */
11136 if (TARGET_64BIT
11137 && m->fs.sp_offset > 0x7fffffff
11138 && !(m->fs.fp_valid || m->fs.drap_valid)
11139 && (frame.nsseregs + frame.nregs) != 0)
11140 {
11141 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11142 GEN_INT (m->fs.sp_offset
11143 - frame.sse_reg_save_offset),
11144 style,
11145 m->fs.cfa_reg == stack_pointer_rtx);
11146 }
11147 }
11148
11149 /* If there are any SSE registers to restore, then we have to do it
11150 via moves, since there's obviously no pop for SSE regs. */
11151 if (frame.nsseregs)
11152 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11153 style == 2);
11154
11155 if (restore_regs_via_mov)
11156 {
11157 rtx t;
11158
11159 if (frame.nregs)
11160 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11161
11162 /* eh_return epilogues need %ecx added to the stack pointer. */
11163 if (style == 2)
11164 {
11165 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11166
11167 /* Stack align doesn't work with eh_return. */
11168 gcc_assert (!stack_realign_drap);
11169 	  /* Neither do regparm nested functions.  */
11170 gcc_assert (!ix86_static_chain_on_stack);
11171
11172 if (frame_pointer_needed)
11173 {
11174 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11175 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11176 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11177
11178 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11179 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11180
11181 /* Note that we use SA as a temporary CFA, as the return
11182 address is at the proper place relative to it. We
11183 pretend this happens at the FP restore insn because
11184 prior to this insn the FP would be stored at the wrong
11185 offset relative to SA, and after this insn we have no
11186 other reasonable register to use for the CFA. We don't
11187 bother resetting the CFA to the SP for the duration of
11188 the return insn. */
11189 add_reg_note (insn, REG_CFA_DEF_CFA,
11190 plus_constant (Pmode, sa, UNITS_PER_WORD));
11191 ix86_add_queued_cfa_restore_notes (insn);
11192 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11193 RTX_FRAME_RELATED_P (insn) = 1;
11194
11195 m->fs.cfa_reg = sa;
11196 m->fs.cfa_offset = UNITS_PER_WORD;
11197 m->fs.fp_valid = false;
11198
11199 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11200 const0_rtx, style, false);
11201 }
11202 else
11203 {
11204 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11205 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11206 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11207 ix86_add_queued_cfa_restore_notes (insn);
11208
11209 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11210 if (m->fs.cfa_offset != UNITS_PER_WORD)
11211 {
11212 m->fs.cfa_offset = UNITS_PER_WORD;
11213 add_reg_note (insn, REG_CFA_DEF_CFA,
11214 plus_constant (Pmode, stack_pointer_rtx,
11215 UNITS_PER_WORD));
11216 RTX_FRAME_RELATED_P (insn) = 1;
11217 }
11218 }
11219 m->fs.sp_offset = UNITS_PER_WORD;
11220 m->fs.sp_valid = true;
11221 }
11222 }
11223 else
11224 {
11225 /* SEH requires that the function end with (1) a stack adjustment
11226 if necessary, (2) a sequence of pops, and (3) a return or
11227 jump instruction. Prevent insns from the function body from
11228 being scheduled into this sequence. */
11229 if (TARGET_SEH)
11230 {
11231 /* Prevent a catch region from being adjacent to the standard
11232 	     epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
11233 	     several other flags that would be interesting to test are
11234 	     yet set up.  */
11235 if (flag_non_call_exceptions)
11236 emit_insn (gen_nops (const1_rtx));
11237 else
11238 emit_insn (gen_blockage ());
11239 }
11240
11241       /* The first step is to deallocate the stack frame so that we can
11242 	 pop the registers.  Also do it on SEH targets for very large
11243 	 frames, as the emitted instructions aren't allowed by the ABI in
11244 	 epilogues.  */
11245 if (!m->fs.sp_valid
11246 || (TARGET_SEH
11247 && (m->fs.sp_offset - frame.reg_save_offset
11248 >= SEH_MAX_FRAME_SIZE)))
11249 {
11250 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11251 GEN_INT (m->fs.fp_offset
11252 - frame.reg_save_offset),
11253 style, false);
11254 }
11255 else if (m->fs.sp_offset != frame.reg_save_offset)
11256 {
11257 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11258 GEN_INT (m->fs.sp_offset
11259 - frame.reg_save_offset),
11260 style,
11261 m->fs.cfa_reg == stack_pointer_rtx);
11262 }
11263
11264 ix86_emit_restore_regs_using_pop ();
11265 }
11266
11267   /* If we used a frame pointer and haven't already got rid of it,
11268      then do so now.  */
11269 if (m->fs.fp_valid)
11270 {
11271 /* If the stack pointer is valid and pointing at the frame
11272 pointer store address, then we only need a pop. */
11273 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11274 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11275 /* Leave results in shorter dependency chains on CPUs that are
11276 able to grok it fast. */
11277 else if (TARGET_USE_LEAVE
11278 || optimize_function_for_size_p (cfun)
11279 || !cfun->machine->use_fast_prologue_epilogue)
11280 ix86_emit_leave ();
11281 else
11282 {
11283 pro_epilogue_adjust_stack (stack_pointer_rtx,
11284 hard_frame_pointer_rtx,
11285 const0_rtx, style, !using_drap);
11286 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11287 }
11288 }
11289
11290 if (using_drap)
11291 {
11292 int param_ptr_offset = UNITS_PER_WORD;
11293 rtx insn;
11294
11295 gcc_assert (stack_realign_drap);
11296
11297 if (ix86_static_chain_on_stack)
11298 param_ptr_offset += UNITS_PER_WORD;
11299 if (!call_used_regs[REGNO (crtl->drap_reg)])
11300 param_ptr_offset += UNITS_PER_WORD;
11301
11302 insn = emit_insn (gen_rtx_SET
11303 (VOIDmode, stack_pointer_rtx,
11304 gen_rtx_PLUS (Pmode,
11305 crtl->drap_reg,
11306 GEN_INT (-param_ptr_offset))));
11307 m->fs.cfa_reg = stack_pointer_rtx;
11308 m->fs.cfa_offset = param_ptr_offset;
11309 m->fs.sp_offset = param_ptr_offset;
11310 m->fs.realigned = false;
11311
11312 add_reg_note (insn, REG_CFA_DEF_CFA,
11313 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11314 GEN_INT (param_ptr_offset)));
11315 RTX_FRAME_RELATED_P (insn) = 1;
11316
11317 if (!call_used_regs[REGNO (crtl->drap_reg)])
11318 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11319 }
11320
11321 /* At this point the stack pointer must be valid, and we must have
11322 restored all of the registers. We may not have deallocated the
11323 entire stack frame. We've delayed this until now because it may
11324 be possible to merge the local stack deallocation with the
11325 deallocation forced by ix86_static_chain_on_stack. */
11326 gcc_assert (m->fs.sp_valid);
11327 gcc_assert (!m->fs.fp_valid);
11328 gcc_assert (!m->fs.realigned);
11329 if (m->fs.sp_offset != UNITS_PER_WORD)
11330 {
11331 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11332 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11333 style, true);
11334 }
11335 else
11336 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11337
11338 /* Sibcall epilogues don't want a return instruction. */
11339 if (style == 0)
11340 {
11341 m->fs = frame_state_save;
11342 return;
11343 }
11344
11345 /* Emit vzeroupper if needed. */
11346 ix86_maybe_emit_epilogue_vzeroupper ();
11347
11348 if (crtl->args.pops_args && crtl->args.size)
11349 {
11350 rtx popc = GEN_INT (crtl->args.pops_args);
11351
11352       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11353 	 address, do an explicit add, and jump indirectly to the caller.  */
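      /* A sketch of the large-pops_args sequence emitted below, assuming a
	 hypothetical pops_args of 70000 on ia32:

		popl	%ecx		# fetch the return address
		addl	$70000, %esp	# pop the incoming arguments
		jmp	*%ecx		# return to the caller

	 The common case simply uses "ret $n", whose immediate is limited
	 to 16 bits.  */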
11354
11355 if (crtl->args.pops_args >= 65536)
11356 {
11357 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11358 rtx insn;
11359
11360 /* There is no "pascal" calling convention in any 64bit ABI. */
11361 gcc_assert (!TARGET_64BIT);
11362
11363 insn = emit_insn (gen_pop (ecx));
11364 m->fs.cfa_offset -= UNITS_PER_WORD;
11365 m->fs.sp_offset -= UNITS_PER_WORD;
11366
11367 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11368 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11369 add_reg_note (insn, REG_CFA_REGISTER,
11370 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11371 RTX_FRAME_RELATED_P (insn) = 1;
11372
11373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11374 popc, -1, true);
11375 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11376 }
11377 else
11378 emit_jump_insn (gen_simple_return_pop_internal (popc));
11379 }
11380 else
11381 emit_jump_insn (gen_simple_return_internal ());
11382
11383 /* Restore the state back to the state from the prologue,
11384 so that it's correct for the next epilogue. */
11385 m->fs = frame_state_save;
11386 }
11387
11388 /* Reset from the function's potential modifications. */
11389
11390 static void
11391 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11392 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11393 {
11394 if (pic_offset_table_rtx)
11395 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11396 #if TARGET_MACHO
11397 /* Mach-O doesn't support labels at the end of objects, so if
11398 it looks like we might want one, insert a NOP. */
11399 {
11400 rtx insn = get_last_insn ();
11401 rtx deleted_debug_label = NULL_RTX;
11402 while (insn
11403 && NOTE_P (insn)
11404 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11405 {
11406 	/* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11407 	   notes; instead set their CODE_LABEL_NUMBER to -1;
11408 	   otherwise there would be code generation differences
11409 	   between -g and -g0.  */
11410 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11411 deleted_debug_label = insn;
11412 insn = PREV_INSN (insn);
11413 }
11414 if (insn
11415 && (LABEL_P (insn)
11416 || (NOTE_P (insn)
11417 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11418 fputs ("\tnop\n", file);
11419 else if (deleted_debug_label)
11420 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11421 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11422 CODE_LABEL_NUMBER (insn) = -1;
11423 }
11424 #endif
11425
11426 }
11427
11428 /* Return a scratch register to use in the split stack prologue. The
11429 split stack prologue is used for -fsplit-stack. It is the first
11430 instructions in the function, even before the regular prologue.
11431 The scratch register can be any caller-saved register which is not
11432 used for parameters or for the static chain. */
11433
11434 static unsigned int
11435 split_stack_prologue_scratch_regno (void)
11436 {
11437 if (TARGET_64BIT)
11438 return R11_REG;
11439 else
11440 {
11441 bool is_fastcall;
11442 int regparm;
11443
11444 is_fastcall = (lookup_attribute ("fastcall",
11445 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11446 != NULL);
11447 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11448
11449 if (is_fastcall)
11450 {
11451 if (DECL_STATIC_CHAIN (cfun->decl))
11452 {
11453 sorry ("-fsplit-stack does not support fastcall with "
11454 "nested function");
11455 return INVALID_REGNUM;
11456 }
11457 return AX_REG;
11458 }
11459 else if (regparm < 3)
11460 {
11461 if (!DECL_STATIC_CHAIN (cfun->decl))
11462 return CX_REG;
11463 else
11464 {
11465 if (regparm >= 2)
11466 {
11467 		  sorry ("-fsplit-stack does not support 2 register "
11468 			 "parameters for a nested function");
11469 return INVALID_REGNUM;
11470 }
11471 return DX_REG;
11472 }
11473 }
11474 else
11475 {
11476 /* FIXME: We could make this work by pushing a register
11477 around the addition and comparison. */
11478 sorry ("-fsplit-stack does not support 3 register parameters");
11479 return INVALID_REGNUM;
11480 }
11481 }
11482 }
11483
11484 /* A SYMBOL_REF for the function which allocates new stack space for
11485 -fsplit-stack. */
11486
11487 static GTY(()) rtx split_stack_fn;
11488
11489 /* A SYMBOL_REF for the more stack function when using the large
11490 model. */
11491
11492 static GTY(()) rtx split_stack_fn_large;
11493
11494 /* Handle -fsplit-stack. These are the first instructions in the
11495 function, even before the regular prologue. */
11496
11497 void
11498 ix86_expand_split_stack_prologue (void)
11499 {
11500 struct ix86_frame frame;
11501 HOST_WIDE_INT allocate;
11502 unsigned HOST_WIDE_INT args_size;
11503 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11504 rtx scratch_reg = NULL_RTX;
11505 rtx varargs_label = NULL_RTX;
11506 rtx fn;
11507
11508 gcc_assert (flag_split_stack && reload_completed);
11509
11510 ix86_finalize_stack_realign_flags ();
11511 ix86_compute_frame_layout (&frame);
11512 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11513
11514 /* This is the label we will branch to if we have enough stack
11515 space. We expect the basic block reordering pass to reverse this
11516 branch if optimizing, so that we branch in the unlikely case. */
11517 label = gen_label_rtx ();
11518
11519 /* We need to compare the stack pointer minus the frame size with
11520 the stack boundary in the TCB. The stack boundary always gives
11521 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11522 can compare directly. Otherwise we need to do an addition. */
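  /* For illustration only, assuming the glibc TCB layout used by
     -fsplit-stack on x86-64 (limit at %fs:0x70) and %r11 as the scratch
     register, the code below boils down to either

	cmpq	%fs:0x70, %rsp			# small frame
     or
	leaq	-FRAME(%rsp), %r11
	cmpq	%fs:0x70, %r11			# large frame

     followed by a conditional jump around the __morestack call when
     there is enough room.  FRAME here is just a placeholder for the
     required frame size.  */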
11523
11524 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11525 UNSPEC_STACK_CHECK);
11526 limit = gen_rtx_CONST (Pmode, limit);
11527 limit = gen_rtx_MEM (Pmode, limit);
11528 if (allocate < SPLIT_STACK_AVAILABLE)
11529 current = stack_pointer_rtx;
11530 else
11531 {
11532 unsigned int scratch_regno;
11533 rtx offset;
11534
11535 /* We need a scratch register to hold the stack pointer minus
11536 the required frame size. Since this is the very start of the
11537 function, the scratch register can be any caller-saved
11538 register which is not used for parameters. */
11539 offset = GEN_INT (- allocate);
11540 scratch_regno = split_stack_prologue_scratch_regno ();
11541 if (scratch_regno == INVALID_REGNUM)
11542 return;
11543 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11544 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11545 {
11546 /* We don't use ix86_gen_add3 in this case because it will
11547 want to split to lea, but when not optimizing the insn
11548 will not be split after this point. */
11549 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11550 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11551 offset)));
11552 }
11553 else
11554 {
11555 emit_move_insn (scratch_reg, offset);
11556 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11557 stack_pointer_rtx));
11558 }
11559 current = scratch_reg;
11560 }
11561
11562 ix86_expand_branch (GEU, current, limit, label);
11563 jump_insn = get_last_insn ();
11564 JUMP_LABEL (jump_insn) = label;
11565
11566 /* Mark the jump as very likely to be taken. */
11567 add_reg_note (jump_insn, REG_BR_PROB,
11568 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11569
11570 if (split_stack_fn == NULL_RTX)
11571 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11572 fn = split_stack_fn;
11573
11574 /* Get more stack space. We pass in the desired stack space and the
11575 size of the arguments to copy to the new stack. In 32-bit mode
11576 we push the parameters; __morestack will return on a new stack
11577 anyhow. In 64-bit mode we pass the parameters in r10 and
11578 r11. */
11579 allocate_rtx = GEN_INT (allocate);
11580 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11581 call_fusage = NULL_RTX;
11582 if (TARGET_64BIT)
11583 {
11584 rtx reg10, reg11;
11585
11586 reg10 = gen_rtx_REG (Pmode, R10_REG);
11587 reg11 = gen_rtx_REG (Pmode, R11_REG);
11588
11589 /* If this function uses a static chain, it will be in %r10.
11590 Preserve it across the call to __morestack. */
11591 if (DECL_STATIC_CHAIN (cfun->decl))
11592 {
11593 rtx rax;
11594
11595 rax = gen_rtx_REG (word_mode, AX_REG);
11596 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11597 use_reg (&call_fusage, rax);
11598 }
11599
11600 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11601 {
11602 HOST_WIDE_INT argval;
11603
11604 gcc_assert (Pmode == DImode);
11605 /* When using the large model we need to load the address
11606 into a register, and we've run out of registers. So we
11607 switch to a different calling convention, and we call a
11608 	     different function: __morestack_large_model.  We pass the
11609 argument size in the upper 32 bits of r10 and pass the
11610 frame size in the lower 32 bits. */
11611 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11612 gcc_assert ((args_size & 0xffffffff) == args_size);
11613
11614 if (split_stack_fn_large == NULL_RTX)
11615 split_stack_fn_large =
11616 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11617
11618 if (ix86_cmodel == CM_LARGE_PIC)
11619 {
11620 rtx label, x;
11621
11622 label = gen_label_rtx ();
11623 emit_label (label);
11624 LABEL_PRESERVE_P (label) = 1;
11625 emit_insn (gen_set_rip_rex64 (reg10, label));
11626 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11627 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11628 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11629 UNSPEC_GOT);
11630 x = gen_rtx_CONST (Pmode, x);
11631 emit_move_insn (reg11, x);
11632 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11633 x = gen_const_mem (Pmode, x);
11634 emit_move_insn (reg11, x);
11635 }
11636 else
11637 emit_move_insn (reg11, split_stack_fn_large);
11638
11639 fn = reg11;
11640
11641 argval = ((args_size << 16) << 16) + allocate;
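	  /* E.g. args_size == 0x20 and allocate == 0x1000 give
	     argval == 0x0000002000001000: the argument size in the upper
	     32 bits and the frame size in the lower 32 bits.  */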
11642 emit_move_insn (reg10, GEN_INT (argval));
11643 }
11644 else
11645 {
11646 emit_move_insn (reg10, allocate_rtx);
11647 emit_move_insn (reg11, GEN_INT (args_size));
11648 use_reg (&call_fusage, reg11);
11649 }
11650
11651 use_reg (&call_fusage, reg10);
11652 }
11653 else
11654 {
11655 emit_insn (gen_push (GEN_INT (args_size)));
11656 emit_insn (gen_push (allocate_rtx));
11657 }
11658 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11659 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11660 NULL_RTX, false);
11661 add_function_usage_to (call_insn, call_fusage);
11662
11663 /* In order to make call/return prediction work right, we now need
11664 to execute a return instruction. See
11665 libgcc/config/i386/morestack.S for the details on how this works.
11666
11667 For flow purposes gcc must not see this as a return
11668 instruction--we need control flow to continue at the subsequent
11669 label. Therefore, we use an unspec. */
11670 gcc_assert (crtl->args.pops_args < 65536);
11671 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11672
11673 /* If we are in 64-bit mode and this function uses a static chain,
11674      we saved %r10 in %rax before calling __morestack.  */
11675 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11676 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11677 gen_rtx_REG (word_mode, AX_REG));
11678
11679 /* If this function calls va_start, we need to store a pointer to
11680 the arguments on the old stack, because they may not have been
11681 all copied to the new stack. At this point the old stack can be
11682 found at the frame pointer value used by __morestack, because
11683 __morestack has set that up before calling back to us. Here we
11684 store that pointer in a scratch register, and in
11685 ix86_expand_prologue we store the scratch register in a stack
11686 slot. */
11687 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11688 {
11689 unsigned int scratch_regno;
11690 rtx frame_reg;
11691 int words;
11692
11693 scratch_regno = split_stack_prologue_scratch_regno ();
11694 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11695 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11696
11697 /* 64-bit:
11698 fp -> old fp value
11699 return address within this function
11700 return address of caller of this function
11701 stack arguments
11702 So we add three words to get to the stack arguments.
11703
11704 32-bit:
11705 fp -> old fp value
11706 return address within this function
11707 first argument to __morestack
11708 second argument to __morestack
11709 return address of caller of this function
11710 stack arguments
11711 So we add five words to get to the stack arguments.
11712 */
11713 words = TARGET_64BIT ? 3 : 5;
11714 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11715 gen_rtx_PLUS (Pmode, frame_reg,
11716 GEN_INT (words * UNITS_PER_WORD))));
11717
11718 varargs_label = gen_label_rtx ();
11719 emit_jump_insn (gen_jump (varargs_label));
11720 JUMP_LABEL (get_last_insn ()) = varargs_label;
11721
11722 emit_barrier ();
11723 }
11724
11725 emit_label (label);
11726 LABEL_NUSES (label) = 1;
11727
11728 /* If this function calls va_start, we now have to set the scratch
11729 register for the case where we do not call __morestack. In this
11730 case we need to set it based on the stack pointer. */
11731 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11732 {
11733 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11734 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11735 GEN_INT (UNITS_PER_WORD))));
11736
11737 emit_label (varargs_label);
11738 LABEL_NUSES (varargs_label) = 1;
11739 }
11740 }
11741
11742 /* We may have to tell the dataflow pass that the split stack prologue
11743 is initializing a scratch register. */
11744
11745 static void
11746 ix86_live_on_entry (bitmap regs)
11747 {
11748 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11749 {
11750 gcc_assert (flag_split_stack);
11751 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11752 }
11753 }
11754 \f
11755 /* Determine if OP is a suitable SUBREG RTX for an address.  */
11756
11757 static bool
11758 ix86_address_subreg_operand (rtx op)
11759 {
11760 enum machine_mode mode;
11761
11762 if (!REG_P (op))
11763 return false;
11764
11765 mode = GET_MODE (op);
11766
11767 if (GET_MODE_CLASS (mode) != MODE_INT)
11768 return false;
11769
11770 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11771 failures when the register is one word out of a two word structure. */
11772 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11773 return false;
11774
11775 /* simplify_subreg does not handle stack pointer. */
11776 if (REGNO (op) == STACK_POINTER_REGNUM)
11777 return false;
11778
11779 /* Allow only SUBREGs of non-eliminable hard registers. */
11780 return register_no_elim_operand (op, mode);
11781 }
11782
11783 /* Extract the parts of an RTL expression that is a valid memory address
11784 for an instruction. Return 0 if the structure of the address is
11785    grossly off.  Return -1 if the address contains ASHIFT, so it is not
11786    strictly valid, but still used for computing the length of the lea insn.  */
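/* For illustration (a hypothetical operand): the address
   (plus (plus (mult (reg %esi) (const_int 4)) (reg %ebx)) (const_int 12)),
   i.e. 12(%ebx,%esi,4), decomposes into base = %ebx, index = %esi,
   scale = 4, disp = 12 and seg = SEG_DEFAULT.  */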
11787
11788 int
11789 ix86_decompose_address (rtx addr, struct ix86_address *out)
11790 {
11791 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11792 rtx base_reg, index_reg;
11793 HOST_WIDE_INT scale = 1;
11794 rtx scale_rtx = NULL_RTX;
11795 rtx tmp;
11796 int retval = 1;
11797 enum ix86_address_seg seg = SEG_DEFAULT;
11798
11799   /* Allow zero-extended SImode addresses;
11800      they will be emitted with the addr32 prefix.  */
11801 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11802 {
11803 if (GET_CODE (addr) == ZERO_EXTEND
11804 && GET_MODE (XEXP (addr, 0)) == SImode)
11805 {
11806 addr = XEXP (addr, 0);
11807 if (CONST_INT_P (addr))
11808 return 0;
11809 }
11810 else if (GET_CODE (addr) == AND
11811 && const_32bit_mask (XEXP (addr, 1), DImode))
11812 {
11813 addr = XEXP (addr, 0);
11814
11815 /* Adjust SUBREGs. */
11816 if (GET_CODE (addr) == SUBREG
11817 && GET_MODE (SUBREG_REG (addr)) == SImode)
11818 {
11819 addr = SUBREG_REG (addr);
11820 if (CONST_INT_P (addr))
11821 return 0;
11822 }
11823 else if (GET_MODE (addr) == DImode)
11824 addr = gen_rtx_SUBREG (SImode, addr, 0);
11825 else if (GET_MODE (addr) != VOIDmode)
11826 return 0;
11827 }
11828 }
11829
11830   /* Allow SImode subregs of DImode addresses;
11831      they will be emitted with the addr32 prefix.  */
11832 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11833 {
11834 if (GET_CODE (addr) == SUBREG
11835 && GET_MODE (SUBREG_REG (addr)) == DImode)
11836 {
11837 addr = SUBREG_REG (addr);
11838 if (CONST_INT_P (addr))
11839 return 0;
11840 }
11841 }
11842
11843 if (REG_P (addr))
11844 base = addr;
11845 else if (GET_CODE (addr) == SUBREG)
11846 {
11847 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11848 base = addr;
11849 else
11850 return 0;
11851 }
11852 else if (GET_CODE (addr) == PLUS)
11853 {
11854 rtx addends[4], op;
11855 int n = 0, i;
11856
11857 op = addr;
11858 do
11859 {
11860 if (n >= 4)
11861 return 0;
11862 addends[n++] = XEXP (op, 1);
11863 op = XEXP (op, 0);
11864 }
11865 while (GET_CODE (op) == PLUS);
11866 if (n >= 4)
11867 return 0;
11868 addends[n] = op;
11869
11870 for (i = n; i >= 0; --i)
11871 {
11872 op = addends[i];
11873 switch (GET_CODE (op))
11874 {
11875 case MULT:
11876 if (index)
11877 return 0;
11878 index = XEXP (op, 0);
11879 scale_rtx = XEXP (op, 1);
11880 break;
11881
11882 case ASHIFT:
11883 if (index)
11884 return 0;
11885 index = XEXP (op, 0);
11886 tmp = XEXP (op, 1);
11887 if (!CONST_INT_P (tmp))
11888 return 0;
11889 scale = INTVAL (tmp);
11890 if ((unsigned HOST_WIDE_INT) scale > 3)
11891 return 0;
11892 scale = 1 << scale;
11893 break;
11894
11895 case ZERO_EXTEND:
11896 op = XEXP (op, 0);
11897 if (GET_CODE (op) != UNSPEC)
11898 return 0;
11899 /* FALLTHRU */
11900
11901 case UNSPEC:
11902 if (XINT (op, 1) == UNSPEC_TP
11903 && TARGET_TLS_DIRECT_SEG_REFS
11904 && seg == SEG_DEFAULT)
11905 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11906 else
11907 return 0;
11908 break;
11909
11910 case SUBREG:
11911 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11912 return 0;
11913 /* FALLTHRU */
11914
11915 case REG:
11916 if (!base)
11917 base = op;
11918 else if (!index)
11919 index = op;
11920 else
11921 return 0;
11922 break;
11923
11924 case CONST:
11925 case CONST_INT:
11926 case SYMBOL_REF:
11927 case LABEL_REF:
11928 if (disp)
11929 return 0;
11930 disp = op;
11931 break;
11932
11933 default:
11934 return 0;
11935 }
11936 }
11937 }
11938 else if (GET_CODE (addr) == MULT)
11939 {
11940 index = XEXP (addr, 0); /* index*scale */
11941 scale_rtx = XEXP (addr, 1);
11942 }
11943 else if (GET_CODE (addr) == ASHIFT)
11944 {
11945 /* We're called for lea too, which implements ashift on occasion. */
11946 index = XEXP (addr, 0);
11947 tmp = XEXP (addr, 1);
11948 if (!CONST_INT_P (tmp))
11949 return 0;
11950 scale = INTVAL (tmp);
11951 if ((unsigned HOST_WIDE_INT) scale > 3)
11952 return 0;
11953 scale = 1 << scale;
11954 retval = -1;
11955 }
11956 else if (CONST_INT_P (addr))
11957 {
11958 if (!x86_64_immediate_operand (addr, VOIDmode))
11959 return 0;
11960
11961       /* Constant addresses are sign extended to 64 bits, so we have to
11962 	 prevent addresses from 0x80000000 to 0xffffffff in x32 mode.  */
11963 if (TARGET_X32
11964 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11965 return 0;
11966
11967 disp = addr;
11968 }
11969 else
11970 disp = addr; /* displacement */
11971
11972 if (index)
11973 {
11974 if (REG_P (index))
11975 ;
11976 else if (GET_CODE (index) == SUBREG
11977 && ix86_address_subreg_operand (SUBREG_REG (index)))
11978 ;
11979 else
11980 return 0;
11981 }
11982
11983 /* Address override works only on the (%reg) part of %fs:(%reg). */
11984 if (seg != SEG_DEFAULT
11985 && ((base && GET_MODE (base) != word_mode)
11986 || (index && GET_MODE (index) != word_mode)))
11987 return 0;
11988
11989 /* Extract the integral value of scale. */
11990 if (scale_rtx)
11991 {
11992 if (!CONST_INT_P (scale_rtx))
11993 return 0;
11994 scale = INTVAL (scale_rtx);
11995 }
11996
11997 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11998 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11999
12000 /* Avoid useless 0 displacement. */
12001 if (disp == const0_rtx && (base || index))
12002 disp = NULL_RTX;
12003
12004   /* Allow arg pointer and stack pointer as index if there is no scaling.  */
12005 if (base_reg && index_reg && scale == 1
12006 && (index_reg == arg_pointer_rtx
12007 || index_reg == frame_pointer_rtx
12008 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12009 {
12010 rtx tmp;
12011 tmp = base, base = index, index = tmp;
12012 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12013 }
12014
12015 /* Special case: %ebp cannot be encoded as a base without a displacement.
12016 Similarly %r13. */
12017 if (!disp
12018 && base_reg
12019 && (base_reg == hard_frame_pointer_rtx
12020 || base_reg == frame_pointer_rtx
12021 || base_reg == arg_pointer_rtx
12022 || (REG_P (base_reg)
12023 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12024 || REGNO (base_reg) == R13_REG))))
12025 disp = const0_rtx;
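  /* (Encoding background: ModR/M mod 00 with base 101 means "disp32, no
     base", so a plain (%ebp) or (%r13) is not representable; forcing a
     zero displacement lets it be emitted as 0(%ebp), encoded with
     mod 01 and a one-byte displacement.)  */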
12026
12027   /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
12028 Avoid this by transforming to [%esi+0].
12029 Reload calls address legitimization without cfun defined, so we need
12030 to test cfun for being non-NULL. */
12031 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12032 && base_reg && !index_reg && !disp
12033 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12034 disp = const0_rtx;
12035
12036 /* Special case: encode reg+reg instead of reg*2. */
12037 if (!base && index && scale == 2)
12038 base = index, base_reg = index_reg, scale = 1;
12039
12040 /* Special case: scaling cannot be encoded without base or displacement. */
12041 if (!base && !disp && index && scale != 1)
12042 disp = const0_rtx;
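  /* (E.g. a bare (,%eax,4) cannot be encoded; with the zero displacement
     it becomes 0(,%eax,4), using the SIB "no base" form with a disp32.)  */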
12043
12044 out->base = base;
12045 out->index = index;
12046 out->disp = disp;
12047 out->scale = scale;
12048 out->seg = seg;
12049
12050 return retval;
12051 }
12052 \f
12053 /* Return cost of the memory address x.
12054 For i386, it is better to use a complex address than let gcc copy
12055 the address into a reg and make a new pseudo. But not if the address
12056    requires two regs - that would mean more pseudos with longer
12057 lifetimes. */
12058 static int
12059 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12060 addr_space_t as ATTRIBUTE_UNUSED,
12061 bool speed ATTRIBUTE_UNUSED)
12062 {
12063 struct ix86_address parts;
12064 int cost = 1;
12065 int ok = ix86_decompose_address (x, &parts);
12066
12067 gcc_assert (ok);
12068
12069 if (parts.base && GET_CODE (parts.base) == SUBREG)
12070 parts.base = SUBREG_REG (parts.base);
12071 if (parts.index && GET_CODE (parts.index) == SUBREG)
12072 parts.index = SUBREG_REG (parts.index);
12073
12074 /* Attempt to minimize number of registers in the address. */
12075 if ((parts.base
12076 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12077 || (parts.index
12078 && (!REG_P (parts.index)
12079 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12080 cost++;
12081
12082 if (parts.base
12083 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12084 && parts.index
12085 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12086 && parts.base != parts.index)
12087 cost++;
12088
12089 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12090 since its predecode logic can't detect the length of instructions
12091 and they degenerate to vector decoding. Increase cost of such
12092 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12093 to split such addresses or even refuse such addresses at all.
12094
12095 Following addressing modes are affected:
12096 [base+scale*index]
12097 [scale*index+disp]
12098 [base+index]
12099
12100 The first and last case may be avoidable by explicitly coding the zero in
12101 the memory address, but I don't have an AMD-K6 machine handy to check this
12102 theory. */
12103
12104 if (TARGET_K6
12105 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12106 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12107 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12108 cost += 10;
12109
12110 return cost;
12111 }
12112 \f
12113 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12114 this is used to form addresses to local data when -fPIC is in
12115 use. */
12116
12117 static bool
12118 darwin_local_data_pic (rtx disp)
12119 {
12120 return (GET_CODE (disp) == UNSPEC
12121 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12122 }
12123
12124 /* Determine if a given RTX is a valid constant. We already know this
12125 satisfies CONSTANT_P. */
12126
12127 static bool
12128 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12129 {
12130 switch (GET_CODE (x))
12131 {
12132 case CONST:
12133 x = XEXP (x, 0);
12134
12135 if (GET_CODE (x) == PLUS)
12136 {
12137 if (!CONST_INT_P (XEXP (x, 1)))
12138 return false;
12139 x = XEXP (x, 0);
12140 }
12141
12142 if (TARGET_MACHO && darwin_local_data_pic (x))
12143 return true;
12144
12145 /* Only some unspecs are valid as "constants". */
12146 if (GET_CODE (x) == UNSPEC)
12147 switch (XINT (x, 1))
12148 {
12149 case UNSPEC_GOT:
12150 case UNSPEC_GOTOFF:
12151 case UNSPEC_PLTOFF:
12152 return TARGET_64BIT;
12153 case UNSPEC_TPOFF:
12154 case UNSPEC_NTPOFF:
12155 x = XVECEXP (x, 0, 0);
12156 return (GET_CODE (x) == SYMBOL_REF
12157 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12158 case UNSPEC_DTPOFF:
12159 x = XVECEXP (x, 0, 0);
12160 return (GET_CODE (x) == SYMBOL_REF
12161 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12162 default:
12163 return false;
12164 }
12165
12166 /* We must have drilled down to a symbol. */
12167 if (GET_CODE (x) == LABEL_REF)
12168 return true;
12169 if (GET_CODE (x) != SYMBOL_REF)
12170 return false;
12171 /* FALLTHRU */
12172
12173 case SYMBOL_REF:
12174 /* TLS symbols are never valid. */
12175 if (SYMBOL_REF_TLS_MODEL (x))
12176 return false;
12177
12178 /* DLLIMPORT symbols are never valid. */
12179 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12180 && SYMBOL_REF_DLLIMPORT_P (x))
12181 return false;
12182
12183 #if TARGET_MACHO
12184 /* mdynamic-no-pic */
12185 if (MACHO_DYNAMIC_NO_PIC_P)
12186 return machopic_symbol_defined_p (x);
12187 #endif
12188 break;
12189
12190 case CONST_DOUBLE:
12191 if (GET_MODE (x) == TImode
12192 && x != CONST0_RTX (TImode)
12193 && !TARGET_64BIT)
12194 return false;
12195 break;
12196
12197 case CONST_VECTOR:
12198 if (!standard_sse_constant_p (x))
12199 return false;
12200
12201 default:
12202 break;
12203 }
12204
12205 /* Otherwise we handle everything else in the move patterns. */
12206 return true;
12207 }
12208
12209 /* Determine if it's legal to put X into the constant pool. This
12210 is not possible for the address of thread-local symbols, which
12211 is checked above. */
12212
12213 static bool
12214 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12215 {
12216 /* We can always put integral constants and vectors in memory. */
12217 switch (GET_CODE (x))
12218 {
12219 case CONST_INT:
12220 case CONST_DOUBLE:
12221 case CONST_VECTOR:
12222 return false;
12223
12224 default:
12225 break;
12226 }
12227 return !ix86_legitimate_constant_p (mode, x);
12228 }
12229
12230
12231 /* Nonzero if the constant value X is a legitimate general operand
12232 when generating PIC code. It is given that flag_pic is on and
12233 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12234
12235 bool
12236 legitimate_pic_operand_p (rtx x)
12237 {
12238 rtx inner;
12239
12240 switch (GET_CODE (x))
12241 {
12242 case CONST:
12243 inner = XEXP (x, 0);
12244 if (GET_CODE (inner) == PLUS
12245 && CONST_INT_P (XEXP (inner, 1)))
12246 inner = XEXP (inner, 0);
12247
12248 /* Only some unspecs are valid as "constants". */
12249 if (GET_CODE (inner) == UNSPEC)
12250 switch (XINT (inner, 1))
12251 {
12252 case UNSPEC_GOT:
12253 case UNSPEC_GOTOFF:
12254 case UNSPEC_PLTOFF:
12255 return TARGET_64BIT;
12256 case UNSPEC_TPOFF:
12257 x = XVECEXP (inner, 0, 0);
12258 return (GET_CODE (x) == SYMBOL_REF
12259 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12260 case UNSPEC_MACHOPIC_OFFSET:
12261 return legitimate_pic_address_disp_p (x);
12262 default:
12263 return false;
12264 }
12265 /* FALLTHRU */
12266
12267 case SYMBOL_REF:
12268 case LABEL_REF:
12269 return legitimate_pic_address_disp_p (x);
12270
12271 default:
12272 return true;
12273 }
12274 }
12275
12276 /* Determine if a given CONST RTX is a valid memory displacement
12277 in PIC mode. */
12278
12279 bool
12280 legitimate_pic_address_disp_p (rtx disp)
12281 {
12282 bool saw_plus;
12283
12284 /* In 64bit mode we can allow direct addresses of symbols and labels
12285 when they are not dynamic symbols. */
12286 if (TARGET_64BIT)
12287 {
12288 rtx op0 = disp, op1;
12289
12290 switch (GET_CODE (disp))
12291 {
12292 case LABEL_REF:
12293 return true;
12294
12295 case CONST:
12296 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12297 break;
12298 op0 = XEXP (XEXP (disp, 0), 0);
12299 op1 = XEXP (XEXP (disp, 0), 1);
12300 if (!CONST_INT_P (op1)
12301 || INTVAL (op1) >= 16*1024*1024
12302 || INTVAL (op1) < -16*1024*1024)
12303 break;
12304 if (GET_CODE (op0) == LABEL_REF)
12305 return true;
12306 if (GET_CODE (op0) == CONST
12307 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12308 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12309 return true;
12310 if (GET_CODE (op0) == UNSPEC
12311 && XINT (op0, 1) == UNSPEC_PCREL)
12312 return true;
12313 if (GET_CODE (op0) != SYMBOL_REF)
12314 break;
12315 /* FALLTHRU */
12316
12317 case SYMBOL_REF:
12318 /* TLS references should always be enclosed in UNSPEC. */
12319 if (SYMBOL_REF_TLS_MODEL (op0))
12320 return false;
12321 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12322 && ix86_cmodel != CM_LARGE_PIC)
12323 return true;
12324 break;
12325
12326 default:
12327 break;
12328 }
12329 }
12330 if (GET_CODE (disp) != CONST)
12331 return false;
12332 disp = XEXP (disp, 0);
12333
12334 if (TARGET_64BIT)
12335 {
12336 /* It is unsafe to allow PLUS expressions. This would limit the allowed
12337 distance of GOT tables. We should not need these anyway. */
12338 if (GET_CODE (disp) != UNSPEC
12339 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12340 && XINT (disp, 1) != UNSPEC_GOTOFF
12341 && XINT (disp, 1) != UNSPEC_PCREL
12342 && XINT (disp, 1) != UNSPEC_PLTOFF))
12343 return false;
12344
12345 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12346 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12347 return false;
12348 return true;
12349 }
12350
12351 saw_plus = false;
12352 if (GET_CODE (disp) == PLUS)
12353 {
12354 if (!CONST_INT_P (XEXP (disp, 1)))
12355 return false;
12356 disp = XEXP (disp, 0);
12357 saw_plus = true;
12358 }
12359
12360 if (TARGET_MACHO && darwin_local_data_pic (disp))
12361 return true;
12362
12363 if (GET_CODE (disp) != UNSPEC)
12364 return false;
12365
12366 switch (XINT (disp, 1))
12367 {
12368 case UNSPEC_GOT:
12369 if (saw_plus)
12370 return false;
12371 /* We need to check for both symbols and labels because VxWorks loads
12372 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12373 details. */
12374 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12375 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12376 case UNSPEC_GOTOFF:
12377 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12378 While the ABI also specifies a 32bit relocation, we don't produce it in
12379 the small PIC model at all. */
12380 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12381 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12382 && !TARGET_64BIT)
12383 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12384 return false;
12385 case UNSPEC_GOTTPOFF:
12386 case UNSPEC_GOTNTPOFF:
12387 case UNSPEC_INDNTPOFF:
12388 if (saw_plus)
12389 return false;
12390 disp = XVECEXP (disp, 0, 0);
12391 return (GET_CODE (disp) == SYMBOL_REF
12392 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12393 case UNSPEC_NTPOFF:
12394 disp = XVECEXP (disp, 0, 0);
12395 return (GET_CODE (disp) == SYMBOL_REF
12396 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12397 case UNSPEC_DTPOFF:
12398 disp = XVECEXP (disp, 0, 0);
12399 return (GET_CODE (disp) == SYMBOL_REF
12400 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12401 }
12402
12403 return false;
12404 }
12405
12406 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12407 replace the input X, or the original X if no replacement is called for.
12408 The output parameter *WIN is 1 if the calling macro should goto WIN,
12409 0 if it should not. */
12410
12411 bool
12412 ix86_legitimize_reload_address (rtx x,
12413 enum machine_mode mode ATTRIBUTE_UNUSED,
12414 int opnum, int type,
12415 int ind_levels ATTRIBUTE_UNUSED)
12416 {
12417 /* Reload can generate:
12418
12419 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12420 (reg:DI 97))
12421 (reg:DI 2 cx))
12422
12423 This RTX is rejected by ix86_legitimate_address_p due to
12424 non-strictness of base register 97. After this rejection,
12425 reload pushes all three components into separate registers,
12426 creating an invalid memory address RTX.
12427
12428 The following code reloads only the invalid part of the
12429 memory address RTX. */
12430
12431 if (GET_CODE (x) == PLUS
12432 && REG_P (XEXP (x, 1))
12433 && GET_CODE (XEXP (x, 0)) == PLUS
12434 && REG_P (XEXP (XEXP (x, 0), 1)))
12435 {
12436 rtx base, index;
12437 bool something_reloaded = false;
12438
12439 base = XEXP (XEXP (x, 0), 1);
12440 if (!REG_OK_FOR_BASE_STRICT_P (base))
12441 {
12442 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12443 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12444 opnum, (enum reload_type) type);
12445 something_reloaded = true;
12446 }
12447
12448 index = XEXP (x, 1);
12449 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12450 {
12451 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12452 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12453 opnum, (enum reload_type) type);
12454 something_reloaded = true;
12455 }
12456
12457 gcc_assert (something_reloaded);
12458 return true;
12459 }
12460
12461 return false;
12462 }
12463
12464 /* Recognizes RTL expressions that are valid memory addresses for an
12465 instruction. The MODE argument is the machine mode for the MEM
12466 expression that wants to use this address.
12467
12468 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12469 convert common non-canonical forms to canonical form so that they will
12470 be recognized. */
12471
12472 static bool
12473 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12474 rtx addr, bool strict)
12475 {
12476 struct ix86_address parts;
12477 rtx base, index, disp;
12478 HOST_WIDE_INT scale;
12479
12480 if (ix86_decompose_address (addr, &parts) <= 0)
12481 /* Decomposition failed. */
12482 return false;
12483
12484 base = parts.base;
12485 index = parts.index;
12486 disp = parts.disp;
12487 scale = parts.scale;
12488
12489 /* Validate base register. */
12490 if (base)
12491 {
12492 rtx reg;
12493
12494 if (REG_P (base))
12495 reg = base;
12496 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12497 reg = SUBREG_REG (base);
12498 else
12499 /* Base is not a register. */
12500 return false;
12501
12502 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12503 return false;
12504
12505 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12506 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12507 /* Base is not valid. */
12508 return false;
12509 }
12510
12511 /* Validate index register. */
12512 if (index)
12513 {
12514 rtx reg;
12515
12516 if (REG_P (index))
12517 reg = index;
12518 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12519 reg = SUBREG_REG (index);
12520 else
12521 /* Index is not a register. */
12522 return false;
12523
12524 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12525 return false;
12526
12527 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12528 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12529 /* Index is not valid. */
12530 return false;
12531 }
12532
12533 /* Index and base should have the same mode. */
12534 if (base && index
12535 && GET_MODE (base) != GET_MODE (index))
12536 return false;
12537
12538 /* Validate scale factor. */
12539 if (scale != 1)
12540 {
12541 if (!index)
12542 /* Scale without index. */
12543 return false;
12544
12545 if (scale != 2 && scale != 4 && scale != 8)
12546 /* Scale is not a valid multiplier. */
12547 return false;
12548 }
12549
12550 /* Validate displacement. */
12551 if (disp)
12552 {
12553 if (GET_CODE (disp) == CONST
12554 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12555 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12556 switch (XINT (XEXP (disp, 0), 1))
12557 {
12558 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12559 used. While the ABI also specifies 32bit relocations, we don't produce
12560 them at all and use IP-relative addressing instead. */
12561 case UNSPEC_GOT:
12562 case UNSPEC_GOTOFF:
12563 gcc_assert (flag_pic);
12564 if (!TARGET_64BIT)
12565 goto is_legitimate_pic;
12566
12567 /* 64bit address unspec. */
12568 return false;
12569
12570 case UNSPEC_GOTPCREL:
12571 case UNSPEC_PCREL:
12572 gcc_assert (flag_pic);
12573 goto is_legitimate_pic;
12574
12575 case UNSPEC_GOTTPOFF:
12576 case UNSPEC_GOTNTPOFF:
12577 case UNSPEC_INDNTPOFF:
12578 case UNSPEC_NTPOFF:
12579 case UNSPEC_DTPOFF:
12580 break;
12581
12582 case UNSPEC_STACK_CHECK:
12583 gcc_assert (flag_split_stack);
12584 break;
12585
12586 default:
12587 /* Invalid address unspec. */
12588 return false;
12589 }
12590
12591 else if (SYMBOLIC_CONST (disp)
12592 && (flag_pic
12593 || (TARGET_MACHO
12594 #if TARGET_MACHO
12595 && MACHOPIC_INDIRECT
12596 && !machopic_operand_p (disp)
12597 #endif
12598 )))
12599 {
12600
12601 is_legitimate_pic:
12602 if (TARGET_64BIT && (index || base))
12603 {
12604 /* foo@dtpoff(%rX) is ok. */
12605 if (GET_CODE (disp) != CONST
12606 || GET_CODE (XEXP (disp, 0)) != PLUS
12607 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12608 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12609 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12610 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12611 /* Non-constant pic memory reference. */
12612 return false;
12613 }
12614 else if ((!TARGET_MACHO || flag_pic)
12615 && ! legitimate_pic_address_disp_p (disp))
12616 /* Displacement is an invalid pic construct. */
12617 return false;
12618 #if TARGET_MACHO
12619 else if (MACHO_DYNAMIC_NO_PIC_P
12620 && !ix86_legitimate_constant_p (Pmode, disp))
12621 /* Displacement must be referenced via non_lazy_pointer. */
12622 return false;
12623 #endif
12624
12625 /* This code used to verify that a symbolic pic displacement
12626 includes the pic_offset_table_rtx register.
12627
12628 While this is a good idea, unfortunately these constructs may
12629 be created by the "adds using lea" optimization for incorrect
12630 code like:
12631
12632 int a;
12633 int foo(int i)
12634 {
12635 return *(&a+i);
12636 }
12637
12638 This code is nonsensical, but results in addressing the
12639 GOT table with a pic_offset_table_rtx base. We can't
12640 just refuse it easily, since it gets matched by the
12641 "addsi3" pattern, which later gets split to lea when the
12642 output register differs from the input. While this
12643 could be handled by a separate addsi pattern for this case
12644 that never results in lea, disabling this test seems to be
12645 an easier and correct fix for the crash. */
12646 }
12647 else if (GET_CODE (disp) != LABEL_REF
12648 && !CONST_INT_P (disp)
12649 && (GET_CODE (disp) != CONST
12650 || !ix86_legitimate_constant_p (Pmode, disp))
12651 && (GET_CODE (disp) != SYMBOL_REF
12652 || !ix86_legitimate_constant_p (Pmode, disp)))
12653 /* Displacement is not constant. */
12654 return false;
12655 else if (TARGET_64BIT
12656 && !x86_64_immediate_operand (disp, VOIDmode))
12657 /* Displacement is out of range. */
12658 return false;
12659 }
12660
12661 /* Everything looks valid. */
12662 return true;
12663 }
12664
12665 /* Determine if a given RTX is a valid constant address. */
12666
12667 bool
12668 constant_address_p (rtx x)
12669 {
12670 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12671 }
12672 \f
12673 /* Return a unique alias set for the GOT. */
12674
12675 static alias_set_type
12676 ix86_GOT_alias_set (void)
12677 {
12678 static alias_set_type set = -1;
12679 if (set == -1)
12680 set = new_alias_set ();
12681 return set;
12682 }
12683
12684 /* Return a legitimate reference for ORIG (an address) using the
12685 register REG. If REG is 0, a new pseudo is generated.
12686
12687 There are two types of references that must be handled:
12688
12689 1. Global data references must load the address from the GOT, via
12690 the PIC reg. An insn is emitted to do this load, and the reg is
12691 returned.
12692
12693 2. Static data references, constant pool addresses, and code labels
12694 compute the address as an offset from the GOT, whose base is in
12695 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12696 differentiate them from global data objects. The returned
12697 address is the PIC reg + an unspec constant.
12698
12699 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12700 reg also appears in the address. */
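/* For illustration, a sketch of the two 32-bit forms built below (the
   exact RTL varies with the target and code model):

     global "x":  (mem (plus (reg pic)
                             (const (unspec [(symbol_ref "x")] UNSPEC_GOT))))
     local "y":   (plus (reg pic)
                        (const (unspec [(symbol_ref "y")] UNSPEC_GOTOFF)))  */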
12701
12702 static rtx
12703 legitimize_pic_address (rtx orig, rtx reg)
12704 {
12705 rtx addr = orig;
12706 rtx new_rtx = orig;
12707 rtx base;
12708
12709 #if TARGET_MACHO
12710 if (TARGET_MACHO && !TARGET_64BIT)
12711 {
12712 if (reg == 0)
12713 reg = gen_reg_rtx (Pmode);
12714 /* Use the generic Mach-O PIC machinery. */
12715 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12716 }
12717 #endif
12718
12719 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12720 new_rtx = addr;
12721 else if (TARGET_64BIT
12722 && ix86_cmodel != CM_SMALL_PIC
12723 && gotoff_operand (addr, Pmode))
12724 {
12725 rtx tmpreg;
12726 /* This symbol may be referenced via a displacement from the PIC
12727 base address (@GOTOFF). */
12728
12729 if (reload_in_progress)
12730 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12731 if (GET_CODE (addr) == CONST)
12732 addr = XEXP (addr, 0);
12733 if (GET_CODE (addr) == PLUS)
12734 {
12735 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12736 UNSPEC_GOTOFF);
12737 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12738 }
12739 else
12740 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12741 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12742 if (!reg)
12743 tmpreg = gen_reg_rtx (Pmode);
12744 else
12745 tmpreg = reg;
12746 emit_move_insn (tmpreg, new_rtx);
12747
12748 if (reg != 0)
12749 {
12750 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12751 tmpreg, 1, OPTAB_DIRECT);
12752 new_rtx = reg;
12753 }
12754 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12755 }
12756 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12757 {
12758 /* This symbol may be referenced via a displacement from the PIC
12759 base address (@GOTOFF). */
12760
12761 if (reload_in_progress)
12762 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12763 if (GET_CODE (addr) == CONST)
12764 addr = XEXP (addr, 0);
12765 if (GET_CODE (addr) == PLUS)
12766 {
12767 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12768 UNSPEC_GOTOFF);
12769 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12770 }
12771 else
12772 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12773 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12774 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12775
12776 if (reg != 0)
12777 {
12778 emit_move_insn (reg, new_rtx);
12779 new_rtx = reg;
12780 }
12781 }
12782 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12783 /* We can't use @GOTOFF for text labels on VxWorks;
12784 see gotoff_operand. */
12785 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12786 {
12787 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12788 {
12789 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12790 return legitimize_dllimport_symbol (addr, true);
12791 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12792 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12793 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12794 {
12795 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12796 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12797 }
12798 }
12799
12800 /* For x64 PE-COFF there is no GOT table. So we use the address
12801 directly. */
12802 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12803 {
12804 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12805 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12806
12807 if (reg == 0)
12808 reg = gen_reg_rtx (Pmode);
12809 emit_move_insn (reg, new_rtx);
12810 new_rtx = reg;
12811 }
12812 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12813 {
12814 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12815 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12816 new_rtx = gen_const_mem (Pmode, new_rtx);
12817 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12818
12819 if (reg == 0)
12820 reg = gen_reg_rtx (Pmode);
12821 /* Use gen_movsi directly, otherwise the address is loaded
12822 into a register for CSE. We don't want to CSE these addresses;
12823 instead we CSE addresses from the GOT table, so skip this. */
12824 emit_insn (gen_movsi (reg, new_rtx));
12825 new_rtx = reg;
12826 }
12827 else
12828 {
12829 /* This symbol must be referenced via a load from the
12830 Global Offset Table (@GOT). */
12831
12832 if (reload_in_progress)
12833 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12834 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12835 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12836 if (TARGET_64BIT)
12837 new_rtx = force_reg (Pmode, new_rtx);
12838 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12839 new_rtx = gen_const_mem (Pmode, new_rtx);
12840 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12841
12842 if (reg == 0)
12843 reg = gen_reg_rtx (Pmode);
12844 emit_move_insn (reg, new_rtx);
12845 new_rtx = reg;
12846 }
12847 }
12848 else
12849 {
12850 if (CONST_INT_P (addr)
12851 && !x86_64_immediate_operand (addr, VOIDmode))
12852 {
12853 if (reg)
12854 {
12855 emit_move_insn (reg, addr);
12856 new_rtx = reg;
12857 }
12858 else
12859 new_rtx = force_reg (Pmode, addr);
12860 }
12861 else if (GET_CODE (addr) == CONST)
12862 {
12863 addr = XEXP (addr, 0);
12864
12865 /* We must match stuff we generated before. Assume the only
12866 unspecs that can get here are ours. Not that we could do
12867 anything with them anyway.... */
12868 if (GET_CODE (addr) == UNSPEC
12869 || (GET_CODE (addr) == PLUS
12870 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12871 return orig;
12872 gcc_assert (GET_CODE (addr) == PLUS);
12873 }
12874 if (GET_CODE (addr) == PLUS)
12875 {
12876 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12877
12878 /* Check first to see if this is a constant offset from a @GOTOFF
12879 symbol reference. */
12880 if (gotoff_operand (op0, Pmode)
12881 && CONST_INT_P (op1))
12882 {
12883 if (!TARGET_64BIT)
12884 {
12885 if (reload_in_progress)
12886 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12887 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12888 UNSPEC_GOTOFF);
12889 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12890 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12891 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12892
12893 if (reg != 0)
12894 {
12895 emit_move_insn (reg, new_rtx);
12896 new_rtx = reg;
12897 }
12898 }
12899 else
12900 {
12901 if (INTVAL (op1) < -16*1024*1024
12902 || INTVAL (op1) >= 16*1024*1024)
12903 {
12904 if (!x86_64_immediate_operand (op1, Pmode))
12905 op1 = force_reg (Pmode, op1);
12906 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12907 }
12908 }
12909 }
12910 else
12911 {
12912 base = legitimize_pic_address (XEXP (addr, 0), reg);
12913 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12914 base == reg ? NULL_RTX : reg);
12915
12916 if (CONST_INT_P (new_rtx))
12917 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12918 else
12919 {
12920 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12921 {
12922 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12923 new_rtx = XEXP (new_rtx, 1);
12924 }
12925 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12926 }
12927 }
12928 }
12929 }
12930 return new_rtx;
12931 }
12932 \f
12933 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12934
12935 static rtx
12936 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12937 {
12938 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12939
12940 if (GET_MODE (tp) != tp_mode)
12941 {
12942 gcc_assert (GET_MODE (tp) == SImode);
12943 gcc_assert (tp_mode == DImode);
12944
12945 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12946 }
12947
12948 if (to_reg)
12949 tp = copy_to_mode_reg (tp_mode, tp);
12950
12951 return tp;
12952 }
12953
12954 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12955
12956 static GTY(()) rtx ix86_tls_symbol;
12957
12958 static rtx
12959 ix86_tls_get_addr (void)
12960 {
12961 if (!ix86_tls_symbol)
12962 {
12963 const char *sym
12964 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12965 ? "___tls_get_addr" : "__tls_get_addr");
12966
12967 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12968 }
12969
12970 return ix86_tls_symbol;
12971 }
12972
12973 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12974
12975 static GTY(()) rtx ix86_tls_module_base_symbol;
12976
12977 rtx
12978 ix86_tls_module_base (void)
12979 {
12980 if (!ix86_tls_module_base_symbol)
12981 {
12982 ix86_tls_module_base_symbol
12983 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12984
12985 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12986 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12987 }
12988
12989 return ix86_tls_module_base_symbol;
12990 }
12991
12992 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12993 false if we expect this to be used for a memory address and true if
12994 we expect to load the address into a register. */
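/* For example (a sketch of the 64-bit local-exec case, assuming
   TARGET_TLS_DIRECT_SEG_REFS): a thread-local variable "t" becomes the
   address
     (plus (unspec [(const_int 0)] UNSPEC_TP)
           (const (unspec [(symbol_ref "t")] UNSPEC_NTPOFF)))
   which the output machinery prints as %fs:t@tpoff.  */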
12995
12996 static rtx
12997 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12998 {
12999 rtx dest, base, off;
13000 rtx pic = NULL_RTX, tp = NULL_RTX;
13001 enum machine_mode tp_mode = Pmode;
13002 int type;
13003
13004 switch (model)
13005 {
13006 case TLS_MODEL_GLOBAL_DYNAMIC:
13007 dest = gen_reg_rtx (Pmode);
13008
13009 if (!TARGET_64BIT)
13010 {
13011 if (flag_pic)
13012 pic = pic_offset_table_rtx;
13013 else
13014 {
13015 pic = gen_reg_rtx (Pmode);
13016 emit_insn (gen_set_got (pic));
13017 }
13018 }
13019
13020 if (TARGET_GNU2_TLS)
13021 {
13022 if (TARGET_64BIT)
13023 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13024 else
13025 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13026
13027 tp = get_thread_pointer (Pmode, true);
13028 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13029
13030 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13031 }
13032 else
13033 {
13034 rtx caddr = ix86_tls_get_addr ();
13035
13036 if (TARGET_64BIT)
13037 {
13038 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
13039
13040 start_sequence ();
13041 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
13042 caddr));
13043 insns = get_insns ();
13044 end_sequence ();
13045
13046 RTL_CONST_CALL_P (insns) = 1;
13047 emit_libcall_block (insns, dest, rax, x);
13048 }
13049 else
13050 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13051 }
13052 break;
13053
13054 case TLS_MODEL_LOCAL_DYNAMIC:
13055 base = gen_reg_rtx (Pmode);
13056
13057 if (!TARGET_64BIT)
13058 {
13059 if (flag_pic)
13060 pic = pic_offset_table_rtx;
13061 else
13062 {
13063 pic = gen_reg_rtx (Pmode);
13064 emit_insn (gen_set_got (pic));
13065 }
13066 }
13067
13068 if (TARGET_GNU2_TLS)
13069 {
13070 rtx tmp = ix86_tls_module_base ();
13071
13072 if (TARGET_64BIT)
13073 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13074 else
13075 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13076
13077 tp = get_thread_pointer (Pmode, true);
13078 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13079 gen_rtx_MINUS (Pmode, tmp, tp));
13080 }
13081 else
13082 {
13083 rtx caddr = ix86_tls_get_addr ();
13084
13085 if (TARGET_64BIT)
13086 {
13087 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
13088
13089 start_sequence ();
13090 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
13091 caddr));
13092 insns = get_insns ();
13093 end_sequence ();
13094
13095 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13096 share the LD_BASE result with other LD model accesses. */
13097 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13098 UNSPEC_TLS_LD_BASE);
13099
13100 RTL_CONST_CALL_P (insns) = 1;
13101 emit_libcall_block (insns, base, rax, eqv);
13102 }
13103 else
13104 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13105 }
13106
13107 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13108 off = gen_rtx_CONST (Pmode, off);
13109
13110 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13111
13112 if (TARGET_GNU2_TLS)
13113 {
13114 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13115
13116 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13117 }
13118 break;
13119
13120 case TLS_MODEL_INITIAL_EXEC:
13121 if (TARGET_64BIT)
13122 {
13123 if (TARGET_SUN_TLS && !TARGET_X32)
13124 {
13125 /* The Sun linker took the AMD64 TLS spec literally
13126 and can only handle %rax as the destination of the
13127 initial executable code sequence. */
13128
13129 dest = gen_reg_rtx (DImode);
13130 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13131 return dest;
13132 }
13133
13134 /* Generate DImode references to avoid %fs:(%reg32)
13135 problems and the linker IE->LE relaxation bug. */
13136 tp_mode = DImode;
13137 pic = NULL;
13138 type = UNSPEC_GOTNTPOFF;
13139 }
13140 else if (flag_pic)
13141 {
13142 if (reload_in_progress)
13143 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13144 pic = pic_offset_table_rtx;
13145 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13146 }
13147 else if (!TARGET_ANY_GNU_TLS)
13148 {
13149 pic = gen_reg_rtx (Pmode);
13150 emit_insn (gen_set_got (pic));
13151 type = UNSPEC_GOTTPOFF;
13152 }
13153 else
13154 {
13155 pic = NULL;
13156 type = UNSPEC_INDNTPOFF;
13157 }
13158
13159 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13160 off = gen_rtx_CONST (tp_mode, off);
13161 if (pic)
13162 off = gen_rtx_PLUS (tp_mode, pic, off);
13163 off = gen_const_mem (tp_mode, off);
13164 set_mem_alias_set (off, ix86_GOT_alias_set ());
13165
13166 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13167 {
13168 base = get_thread_pointer (tp_mode,
13169 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13170 off = force_reg (tp_mode, off);
13171 return gen_rtx_PLUS (tp_mode, base, off);
13172 }
13173 else
13174 {
13175 base = get_thread_pointer (Pmode, true);
13176 dest = gen_reg_rtx (Pmode);
13177 emit_insn (ix86_gen_sub3 (dest, base, off));
13178 }
13179 break;
13180
13181 case TLS_MODEL_LOCAL_EXEC:
13182 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13183 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13184 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13185 off = gen_rtx_CONST (Pmode, off);
13186
13187 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13188 {
13189 base = get_thread_pointer (Pmode,
13190 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13191 return gen_rtx_PLUS (Pmode, base, off);
13192 }
13193 else
13194 {
13195 base = get_thread_pointer (Pmode, true);
13196 dest = gen_reg_rtx (Pmode);
13197 emit_insn (ix86_gen_sub3 (dest, base, off));
13198 }
13199 break;
13200
13201 default:
13202 gcc_unreachable ();
13203 }
13204
13205 return dest;
13206 }
13207
13208 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13209 to symbol DECL. */
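/* (Illustrative sketch: for a dllimport'ed symbol "foo" this builds an
   artificial read-only VAR_DECL whose DECL_RTL is a load through the
   import slot, e.g. (mem (symbol_ref "*__imp__foo")) when
   user_label_prefix is "_", or "*__imp_foo" when it is empty.)  */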
13210
13211 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13212 htab_t dllimport_map;
13213
13214 static tree
13215 get_dllimport_decl (tree decl)
13216 {
13217 struct tree_map *h, in;
13218 void **loc;
13219 const char *name;
13220 const char *prefix;
13221 size_t namelen, prefixlen;
13222 char *imp_name;
13223 tree to;
13224 rtx rtl;
13225
13226 if (!dllimport_map)
13227 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13228
13229 in.hash = htab_hash_pointer (decl);
13230 in.base.from = decl;
13231 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13232 h = (struct tree_map *) *loc;
13233 if (h)
13234 return h->to;
13235
13236 *loc = h = ggc_alloc_tree_map ();
13237 h->hash = in.hash;
13238 h->base.from = decl;
13239 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13240 VAR_DECL, NULL, ptr_type_node);
13241 DECL_ARTIFICIAL (to) = 1;
13242 DECL_IGNORED_P (to) = 1;
13243 DECL_EXTERNAL (to) = 1;
13244 TREE_READONLY (to) = 1;
13245
13246 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13247 name = targetm.strip_name_encoding (name);
13248 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13249 ? "*__imp_" : "*__imp__";
13250 namelen = strlen (name);
13251 prefixlen = strlen (prefix);
13252 imp_name = (char *) alloca (namelen + prefixlen + 1);
13253 memcpy (imp_name, prefix, prefixlen);
13254 memcpy (imp_name + prefixlen, name, namelen + 1);
13255
13256 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13257 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13258 SET_SYMBOL_REF_DECL (rtl, to);
13259 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13260
13261 rtl = gen_const_mem (Pmode, rtl);
13262 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13263
13264 SET_DECL_RTL (to, rtl);
13265 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13266
13267 return to;
13268 }
13269
13270 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13271 true if we require the result be a register. */
13272
13273 static rtx
13274 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13275 {
13276 tree imp_decl;
13277 rtx x;
13278
13279 gcc_assert (SYMBOL_REF_DECL (symbol));
13280 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13281
13282 x = DECL_RTL (imp_decl);
13283 if (want_reg)
13284 x = force_reg (Pmode, x);
13285 return x;
13286 }
13287
13288 /* Try machine-dependent ways of modifying an illegitimate address
13289 to be legitimate. If we find one, return the new, valid address.
13290 This macro is used in only one place: `memory_address' in explow.c.
13291
13292 OLDX is the address as it was before break_out_memory_refs was called.
13293 In some cases it is useful to look at this to decide what needs to be done.
13294
13295 It is always safe for this macro to do nothing. It exists to recognize
13296 opportunities to optimize the output.
13297
13298 For the 80386, we handle X+REG by loading X into a register R and
13299 using R+REG. R will go in a general reg and indexing will be used.
13300 However, if REG is a broken-out memory address or multiplication,
13301 nothing needs to be done because REG can certainly go in a general reg.
13302
13303 When -fpic is used, special handling is needed for symbolic references.
13304 See comments by legitimize_pic_address in i386.c for details. */
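/* For example (a sketch of one canonicalization performed below): an
   address like (plus (ashift (reg) (const_int 2)) (reg)) is rewritten as
   (plus (mult (reg) (const_int 4)) (reg)), the scaled-index form that
   ix86_decompose_address recognizes.  */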
13305
13306 static rtx
13307 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13308 enum machine_mode mode)
13309 {
13310 int changed = 0;
13311 unsigned log;
13312
13313 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13314 if (log)
13315 return legitimize_tls_address (x, (enum tls_model) log, false);
13316 if (GET_CODE (x) == CONST
13317 && GET_CODE (XEXP (x, 0)) == PLUS
13318 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13319 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13320 {
13321 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13322 (enum tls_model) log, false);
13323 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13324 }
13325
13326 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13327 {
13328 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13329 return legitimize_dllimport_symbol (x, true);
13330 if (GET_CODE (x) == CONST
13331 && GET_CODE (XEXP (x, 0)) == PLUS
13332 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13333 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13334 {
13335 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13336 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13337 }
13338 }
13339
13340 if (flag_pic && SYMBOLIC_CONST (x))
13341 return legitimize_pic_address (x, 0);
13342
13343 #if TARGET_MACHO
13344 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13345 return machopic_indirect_data_reference (x, 0);
13346 #endif
13347
13348 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13349 if (GET_CODE (x) == ASHIFT
13350 && CONST_INT_P (XEXP (x, 1))
13351 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13352 {
13353 changed = 1;
13354 log = INTVAL (XEXP (x, 1));
13355 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13356 GEN_INT (1 << log));
13357 }
13358
13359 if (GET_CODE (x) == PLUS)
13360 {
13361 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13362
13363 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13364 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13365 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13366 {
13367 changed = 1;
13368 log = INTVAL (XEXP (XEXP (x, 0), 1));
13369 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13370 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13371 GEN_INT (1 << log));
13372 }
13373
13374 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13375 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13376 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13377 {
13378 changed = 1;
13379 log = INTVAL (XEXP (XEXP (x, 1), 1));
13380 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13381 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13382 GEN_INT (1 << log));
13383 }
13384
13385 /* Put multiply first if it isn't already. */
13386 if (GET_CODE (XEXP (x, 1)) == MULT)
13387 {
13388 rtx tmp = XEXP (x, 0);
13389 XEXP (x, 0) = XEXP (x, 1);
13390 XEXP (x, 1) = tmp;
13391 changed = 1;
13392 }
13393
13394 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13395 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13396 created by virtual register instantiation, register elimination, and
13397 similar optimizations. */
13398 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13399 {
13400 changed = 1;
13401 x = gen_rtx_PLUS (Pmode,
13402 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13403 XEXP (XEXP (x, 1), 0)),
13404 XEXP (XEXP (x, 1), 1));
13405 }
13406
13407 /* Canonicalize
13408 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13409 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13410 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13411 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13412 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13413 && CONSTANT_P (XEXP (x, 1)))
13414 {
13415 rtx constant;
13416 rtx other = NULL_RTX;
13417
13418 if (CONST_INT_P (XEXP (x, 1)))
13419 {
13420 constant = XEXP (x, 1);
13421 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13422 }
13423 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13424 {
13425 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13426 other = XEXP (x, 1);
13427 }
13428 else
13429 constant = 0;
13430
13431 if (constant)
13432 {
13433 changed = 1;
13434 x = gen_rtx_PLUS (Pmode,
13435 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13436 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13437 plus_constant (Pmode, other,
13438 INTVAL (constant)));
13439 }
13440 }
13441
13442 if (changed && ix86_legitimate_address_p (mode, x, false))
13443 return x;
13444
13445 if (GET_CODE (XEXP (x, 0)) == MULT)
13446 {
13447 changed = 1;
13448 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13449 }
13450
13451 if (GET_CODE (XEXP (x, 1)) == MULT)
13452 {
13453 changed = 1;
13454 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13455 }
13456
13457 if (changed
13458 && REG_P (XEXP (x, 1))
13459 && REG_P (XEXP (x, 0)))
13460 return x;
13461
13462 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13463 {
13464 changed = 1;
13465 x = legitimize_pic_address (x, 0);
13466 }
13467
13468 if (changed && ix86_legitimate_address_p (mode, x, false))
13469 return x;
13470
13471 if (REG_P (XEXP (x, 0)))
13472 {
13473 rtx temp = gen_reg_rtx (Pmode);
13474 rtx val = force_operand (XEXP (x, 1), temp);
13475 if (val != temp)
13476 {
13477 if (GET_MODE (val) != Pmode)
13478 val = convert_to_mode (Pmode, val, 1);
13479 emit_move_insn (temp, val);
13480 }
13481
13482 XEXP (x, 1) = temp;
13483 return x;
13484 }
13485
13486 else if (REG_P (XEXP (x, 1)))
13487 {
13488 rtx temp = gen_reg_rtx (Pmode);
13489 rtx val = force_operand (XEXP (x, 0), temp);
13490 if (val != temp)
13491 {
13492 if (GET_MODE (val) != Pmode)
13493 val = convert_to_mode (Pmode, val, 1);
13494 emit_move_insn (temp, val);
13495 }
13496
13497 XEXP (x, 0) = temp;
13498 return x;
13499 }
13500 }
13501
13502 return x;
13503 }
13504 \f
13505 /* Print an integer constant expression in assembler syntax. Addition
13506 and subtraction are the only arithmetic that may appear in these
13507 expressions. FILE is the stdio stream to write to, X is the rtx, and
13508 CODE is the operand print code from the output string. */
13509
13510 static void
13511 output_pic_addr_const (FILE *file, rtx x, int code)
13512 {
13513 char buf[256];
13514
13515 switch (GET_CODE (x))
13516 {
13517 case PC:
13518 gcc_assert (flag_pic);
13519 putc ('.', file);
13520 break;
13521
13522 case SYMBOL_REF:
13523 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13524 output_addr_const (file, x);
13525 else
13526 {
13527 const char *name = XSTR (x, 0);
13528
13529 /* Mark the decl as referenced so that cgraph will
13530 output the function. */
13531 if (SYMBOL_REF_DECL (x))
13532 mark_decl_referenced (SYMBOL_REF_DECL (x));
13533
13534 #if TARGET_MACHO
13535 if (MACHOPIC_INDIRECT
13536 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13537 name = machopic_indirection_name (x, /*stub_p=*/true);
13538 #endif
13539 assemble_name (file, name);
13540 }
13541 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13542 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13543 fputs ("@PLT", file);
13544 break;
13545
13546 case LABEL_REF:
13547 x = XEXP (x, 0);
13548 /* FALLTHRU */
13549 case CODE_LABEL:
13550 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13551 assemble_name (asm_out_file, buf);
13552 break;
13553
13554 case CONST_INT:
13555 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13556 break;
13557
13558 case CONST:
13559 /* This used to output parentheses around the expression,
13560 but that does not work on the 386 (either ATT or BSD assembler). */
13561 output_pic_addr_const (file, XEXP (x, 0), code);
13562 break;
13563
13564 case CONST_DOUBLE:
13565 if (GET_MODE (x) == VOIDmode)
13566 {
13567 /* We can use %d if the number is <32 bits and positive. */
13568 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13569 fprintf (file, "0x%lx%08lx",
13570 (unsigned long) CONST_DOUBLE_HIGH (x),
13571 (unsigned long) CONST_DOUBLE_LOW (x));
13572 else
13573 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13574 }
13575 else
13576 /* We can't handle floating point constants;
13577 TARGET_PRINT_OPERAND must handle them. */
13578 output_operand_lossage ("floating constant misused");
13579 break;
13580
13581 case PLUS:
13582 /* Some assemblers need integer constants to appear first. */
13583 if (CONST_INT_P (XEXP (x, 0)))
13584 {
13585 output_pic_addr_const (file, XEXP (x, 0), code);
13586 putc ('+', file);
13587 output_pic_addr_const (file, XEXP (x, 1), code);
13588 }
13589 else
13590 {
13591 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13592 output_pic_addr_const (file, XEXP (x, 1), code);
13593 putc ('+', file);
13594 output_pic_addr_const (file, XEXP (x, 0), code);
13595 }
13596 break;
13597
13598 case MINUS:
13599 if (!TARGET_MACHO)
13600 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13601 output_pic_addr_const (file, XEXP (x, 0), code);
13602 putc ('-', file);
13603 output_pic_addr_const (file, XEXP (x, 1), code);
13604 if (!TARGET_MACHO)
13605 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13606 break;
13607
13608 case UNSPEC:
13609 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13610 {
13611 bool f = i386_asm_output_addr_const_extra (file, x);
13612 gcc_assert (f);
13613 break;
13614 }
13615
13616 gcc_assert (XVECLEN (x, 0) == 1);
13617 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13618 switch (XINT (x, 1))
13619 {
13620 case UNSPEC_GOT:
13621 fputs ("@GOT", file);
13622 break;
13623 case UNSPEC_GOTOFF:
13624 fputs ("@GOTOFF", file);
13625 break;
13626 case UNSPEC_PLTOFF:
13627 fputs ("@PLTOFF", file);
13628 break;
13629 case UNSPEC_PCREL:
13630 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13631 "(%rip)" : "[rip]", file);
13632 break;
13633 case UNSPEC_GOTPCREL:
13634 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13635 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13636 break;
13637 case UNSPEC_GOTTPOFF:
13638 /* FIXME: This might be @TPOFF in Sun ld too. */
13639 fputs ("@gottpoff", file);
13640 break;
13641 case UNSPEC_TPOFF:
13642 fputs ("@tpoff", file);
13643 break;
13644 case UNSPEC_NTPOFF:
13645 if (TARGET_64BIT)
13646 fputs ("@tpoff", file);
13647 else
13648 fputs ("@ntpoff", file);
13649 break;
13650 case UNSPEC_DTPOFF:
13651 fputs ("@dtpoff", file);
13652 break;
13653 case UNSPEC_GOTNTPOFF:
13654 if (TARGET_64BIT)
13655 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13656 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13657 else
13658 fputs ("@gotntpoff", file);
13659 break;
13660 case UNSPEC_INDNTPOFF:
13661 fputs ("@indntpoff", file);
13662 break;
13663 #if TARGET_MACHO
13664 case UNSPEC_MACHOPIC_OFFSET:
13665 putc ('-', file);
13666 machopic_output_function_base_name (file);
13667 break;
13668 #endif
13669 default:
13670 output_operand_lossage ("invalid UNSPEC as operand");
13671 break;
13672 }
13673 break;
13674
13675 default:
13676 output_operand_lossage ("invalid expression as operand");
13677 }
13678 }
13679
13680 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13681 We need to emit DTP-relative relocations. */
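/* (Sketch of the expected output, assuming ASM_LONG expands to "\t.long\t":
   for SIZE 4 this emits ".long x@dtpoff"; for SIZE 8 it emits
   ".long x@dtpoff, 0" so that the pair fills an 8-byte slot.)  */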
13682
13683 static void ATTRIBUTE_UNUSED
13684 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13685 {
13686 fputs (ASM_LONG, file);
13687 output_addr_const (file, x);
13688 fputs ("@dtpoff", file);
13689 switch (size)
13690 {
13691 case 4:
13692 break;
13693 case 8:
13694 fputs (", 0", file);
13695 break;
13696 default:
13697 gcc_unreachable ();
13698 }
13699 }
13700
13701 /* Return true if X is a representation of the PIC register. This copes
13702 with calls from ix86_find_base_term, where the register might have
13703 been replaced by a cselib value. */
13704
13705 static bool
13706 ix86_pic_register_p (rtx x)
13707 {
13708 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13709 return (pic_offset_table_rtx
13710 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13711 else
13712 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13713 }
13714
13715 /* Helper function for ix86_delegitimize_address.
13716 Attempt to delegitimize TLS local-exec accesses. */
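/* (For example, a sketch: an access such as %fs:x@tpoff(%rdi), i.e. a
   SEG_FS address whose displacement wraps (unspec [(symbol_ref "x")]
   UNSPEC_NTPOFF), is turned back into (plus (reg rdi) (symbol_ref "x")).)  */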
13717
13718 static rtx
13719 ix86_delegitimize_tls_address (rtx orig_x)
13720 {
13721 rtx x = orig_x, unspec;
13722 struct ix86_address addr;
13723
13724 if (!TARGET_TLS_DIRECT_SEG_REFS)
13725 return orig_x;
13726 if (MEM_P (x))
13727 x = XEXP (x, 0);
13728 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13729 return orig_x;
13730 if (ix86_decompose_address (x, &addr) == 0
13731 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13732 || addr.disp == NULL_RTX
13733 || GET_CODE (addr.disp) != CONST)
13734 return orig_x;
13735 unspec = XEXP (addr.disp, 0);
13736 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13737 unspec = XEXP (unspec, 0);
13738 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13739 return orig_x;
13740 x = XVECEXP (unspec, 0, 0);
13741 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13742 if (unspec != XEXP (addr.disp, 0))
13743 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13744 if (addr.index)
13745 {
13746 rtx idx = addr.index;
13747 if (addr.scale != 1)
13748 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13749 x = gen_rtx_PLUS (Pmode, idx, x);
13750 }
13751 if (addr.base)
13752 x = gen_rtx_PLUS (Pmode, addr.base, x);
13753 if (MEM_P (orig_x))
13754 x = replace_equiv_address_nv (orig_x, x);
13755 return x;
13756 }
13757
13758 /* In the name of slightly smaller debug output, and to cater to
13759 general assembler lossage, recognize PIC+GOTOFF and turn it back
13760 into a direct symbol reference.
13761
13762 On Darwin, this is necessary to avoid a crash, because Darwin
13763 has a different PIC label for each routine but the DWARF debugging
13764 information is not associated with any particular routine, so it's
13765 necessary to remove references to the PIC label from RTL stored by
13766 the DWARF output code. */
13767
13768 static rtx
13769 ix86_delegitimize_address (rtx x)
13770 {
13771 rtx orig_x = delegitimize_mem_from_attrs (x);
13772 /* addend is NULL or some rtx if x is something+GOTOFF where
13773 something doesn't include the PIC register. */
13774 rtx addend = NULL_RTX;
13775 /* reg_addend is NULL or a multiple of some register. */
13776 rtx reg_addend = NULL_RTX;
13777 /* const_addend is NULL or a const_int. */
13778 rtx const_addend = NULL_RTX;
13779 /* This is the result, or NULL. */
13780 rtx result = NULL_RTX;
13781
13782 x = orig_x;
13783
13784 if (MEM_P (x))
13785 x = XEXP (x, 0);
13786
13787 if (TARGET_64BIT)
13788 {
13789 if (GET_CODE (x) == CONST
13790 && GET_CODE (XEXP (x, 0)) == PLUS
13791 && GET_MODE (XEXP (x, 0)) == Pmode
13792 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13793 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13794 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13795 {
13796 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13797 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13798 if (MEM_P (orig_x))
13799 x = replace_equiv_address_nv (orig_x, x);
13800 return x;
13801 }
13802 if (GET_CODE (x) != CONST
13803 || GET_CODE (XEXP (x, 0)) != UNSPEC
13804 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13805 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13806 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13807 return ix86_delegitimize_tls_address (orig_x);
13808 x = XVECEXP (XEXP (x, 0), 0, 0);
13809 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13810 {
13811 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13812 GET_MODE (x), 0);
13813 if (x == NULL_RTX)
13814 return orig_x;
13815 }
13816 return x;
13817 }
13818
13819 if (GET_CODE (x) != PLUS
13820 || GET_CODE (XEXP (x, 1)) != CONST)
13821 return ix86_delegitimize_tls_address (orig_x);
13822
13823 if (ix86_pic_register_p (XEXP (x, 0)))
13824 /* %ebx + GOT/GOTOFF */
13825 ;
13826 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13827 {
13828 /* %ebx + %reg * scale + GOT/GOTOFF */
13829 reg_addend = XEXP (x, 0);
13830 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13831 reg_addend = XEXP (reg_addend, 1);
13832 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13833 reg_addend = XEXP (reg_addend, 0);
13834 else
13835 {
13836 reg_addend = NULL_RTX;
13837 addend = XEXP (x, 0);
13838 }
13839 }
13840 else
13841 addend = XEXP (x, 0);
13842
13843 x = XEXP (XEXP (x, 1), 0);
13844 if (GET_CODE (x) == PLUS
13845 && CONST_INT_P (XEXP (x, 1)))
13846 {
13847 const_addend = XEXP (x, 1);
13848 x = XEXP (x, 0);
13849 }
13850
13851 if (GET_CODE (x) == UNSPEC
13852 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13853 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13854 result = XVECEXP (x, 0, 0);
13855
13856 if (TARGET_MACHO && darwin_local_data_pic (x)
13857 && !MEM_P (orig_x))
13858 result = XVECEXP (x, 0, 0);
13859
13860 if (! result)
13861 return ix86_delegitimize_tls_address (orig_x);
13862
13863 if (const_addend)
13864 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13865 if (reg_addend)
13866 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13867 if (addend)
13868 {
13869 /* If the rest of the original X doesn't involve the PIC register, add
13870 addend and subtract pic_offset_table_rtx. This can happen e.g.
13871 for code like:
13872 leal (%ebx, %ecx, 4), %ecx
13873 ...
13874 movl foo@GOTOFF(%ecx), %edx
13875 in which case we return (%ecx - %ebx) + foo. */
13876 if (pic_offset_table_rtx)
13877 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13878 pic_offset_table_rtx),
13879 result);
13880 else
13881 return orig_x;
13882 }
13883 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13884 {
13885 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13886 if (result == NULL_RTX)
13887 return orig_x;
13888 }
13889 return result;
13890 }
13891
13892 /* If X is a machine specific address (i.e. a symbol or label being
13893 referenced as a displacement from the GOT implemented using an
13894 UNSPEC), then return the base term. Otherwise return X. */
13895
13896 rtx
13897 ix86_find_base_term (rtx x)
13898 {
13899 rtx term;
13900
13901 if (TARGET_64BIT)
13902 {
13903 if (GET_CODE (x) != CONST)
13904 return x;
13905 term = XEXP (x, 0);
13906 if (GET_CODE (term) == PLUS
13907 && (CONST_INT_P (XEXP (term, 1))
13908 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13909 term = XEXP (term, 0);
13910 if (GET_CODE (term) != UNSPEC
13911 || (XINT (term, 1) != UNSPEC_GOTPCREL
13912 && XINT (term, 1) != UNSPEC_PCREL))
13913 return x;
13914
13915 return XVECEXP (term, 0, 0);
13916 }
13917
13918 return ix86_delegitimize_address (x);
13919 }
13920 \f
13921 static void
13922 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13923 bool fp, FILE *file)
13924 {
13925 const char *suffix;
13926
13927 if (mode == CCFPmode || mode == CCFPUmode)
13928 {
13929 code = ix86_fp_compare_code_to_integer (code);
13930 mode = CCmode;
13931 }
13932 if (reverse)
13933 code = reverse_condition (code);
13934
13935 switch (code)
13936 {
13937 case EQ:
13938 switch (mode)
13939 {
13940 case CCAmode:
13941 suffix = "a";
13942 break;
13943
13944 case CCCmode:
13945 suffix = "c";
13946 break;
13947
13948 case CCOmode:
13949 suffix = "o";
13950 break;
13951
13952 case CCSmode:
13953 suffix = "s";
13954 break;
13955
13956 default:
13957 suffix = "e";
13958 }
13959 break;
13960 case NE:
13961 switch (mode)
13962 {
13963 case CCAmode:
13964 suffix = "na";
13965 break;
13966
13967 case CCCmode:
13968 suffix = "nc";
13969 break;
13970
13971 case CCOmode:
13972 suffix = "no";
13973 break;
13974
13975 case CCSmode:
13976 suffix = "ns";
13977 break;
13978
13979 default:
13980 suffix = "ne";
13981 }
13982 break;
13983 case GT:
13984 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13985 suffix = "g";
13986 break;
13987 case GTU:
13988 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13989 Those same assemblers have the same but opposite lossage on cmov. */
13990 if (mode == CCmode)
13991 suffix = fp ? "nbe" : "a";
13992 else if (mode == CCCmode)
13993 suffix = "b";
13994 else
13995 gcc_unreachable ();
13996 break;
13997 case LT:
13998 switch (mode)
13999 {
14000 case CCNOmode:
14001 case CCGOCmode:
14002 suffix = "s";
14003 break;
14004
14005 case CCmode:
14006 case CCGCmode:
14007 suffix = "l";
14008 break;
14009
14010 default:
14011 gcc_unreachable ();
14012 }
14013 break;
14014 case LTU:
14015 gcc_assert (mode == CCmode || mode == CCCmode);
14016 suffix = "b";
14017 break;
14018 case GE:
14019 switch (mode)
14020 {
14021 case CCNOmode:
14022 case CCGOCmode:
14023 suffix = "ns";
14024 break;
14025
14026 case CCmode:
14027 case CCGCmode:
14028 suffix = "ge";
14029 break;
14030
14031 default:
14032 gcc_unreachable ();
14033 }
14034 break;
14035 case GEU:
14036 /* ??? As above. */
14037 gcc_assert (mode == CCmode || mode == CCCmode);
14038 suffix = fp ? "nb" : "ae";
14039 break;
14040 case LE:
14041 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14042 suffix = "le";
14043 break;
14044 case LEU:
14045 /* ??? As above. */
14046 if (mode == CCmode)
14047 suffix = "be";
14048 else if (mode == CCCmode)
14049 suffix = fp ? "nb" : "ae";
14050 else
14051 gcc_unreachable ();
14052 break;
14053 case UNORDERED:
14054 suffix = fp ? "u" : "p";
14055 break;
14056 case ORDERED:
14057 suffix = fp ? "nu" : "np";
14058 break;
14059 default:
14060 gcc_unreachable ();
14061 }
14062 fputs (suffix, file);
14063 }
14064
14065 /* Print the name of register X to FILE based on its machine mode and number.
14066 If CODE is 'w', pretend the mode is HImode.
14067 If CODE is 'b', pretend the mode is QImode.
14068 If CODE is 'k', pretend the mode is SImode.
14069 If CODE is 'q', pretend the mode is DImode.
14070 If CODE is 'x', pretend the mode is V4SFmode.
14071 If CODE is 't', pretend the mode is V8SFmode.
14072 If CODE is 'h', pretend the reg is the 'high' byte register.
14073 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack operand.
14074 If CODE is 'd', duplicate the operand for an AVX instruction.
14075 */
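/* For example, with hard register 0 (the ax register), code 'k' prints
   "eax", 'q' prints "rax" (64-bit only), 'b' prints "al" and 'h' prints
   "ah"; the AT&T dialect additionally emits the leading '%'.  */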
14076
14077 void
14078 print_reg (rtx x, int code, FILE *file)
14079 {
14080 const char *reg;
14081 bool duplicated = code == 'd' && TARGET_AVX;
14082
14083 gcc_assert (x == pc_rtx
14084 || (REGNO (x) != ARG_POINTER_REGNUM
14085 && REGNO (x) != FRAME_POINTER_REGNUM
14086 && REGNO (x) != FLAGS_REG
14087 && REGNO (x) != FPSR_REG
14088 && REGNO (x) != FPCR_REG));
14089
14090 if (ASSEMBLER_DIALECT == ASM_ATT)
14091 putc ('%', file);
14092
14093 if (x == pc_rtx)
14094 {
14095 gcc_assert (TARGET_64BIT);
14096 fputs ("rip", file);
14097 return;
14098 }
14099
14100 if (code == 'w' || MMX_REG_P (x))
14101 code = 2;
14102 else if (code == 'b')
14103 code = 1;
14104 else if (code == 'k')
14105 code = 4;
14106 else if (code == 'q')
14107 code = 8;
14108 else if (code == 'y')
14109 code = 3;
14110 else if (code == 'h')
14111 code = 0;
14112 else if (code == 'x')
14113 code = 16;
14114 else if (code == 't')
14115 code = 32;
14116 else
14117 code = GET_MODE_SIZE (GET_MODE (x));
14118
14119 /* Irritatingly, AMD extended registers use a different naming convention
14120 from the normal registers: "r%d[bwd]". */
14121 if (REX_INT_REG_P (x))
14122 {
14123 gcc_assert (TARGET_64BIT);
14124 putc ('r', file);
14125 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
14126 switch (code)
14127 {
14128 case 0:
14129 error ("extended registers have no high halves");
14130 break;
14131 case 1:
14132 putc ('b', file);
14133 break;
14134 case 2:
14135 putc ('w', file);
14136 break;
14137 case 4:
14138 putc ('d', file);
14139 break;
14140 case 8:
14141 /* no suffix */
14142 break;
14143 default:
14144 error ("unsupported operand size for extended register");
14145 break;
14146 }
14147 return;
14148 }
14149
14150 reg = NULL;
14151 switch (code)
14152 {
14153 case 3:
14154 if (STACK_TOP_P (x))
14155 {
14156 reg = "st(0)";
14157 break;
14158 }
14159 /* FALLTHRU */
14160 case 8:
14161 case 4:
14162 case 12:
14163 if (! ANY_FP_REG_P (x))
14164 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14165 /* FALLTHRU */
14166 case 16:
14167 case 2:
14168 normal:
14169 reg = hi_reg_name[REGNO (x)];
14170 break;
14171 case 1:
14172 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
14173 goto normal;
14174 reg = qi_reg_name[REGNO (x)];
14175 break;
14176 case 0:
14177 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
14178 goto normal;
14179 reg = qi_high_reg_name[REGNO (x)];
14180 break;
14181 case 32:
14182 if (SSE_REG_P (x))
14183 {
14184 gcc_assert (!duplicated);
14185 putc ('y', file);
14186 fputs (hi_reg_name[REGNO (x)] + 1, file);
14187 return;
14188 }
14189 break;
14190 default:
14191 gcc_unreachable ();
14192 }
14193
14194 fputs (reg, file);
14195 if (duplicated)
14196 {
14197 if (ASSEMBLER_DIALECT == ASM_ATT)
14198 fprintf (file, ", %%%s", reg);
14199 else
14200 fprintf (file, ", %s", reg);
14201 }
14202 }
14203
14204 /* Locate some local-dynamic symbol still in use by this function
14205 so that we can print its name in some tls_local_dynamic_base
14206 pattern. */
14207
14208 static int
14209 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14210 {
14211 rtx x = *px;
14212
14213 if (GET_CODE (x) == SYMBOL_REF
14214 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14215 {
14216 cfun->machine->some_ld_name = XSTR (x, 0);
14217 return 1;
14218 }
14219
14220 return 0;
14221 }
14222
14223 static const char *
14224 get_some_local_dynamic_name (void)
14225 {
14226 rtx insn;
14227
14228 if (cfun->machine->some_ld_name)
14229 return cfun->machine->some_ld_name;
14230
14231 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14232 if (NONDEBUG_INSN_P (insn)
14233 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14234 return cfun->machine->some_ld_name;
14235
14236 return NULL;
14237 }
14238
14239 /* Meaning of CODE:
14240 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14241 C -- print opcode suffix for set/cmov insn.
14242 c -- like C, but print reversed condition
14243 F,f -- likewise, but for floating-point.
14244 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14245 otherwise nothing
14246 R -- print the prefix for register names.
14247 z -- print the opcode suffix for the size of the current operand.
14248 Z -- likewise, with special suffixes for x87 instructions.
14249 * -- print a star (in certain assembler syntax)
14250 A -- print an absolute memory reference.
14251 E -- print address with DImode register names if TARGET_64BIT.
14252 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14253 s -- print a shift double count, followed by the assembler's argument
14254 delimiter.
14255 b -- print the QImode name of the register for the indicated operand.
14256 %b0 would print %al if operands[0] is reg 0.
14257 w -- likewise, print the HImode name of the register.
14258 k -- likewise, print the SImode name of the register.
14259 q -- likewise, print the DImode name of the register.
14260 x -- likewise, print the V4SFmode name of the register.
14261 t -- likewise, print the V8SFmode name of the register.
14262 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14263 y -- print "st(0)" instead of "st" as a register.
14264 d -- print duplicated register operand for AVX instruction.
14265 D -- print condition for SSE cmp instruction.
14266 P -- if PIC, print an @PLT suffix.
14267 p -- print raw symbol name.
14268 X -- don't print any sort of PIC '@' suffix for a symbol.
14269 & -- print some in-use local-dynamic symbol name.
14270 H -- print a memory address offset by 8; used for sse high-parts
14271 Y -- print condition for XOP pcom* instruction.
14272 + -- print a branch hint as 'cs' or 'ds' prefix
14273 ; -- print a semicolon (after prefixes due to bug in older gas).
14274 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14275 @ -- print a segment register of thread base pointer load
14276 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14277 */
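/* For example, the x87 template "fistp%Z0\t%0" used by output_fix_trunc
   below expands 'Z' to the memory size suffix of operand 0, giving
   "fistpl" for an SImode destination and "fistpq" (or "fistpll") for
   DImode.  */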
14278
14279 void
14280 ix86_print_operand (FILE *file, rtx x, int code)
14281 {
14282 if (code)
14283 {
14284 switch (code)
14285 {
14286 case 'A':
14287 switch (ASSEMBLER_DIALECT)
14288 {
14289 case ASM_ATT:
14290 putc ('*', file);
14291 break;
14292
14293 case ASM_INTEL:
14294 /* Intel syntax. For absolute addresses, registers should not
14295 be surrounded by brackets. */
14296 if (!REG_P (x))
14297 {
14298 putc ('[', file);
14299 ix86_print_operand (file, x, 0);
14300 putc (']', file);
14301 return;
14302 }
14303 break;
14304
14305 default:
14306 gcc_unreachable ();
14307 }
14308
14309 ix86_print_operand (file, x, 0);
14310 return;
14311
14312 case 'E':
14313 /* Wrap address in an UNSPEC to declare special handling. */
14314 if (TARGET_64BIT)
14315 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14316
14317 output_address (x);
14318 return;
14319
14320 case 'L':
14321 if (ASSEMBLER_DIALECT == ASM_ATT)
14322 putc ('l', file);
14323 return;
14324
14325 case 'W':
14326 if (ASSEMBLER_DIALECT == ASM_ATT)
14327 putc ('w', file);
14328 return;
14329
14330 case 'B':
14331 if (ASSEMBLER_DIALECT == ASM_ATT)
14332 putc ('b', file);
14333 return;
14334
14335 case 'Q':
14336 if (ASSEMBLER_DIALECT == ASM_ATT)
14337 putc ('l', file);
14338 return;
14339
14340 case 'S':
14341 if (ASSEMBLER_DIALECT == ASM_ATT)
14342 putc ('s', file);
14343 return;
14344
14345 case 'T':
14346 if (ASSEMBLER_DIALECT == ASM_ATT)
14347 putc ('t', file);
14348 return;
14349
14350 case 'O':
14351 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14352 if (ASSEMBLER_DIALECT != ASM_ATT)
14353 return;
14354
14355 switch (GET_MODE_SIZE (GET_MODE (x)))
14356 {
14357 case 2:
14358 putc ('w', file);
14359 break;
14360
14361 case 4:
14362 putc ('l', file);
14363 break;
14364
14365 case 8:
14366 putc ('q', file);
14367 break;
14368
14369 default:
14370 output_operand_lossage
14371 ("invalid operand size for operand code 'O'");
14372 return;
14373 }
14374
14375 putc ('.', file);
14376 #endif
14377 return;
14378
14379 case 'z':
14380 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14381 {
14382 /* Opcodes don't get size suffixes if using Intel opcodes. */
14383 if (ASSEMBLER_DIALECT == ASM_INTEL)
14384 return;
14385
14386 switch (GET_MODE_SIZE (GET_MODE (x)))
14387 {
14388 case 1:
14389 putc ('b', file);
14390 return;
14391
14392 case 2:
14393 putc ('w', file);
14394 return;
14395
14396 case 4:
14397 putc ('l', file);
14398 return;
14399
14400 case 8:
14401 putc ('q', file);
14402 return;
14403
14404 default:
14405 output_operand_lossage
14406 ("invalid operand size for operand code 'z'");
14407 return;
14408 }
14409 }
14410
14411 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14412 warning
14413 (0, "non-integer operand used with operand code 'z'");
14414 /* FALLTHRU */
14415
14416 case 'Z':
14417 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14418 if (ASSEMBLER_DIALECT == ASM_INTEL)
14419 return;
14420
14421 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14422 {
14423 switch (GET_MODE_SIZE (GET_MODE (x)))
14424 {
14425 case 2:
14426 #ifdef HAVE_AS_IX86_FILDS
14427 putc ('s', file);
14428 #endif
14429 return;
14430
14431 case 4:
14432 putc ('l', file);
14433 return;
14434
14435 case 8:
14436 #ifdef HAVE_AS_IX86_FILDQ
14437 putc ('q', file);
14438 #else
14439 fputs ("ll", file);
14440 #endif
14441 return;
14442
14443 default:
14444 break;
14445 }
14446 }
14447 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14448 {
14449 /* 387 opcodes don't get size suffixes
14450 if the operands are registers. */
14451 if (STACK_REG_P (x))
14452 return;
14453
14454 switch (GET_MODE_SIZE (GET_MODE (x)))
14455 {
14456 case 4:
14457 putc ('s', file);
14458 return;
14459
14460 case 8:
14461 putc ('l', file);
14462 return;
14463
14464 case 12:
14465 case 16:
14466 putc ('t', file);
14467 return;
14468
14469 default:
14470 break;
14471 }
14472 }
14473 else
14474 {
14475 output_operand_lossage
14476 ("invalid operand type used with operand code 'Z'");
14477 return;
14478 }
14479
14480 output_operand_lossage
14481 ("invalid operand size for operand code 'Z'");
14482 return;
14483
14484 case 'd':
14485 case 'b':
14486 case 'w':
14487 case 'k':
14488 case 'q':
14489 case 'h':
14490 case 't':
14491 case 'y':
14492 case 'x':
14493 case 'X':
14494 case 'P':
14495 case 'p':
14496 break;
14497
14498 case 's':
14499 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14500 {
14501 ix86_print_operand (file, x, 0);
14502 fputs (", ", file);
14503 }
14504 return;
14505
14506 case 'Y':
14507 switch (GET_CODE (x))
14508 {
14509 case NE:
14510 fputs ("neq", file);
14511 break;
14512 case EQ:
14513 fputs ("eq", file);
14514 break;
14515 case GE:
14516 case GEU:
14517 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14518 break;
14519 case GT:
14520 case GTU:
14521 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14522 break;
14523 case LE:
14524 case LEU:
14525 fputs ("le", file);
14526 break;
14527 case LT:
14528 case LTU:
14529 fputs ("lt", file);
14530 break;
14531 case UNORDERED:
14532 fputs ("unord", file);
14533 break;
14534 case ORDERED:
14535 fputs ("ord", file);
14536 break;
14537 case UNEQ:
14538 fputs ("ueq", file);
14539 break;
14540 case UNGE:
14541 fputs ("nlt", file);
14542 break;
14543 case UNGT:
14544 fputs ("nle", file);
14545 break;
14546 case UNLE:
14547 fputs ("ule", file);
14548 break;
14549 case UNLT:
14550 fputs ("ult", file);
14551 break;
14552 case LTGT:
14553 fputs ("une", file);
14554 break;
14555 default:
14556 output_operand_lossage ("operand is not a condition code, "
14557 "invalid operand code 'Y'");
14558 return;
14559 }
14560 return;
14561
14562 case 'D':
14563 /* A little bit of brain damage here. The SSE compare instructions
14564 use completely different names for the comparisons than the
14565 fp conditional moves do. */
14566 switch (GET_CODE (x))
14567 {
14568 case UNEQ:
14569 if (TARGET_AVX)
14570 {
14571 fputs ("eq_us", file);
14572 break;
14573 }
14574 case EQ:
14575 fputs ("eq", file);
14576 break;
14577 case UNLT:
14578 if (TARGET_AVX)
14579 {
14580 fputs ("nge", file);
14581 break;
14582 }
14583 case LT:
14584 fputs ("lt", file);
14585 break;
14586 case UNLE:
14587 if (TARGET_AVX)
14588 {
14589 fputs ("ngt", file);
14590 break;
14591 }
14592 case LE:
14593 fputs ("le", file);
14594 break;
14595 case UNORDERED:
14596 fputs ("unord", file);
14597 break;
14598 case LTGT:
14599 if (TARGET_AVX)
14600 {
14601 fputs ("neq_oq", file);
14602 break;
14603 }
14604 case NE:
14605 fputs ("neq", file);
14606 break;
14607 case GE:
14608 if (TARGET_AVX)
14609 {
14610 fputs ("ge", file);
14611 break;
14612 }
14613 case UNGE:
14614 fputs ("nlt", file);
14615 break;
14616 case GT:
14617 if (TARGET_AVX)
14618 {
14619 fputs ("gt", file);
14620 break;
14621 }
14622 case UNGT:
14623 fputs ("nle", file);
14624 break;
14625 case ORDERED:
14626 fputs ("ord", file);
14627 break;
14628 default:
14629 output_operand_lossage ("operand is not a condition code, "
14630 "invalid operand code 'D'");
14631 return;
14632 }
14633 return;
14634
14635 case 'F':
14636 case 'f':
14637 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14638 if (ASSEMBLER_DIALECT == ASM_ATT)
14639 putc ('.', file);
14640 #endif
14641
14642 case 'C':
14643 case 'c':
14644 if (!COMPARISON_P (x))
14645 {
14646 output_operand_lossage ("operand is not a condition code, "
14647 "invalid operand code '%c'", code);
14648 return;
14649 }
14650 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14651 code == 'c' || code == 'f',
14652 code == 'F' || code == 'f',
14653 file);
14654 return;
14655
14656 case 'H':
14657 if (!offsettable_memref_p (x))
14658 {
14659 output_operand_lossage ("operand is not an offsettable memory "
14660 "reference, invalid operand code 'H'");
14661 return;
14662 }
14663 /* It doesn't actually matter what mode we use here, as we're
14664 only going to use this for printing. */
14665 x = adjust_address_nv (x, DImode, 8);
14666 break;
14667
14668 case 'K':
14669 gcc_assert (CONST_INT_P (x));
14670
14671 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14672 #ifdef HAVE_AS_IX86_HLE
14673 fputs ("xacquire ", file);
14674 #else
14675 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14676 #endif
14677 else if (INTVAL (x) & IX86_HLE_RELEASE)
14678 #ifdef HAVE_AS_IX86_HLE
14679 fputs ("xrelease ", file);
14680 #else
14681 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14682 #endif
14683 /* We do not want to print the value of the operand. */
14684 return;
14685
14686 case '*':
14687 if (ASSEMBLER_DIALECT == ASM_ATT)
14688 putc ('*', file);
14689 return;
14690
14691 case '&':
14692 {
14693 const char *name = get_some_local_dynamic_name ();
14694 if (name == NULL)
14695 output_operand_lossage ("'%%&' used without any "
14696 "local dynamic TLS references");
14697 else
14698 assemble_name (file, name);
14699 return;
14700 }
14701
14702 case '+':
14703 {
14704 rtx x;
14705
14706 if (!optimize
14707 || optimize_function_for_size_p (cfun)
14708 || !TARGET_BRANCH_PREDICTION_HINTS)
14709 return;
14710
14711 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14712 if (x)
14713 {
14714 int pred_val = INTVAL (XEXP (x, 0));
14715
14716 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14717 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14718 {
14719 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14720 bool cputaken
14721 = final_forward_branch_p (current_output_insn) == 0;
14722
14723 /* Emit hints only in the case default branch prediction
14724 heuristics would fail. */
14725 if (taken != cputaken)
14726 {
14727 /* We use 3e (DS) prefix for taken branches and
14728 2e (CS) prefix for not taken branches. */
14729 if (taken)
14730 fputs ("ds ; ", file);
14731 else
14732 fputs ("cs ; ", file);
14733 }
14734 }
14735 }
14736 return;
14737 }
14738
14739 case ';':
14740 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14741 putc (';', file);
14742 #endif
14743 return;
14744
14745 case '@':
14746 if (ASSEMBLER_DIALECT == ASM_ATT)
14747 putc ('%', file);
14748
14749 /* The kernel uses a different segment register for performance
14750 reasons; a system call would not have to trash the userspace
14751 segment register, which would be expensive. */
14752 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14753 fputs ("fs", file);
14754 else
14755 fputs ("gs", file);
14756 return;
14757
14758 case '~':
14759 putc (TARGET_AVX2 ? 'i' : 'f', file);
14760 return;
14761
14762 case '^':
14763 if (TARGET_64BIT && Pmode != word_mode)
14764 fputs ("addr32 ", file);
14765 return;
14766
14767 default:
14768 output_operand_lossage ("invalid operand code '%c'", code);
14769 }
14770 }
14771
14772 if (REG_P (x))
14773 print_reg (x, code, file);
14774
14775 else if (MEM_P (x))
14776 {
14777 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14778 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14779 && GET_MODE (x) != BLKmode)
14780 {
14781 const char * size;
14782 switch (GET_MODE_SIZE (GET_MODE (x)))
14783 {
14784 case 1: size = "BYTE"; break;
14785 case 2: size = "WORD"; break;
14786 case 4: size = "DWORD"; break;
14787 case 8: size = "QWORD"; break;
14788 case 12: size = "TBYTE"; break;
14789 case 16:
14790 if (GET_MODE (x) == XFmode)
14791 size = "TBYTE";
14792 else
14793 size = "XMMWORD";
14794 break;
14795 case 32: size = "YMMWORD"; break;
14796 default:
14797 gcc_unreachable ();
14798 }
14799
14800 /* Check for explicit size override (codes 'b', 'w', 'k',
14801 'q' and 'x') */
14802 if (code == 'b')
14803 size = "BYTE";
14804 else if (code == 'w')
14805 size = "WORD";
14806 else if (code == 'k')
14807 size = "DWORD";
14808 else if (code == 'q')
14809 size = "QWORD";
14810 else if (code == 'x')
14811 size = "XMMWORD";
14812
14813 fputs (size, file);
14814 fputs (" PTR ", file);
14815 }
14816
14817 x = XEXP (x, 0);
14818 /* Avoid (%rip) for call operands. */
14819 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14820 && !CONST_INT_P (x))
14821 output_addr_const (file, x);
14822 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14823 output_operand_lossage ("invalid constraints for operand");
14824 else
14825 output_address (x);
14826 }
14827
14828 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14829 {
14830 REAL_VALUE_TYPE r;
14831 long l;
14832
14833 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14834 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14835
14836 if (ASSEMBLER_DIALECT == ASM_ATT)
14837 putc ('$', file);
14838 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14839 if (code == 'q')
14840 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14841 else
14842 fprintf (file, "0x%08x", (unsigned int) l);
14843 }
14844
14845 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14846 {
14847 REAL_VALUE_TYPE r;
14848 long l[2];
14849
14850 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14851 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14852
14853 if (ASSEMBLER_DIALECT == ASM_ATT)
14854 putc ('$', file);
14855 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14856 }
14857
14858 /* These float cases don't actually occur as immediate operands. */
14859 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14860 {
14861 char dstr[30];
14862
14863 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14864 fputs (dstr, file);
14865 }
14866
14867 else
14868 {
14869 /* We have patterns that allow zero sets of memory, for instance.
14870 In 64-bit mode, we should probably support all 8-byte vectors,
14871 since we can in fact encode that into an immediate. */
14872 if (GET_CODE (x) == CONST_VECTOR)
14873 {
14874 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14875 x = const0_rtx;
14876 }
14877
14878 if (code != 'P' && code != 'p')
14879 {
14880 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14881 {
14882 if (ASSEMBLER_DIALECT == ASM_ATT)
14883 putc ('$', file);
14884 }
14885 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14886 || GET_CODE (x) == LABEL_REF)
14887 {
14888 if (ASSEMBLER_DIALECT == ASM_ATT)
14889 putc ('$', file);
14890 else
14891 fputs ("OFFSET FLAT:", file);
14892 }
14893 }
14894 if (CONST_INT_P (x))
14895 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14896 else if (flag_pic || MACHOPIC_INDIRECT)
14897 output_pic_addr_const (file, x, code);
14898 else
14899 output_addr_const (file, x);
14900 }
14901 }
14902
14903 static bool
14904 ix86_print_operand_punct_valid_p (unsigned char code)
14905 {
14906 return (code == '@' || code == '*' || code == '+' || code == '&'
14907 || code == ';' || code == '~' || code == '^');
14908 }
14909 \f
14910 /* Print a memory operand whose address is ADDR. */
14911
14912 static void
14913 ix86_print_operand_address (FILE *file, rtx addr)
14914 {
14915 struct ix86_address parts;
14916 rtx base, index, disp;
14917 int scale;
14918 int ok;
14919 bool vsib = false;
14920 int code = 0;
14921
14922 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14923 {
14924 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14925 gcc_assert (parts.index == NULL_RTX);
14926 parts.index = XVECEXP (addr, 0, 1);
14927 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14928 addr = XVECEXP (addr, 0, 0);
14929 vsib = true;
14930 }
14931 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14932 {
14933 gcc_assert (TARGET_64BIT);
14934 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14935 code = 'q';
14936 }
14937 else
14938 ok = ix86_decompose_address (addr, &parts);
14939
14940 gcc_assert (ok);
14941
14942 if (parts.base && GET_CODE (parts.base) == SUBREG)
14943 {
14944 rtx tmp = SUBREG_REG (parts.base);
14945 parts.base = simplify_subreg (GET_MODE (parts.base),
14946 tmp, GET_MODE (tmp), 0);
14947 gcc_assert (parts.base != NULL_RTX);
14948 }
14949
14950 if (parts.index && GET_CODE (parts.index) == SUBREG)
14951 {
14952 rtx tmp = SUBREG_REG (parts.index);
14953 parts.index = simplify_subreg (GET_MODE (parts.index),
14954 tmp, GET_MODE (tmp), 0);
14955 gcc_assert (parts.index != NULL_RTX);
14956 }
14957
14958 base = parts.base;
14959 index = parts.index;
14960 disp = parts.disp;
14961 scale = parts.scale;
14962
14963 switch (parts.seg)
14964 {
14965 case SEG_DEFAULT:
14966 break;
14967 case SEG_FS:
14968 case SEG_GS:
14969 if (ASSEMBLER_DIALECT == ASM_ATT)
14970 putc ('%', file);
14971 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14972 break;
14973 default:
14974 gcc_unreachable ();
14975 }
14976
14977 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14978 if (TARGET_64BIT && !base && !index)
14979 {
14980 rtx symbol = disp;
14981
14982 if (GET_CODE (disp) == CONST
14983 && GET_CODE (XEXP (disp, 0)) == PLUS
14984 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14985 symbol = XEXP (XEXP (disp, 0), 0);
14986
14987 if (GET_CODE (symbol) == LABEL_REF
14988 || (GET_CODE (symbol) == SYMBOL_REF
14989 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14990 base = pc_rtx;
14991 }
14992 if (!base && !index)
14993 {
14994 /* A displacement-only address requires special attention. */
14995
14996 if (CONST_INT_P (disp))
14997 {
14998 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14999 fputs ("ds:", file);
15000 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15001 }
15002 else if (flag_pic)
15003 output_pic_addr_const (file, disp, 0);
15004 else
15005 output_addr_const (file, disp);
15006 }
15007 else
15008 {
15009 /* Print SImode register names to force addr32 prefix. */
15010 if (SImode_address_operand (addr, VOIDmode))
15011 {
15012 #ifdef ENABLE_CHECKING
15013 gcc_assert (TARGET_64BIT);
15014 switch (GET_CODE (addr))
15015 {
15016 case SUBREG:
15017 gcc_assert (GET_MODE (addr) == SImode);
15018 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15019 break;
15020 case ZERO_EXTEND:
15021 case AND:
15022 gcc_assert (GET_MODE (addr) == DImode);
15023 break;
15024 default:
15025 gcc_unreachable ();
15026 }
15027 #endif
15028 gcc_assert (!code);
15029 code = 'l';
15030 }
15031
15032 if (ASSEMBLER_DIALECT == ASM_ATT)
15033 {
15034 if (disp)
15035 {
15036 if (flag_pic)
15037 output_pic_addr_const (file, disp, 0);
15038 else if (GET_CODE (disp) == LABEL_REF)
15039 output_asm_label (disp);
15040 else
15041 output_addr_const (file, disp);
15042 }
15043
15044 putc ('(', file);
15045 if (base)
15046 print_reg (base, code, file);
15047 if (index)
15048 {
15049 putc (',', file);
15050 print_reg (index, vsib ? 0 : code, file);
15051 if (scale != 1 || vsib)
15052 fprintf (file, ",%d", scale);
15053 }
15054 putc (')', file);
15055 }
15056 else
15057 {
15058 rtx offset = NULL_RTX;
15059
15060 if (disp)
15061 {
15062 /* Pull out the offset of a symbol; print any symbol itself. */
15063 if (GET_CODE (disp) == CONST
15064 && GET_CODE (XEXP (disp, 0)) == PLUS
15065 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15066 {
15067 offset = XEXP (XEXP (disp, 0), 1);
15068 disp = gen_rtx_CONST (VOIDmode,
15069 XEXP (XEXP (disp, 0), 0));
15070 }
15071
15072 if (flag_pic)
15073 output_pic_addr_const (file, disp, 0);
15074 else if (GET_CODE (disp) == LABEL_REF)
15075 output_asm_label (disp);
15076 else if (CONST_INT_P (disp))
15077 offset = disp;
15078 else
15079 output_addr_const (file, disp);
15080 }
15081
15082 putc ('[', file);
15083 if (base)
15084 {
15085 print_reg (base, code, file);
15086 if (offset)
15087 {
15088 if (INTVAL (offset) >= 0)
15089 putc ('+', file);
15090 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15091 }
15092 }
15093 else if (offset)
15094 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15095 else
15096 putc ('0', file);
15097
15098 if (index)
15099 {
15100 putc ('+', file);
15101 print_reg (index, vsib ? 0 : code, file);
15102 if (scale != 1 || vsib)
15103 fprintf (file, "*%d", scale);
15104 }
15105 putc (']', file);
15106 }
15107 }
15108 }
15109
15110 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15111
15112 static bool
15113 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15114 {
15115 rtx op;
15116
15117 if (GET_CODE (x) != UNSPEC)
15118 return false;
15119
15120 op = XVECEXP (x, 0, 0);
15121 switch (XINT (x, 1))
15122 {
15123 case UNSPEC_GOTTPOFF:
15124 output_addr_const (file, op);
15125 /* FIXME: This might be @TPOFF in Sun ld. */
15126 fputs ("@gottpoff", file);
15127 break;
15128 case UNSPEC_TPOFF:
15129 output_addr_const (file, op);
15130 fputs ("@tpoff", file);
15131 break;
15132 case UNSPEC_NTPOFF:
15133 output_addr_const (file, op);
15134 if (TARGET_64BIT)
15135 fputs ("@tpoff", file);
15136 else
15137 fputs ("@ntpoff", file);
15138 break;
15139 case UNSPEC_DTPOFF:
15140 output_addr_const (file, op);
15141 fputs ("@dtpoff", file);
15142 break;
15143 case UNSPEC_GOTNTPOFF:
15144 output_addr_const (file, op);
15145 if (TARGET_64BIT)
15146 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15147 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15148 else
15149 fputs ("@gotntpoff", file);
15150 break;
15151 case UNSPEC_INDNTPOFF:
15152 output_addr_const (file, op);
15153 fputs ("@indntpoff", file);
15154 break;
15155 #if TARGET_MACHO
15156 case UNSPEC_MACHOPIC_OFFSET:
15157 output_addr_const (file, op);
15158 putc ('-', file);
15159 machopic_output_function_base_name (file);
15160 break;
15161 #endif
15162
15163 case UNSPEC_STACK_CHECK:
15164 {
15165 int offset;
15166
15167 gcc_assert (flag_split_stack);
15168
15169 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15170 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15171 #else
15172 gcc_unreachable ();
15173 #endif
15174
15175 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15176 }
15177 break;
15178
15179 default:
15180 return false;
15181 }
15182
15183 return true;
15184 }
15185 \f
15186 /* Split one or more double-mode RTL references into pairs of half-mode
15187 references. The RTL can be REG, offsettable MEM, integer constant, or
15188 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15189 split and "num" is its length. lo_half and hi_half are output arrays
15190 that parallel "operands". */
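/* For example, splitting a DImode value on ia32 produces the SImode
   subwords at byte offsets 0 and 4; MEMs are split with adjust_address
   rather than simplify_gen_subreg so that volatile references survive.  */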
15191
15192 void
15193 split_double_mode (enum machine_mode mode, rtx operands[],
15194 int num, rtx lo_half[], rtx hi_half[])
15195 {
15196 enum machine_mode half_mode;
15197 unsigned int byte;
15198
15199 switch (mode)
15200 {
15201 case TImode:
15202 half_mode = DImode;
15203 break;
15204 case DImode:
15205 half_mode = SImode;
15206 break;
15207 default:
15208 gcc_unreachable ();
15209 }
15210
15211 byte = GET_MODE_SIZE (half_mode);
15212
15213 while (num--)
15214 {
15215 rtx op = operands[num];
15216
15217 /* simplify_subreg refuses to split volatile memory addresses,
15218 but we still have to handle them. */
15219 if (MEM_P (op))
15220 {
15221 lo_half[num] = adjust_address (op, half_mode, 0);
15222 hi_half[num] = adjust_address (op, half_mode, byte);
15223 }
15224 else
15225 {
15226 lo_half[num] = simplify_gen_subreg (half_mode, op,
15227 GET_MODE (op) == VOIDmode
15228 ? mode : GET_MODE (op), 0);
15229 hi_half[num] = simplify_gen_subreg (half_mode, op,
15230 GET_MODE (op) == VOIDmode
15231 ? mode : GET_MODE (op), byte);
15232 }
15233 }
15234 }
15235 \f
15236 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15237 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15238 is the expression of the binary operation. The output may either be
15239 emitted here, or returned to the caller, like all output_* functions.
15240
15241 There is no guarantee that the operands are the same mode, as they
15242 might be within FLOAT or FLOAT_EXTEND expressions. */
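/* For example, an SFmode addition in SSE registers yields
   "vaddss\t{%2, %1, %0|%0, %1, %2}" under AVX or "addss\t{%2, %0|%0, %2}"
   otherwise, while the x87 forms are built from "fadd"/"fiadd" plus the
   operand template selected below.  */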
15243
15244 #ifndef SYSV386_COMPAT
15245 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15246 wants to fix the assemblers because that causes incompatibility
15247 with gcc. No-one wants to fix gcc because that causes
15248 incompatibility with assemblers... You can use the option of
15249 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15250 #define SYSV386_COMPAT 1
15251 #endif
15252
15253 const char *
15254 output_387_binary_op (rtx insn, rtx *operands)
15255 {
15256 static char buf[40];
15257 const char *p;
15258 const char *ssep;
15259 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15260
15261 #ifdef ENABLE_CHECKING
15262 /* Even if we do not want to check the inputs, this documents input
15263 constraints. Which helps in understanding the following code. */
15264 if (STACK_REG_P (operands[0])
15265 && ((REG_P (operands[1])
15266 && REGNO (operands[0]) == REGNO (operands[1])
15267 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15268 || (REG_P (operands[2])
15269 && REGNO (operands[0]) == REGNO (operands[2])
15270 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15271 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15272 ; /* ok */
15273 else
15274 gcc_assert (is_sse);
15275 #endif
15276
15277 switch (GET_CODE (operands[3]))
15278 {
15279 case PLUS:
15280 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15281 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15282 p = "fiadd";
15283 else
15284 p = "fadd";
15285 ssep = "vadd";
15286 break;
15287
15288 case MINUS:
15289 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15290 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15291 p = "fisub";
15292 else
15293 p = "fsub";
15294 ssep = "vsub";
15295 break;
15296
15297 case MULT:
15298 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15299 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15300 p = "fimul";
15301 else
15302 p = "fmul";
15303 ssep = "vmul";
15304 break;
15305
15306 case DIV:
15307 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15308 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15309 p = "fidiv";
15310 else
15311 p = "fdiv";
15312 ssep = "vdiv";
15313 break;
15314
15315 default:
15316 gcc_unreachable ();
15317 }
15318
15319 if (is_sse)
15320 {
15321 if (TARGET_AVX)
15322 {
15323 strcpy (buf, ssep);
15324 if (GET_MODE (operands[0]) == SFmode)
15325 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15326 else
15327 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15328 }
15329 else
15330 {
15331 strcpy (buf, ssep + 1);
15332 if (GET_MODE (operands[0]) == SFmode)
15333 strcat (buf, "ss\t{%2, %0|%0, %2}");
15334 else
15335 strcat (buf, "sd\t{%2, %0|%0, %2}");
15336 }
15337 return buf;
15338 }
15339 strcpy (buf, p);
15340
15341 switch (GET_CODE (operands[3]))
15342 {
15343 case MULT:
15344 case PLUS:
15345 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15346 {
15347 rtx temp = operands[2];
15348 operands[2] = operands[1];
15349 operands[1] = temp;
15350 }
15351
15352 /* We know operands[0] == operands[1]. */
15353
15354 if (MEM_P (operands[2]))
15355 {
15356 p = "%Z2\t%2";
15357 break;
15358 }
15359
15360 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15361 {
15362 if (STACK_TOP_P (operands[0]))
15363 /* How is it that we are storing to a dead operand[2]?
15364 Well, presumably operands[1] is dead too. We can't
15365 store the result to st(0) as st(0) gets popped on this
15366 instruction. Instead store to operands[2] (which I
15367 think has to be st(1)). st(1) will be popped later.
15368 gcc <= 2.8.1 didn't have this check and generated
15369 assembly code that the Unixware assembler rejected. */
15370 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15371 else
15372 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15373 break;
15374 }
15375
15376 if (STACK_TOP_P (operands[0]))
15377 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15378 else
15379 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15380 break;
15381
15382 case MINUS:
15383 case DIV:
15384 if (MEM_P (operands[1]))
15385 {
15386 p = "r%Z1\t%1";
15387 break;
15388 }
15389
15390 if (MEM_P (operands[2]))
15391 {
15392 p = "%Z2\t%2";
15393 break;
15394 }
15395
15396 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15397 {
15398 #if SYSV386_COMPAT
15399 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15400 derived assemblers, confusingly reverse the direction of
15401 the operation for fsub{r} and fdiv{r} when the
15402 destination register is not st(0). The Intel assembler
15403 doesn't have this brain damage. Read !SYSV386_COMPAT to
15404 figure out what the hardware really does. */
15405 if (STACK_TOP_P (operands[0]))
15406 p = "{p\t%0, %2|rp\t%2, %0}";
15407 else
15408 p = "{rp\t%2, %0|p\t%0, %2}";
15409 #else
15410 if (STACK_TOP_P (operands[0]))
15411 /* As above for fmul/fadd, we can't store to st(0). */
15412 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15413 else
15414 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15415 #endif
15416 break;
15417 }
15418
15419 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15420 {
15421 #if SYSV386_COMPAT
15422 if (STACK_TOP_P (operands[0]))
15423 p = "{rp\t%0, %1|p\t%1, %0}";
15424 else
15425 p = "{p\t%1, %0|rp\t%0, %1}";
15426 #else
15427 if (STACK_TOP_P (operands[0]))
15428 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15429 else
15430 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15431 #endif
15432 break;
15433 }
15434
15435 if (STACK_TOP_P (operands[0]))
15436 {
15437 if (STACK_TOP_P (operands[1]))
15438 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15439 else
15440 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15441 break;
15442 }
15443 else if (STACK_TOP_P (operands[1]))
15444 {
15445 #if SYSV386_COMPAT
15446 p = "{\t%1, %0|r\t%0, %1}";
15447 #else
15448 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15449 #endif
15450 }
15451 else
15452 {
15453 #if SYSV386_COMPAT
15454 p = "{r\t%2, %0|\t%0, %2}";
15455 #else
15456 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15457 #endif
15458 }
15459 break;
15460
15461 default:
15462 gcc_unreachable ();
15463 }
15464
15465 strcat (buf, p);
15466 return buf;
15467 }
15468
15469 /* Return needed mode for entity in optimize_mode_switching pass. */
15470
15471 int
15472 ix86_mode_needed (int entity, rtx insn)
15473 {
15474 enum attr_i387_cw mode;
15475
15476 /* The mode UNINITIALIZED is used to store the control word after a
15477 function call or ASM pattern. The mode ANY specifies that the function
15478 has no requirements on the control word and makes no changes to the
15479 bits we are interested in. */
15480
15481 if (CALL_P (insn)
15482 || (NONJUMP_INSN_P (insn)
15483 && (asm_noperands (PATTERN (insn)) >= 0
15484 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15485 return I387_CW_UNINITIALIZED;
15486
15487 if (recog_memoized (insn) < 0)
15488 return I387_CW_ANY;
15489
15490 mode = get_attr_i387_cw (insn);
15491
15492 switch (entity)
15493 {
15494 case I387_TRUNC:
15495 if (mode == I387_CW_TRUNC)
15496 return mode;
15497 break;
15498
15499 case I387_FLOOR:
15500 if (mode == I387_CW_FLOOR)
15501 return mode;
15502 break;
15503
15504 case I387_CEIL:
15505 if (mode == I387_CW_CEIL)
15506 return mode;
15507 break;
15508
15509 case I387_MASK_PM:
15510 if (mode == I387_CW_MASK_PM)
15511 return mode;
15512 break;
15513
15514 default:
15515 gcc_unreachable ();
15516 }
15517
15518 return I387_CW_ANY;
15519 }
15520
15521 /* Output code to initialize the control word copies used by the trunc?f?i
15522 and rounding patterns. The current control word is saved in SLOT_CW_STORED,
15523 while a modified copy for rounding mode MODE is stored in its own slot. */
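/* The x87 control word encodes the rounding mode in bits 10-11
   (00 = nearest, 01 = down, 10 = up, 11 = truncate) and the precision
   exception mask in bit 5, hence the 0x0400, 0x0800, 0x0c00 and 0x0020
   constants used below.  */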
15524
15525 void
15526 emit_i387_cw_initialization (int mode)
15527 {
15528 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15529 rtx new_mode;
15530
15531 enum ix86_stack_slot slot;
15532
15533 rtx reg = gen_reg_rtx (HImode);
15534
15535 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15536 emit_move_insn (reg, copy_rtx (stored_mode));
15537
15538 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15539 || optimize_function_for_size_p (cfun))
15540 {
15541 switch (mode)
15542 {
15543 case I387_CW_TRUNC:
15544 /* round toward zero (truncate) */
15545 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15546 slot = SLOT_CW_TRUNC;
15547 break;
15548
15549 case I387_CW_FLOOR:
15550 /* round down toward -oo */
15551 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15552 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15553 slot = SLOT_CW_FLOOR;
15554 break;
15555
15556 case I387_CW_CEIL:
15557 /* round up toward +oo */
15558 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15559 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15560 slot = SLOT_CW_CEIL;
15561 break;
15562
15563 case I387_CW_MASK_PM:
15564 /* mask precision exception for nearbyint() */
15565 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15566 slot = SLOT_CW_MASK_PM;
15567 break;
15568
15569 default:
15570 gcc_unreachable ();
15571 }
15572 }
15573 else
15574 {
15575 switch (mode)
15576 {
15577 case I387_CW_TRUNC:
15578 /* round toward zero (truncate) */
15579 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15580 slot = SLOT_CW_TRUNC;
15581 break;
15582
15583 case I387_CW_FLOOR:
15584 /* round down toward -oo */
15585 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15586 slot = SLOT_CW_FLOOR;
15587 break;
15588
15589 case I387_CW_CEIL:
15590 /* round up toward +oo */
15591 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15592 slot = SLOT_CW_CEIL;
15593 break;
15594
15595 case I387_CW_MASK_PM:
15596 /* mask precision exception for nearbyint() */
15597 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15598 slot = SLOT_CW_MASK_PM;
15599 break;
15600
15601 default:
15602 gcc_unreachable ();
15603 }
15604 }
15605
15606 gcc_assert (slot < MAX_386_STACK_LOCALS);
15607
15608 new_mode = assign_386_stack_local (HImode, slot);
15609 emit_move_insn (new_mode, reg);
15610 }
15611
15612 /* Output code for INSN to convert a float to a signed int. OPERANDS
15613 are the insn operands. The output may be [HSD]Imode and the input
15614 operand may be [SDX]Fmode. */
15615
15616 const char *
15617 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15618 {
15619 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15620 int dimode_p = GET_MODE (operands[0]) == DImode;
15621 int round_mode = get_attr_i387_cw (insn);
15622
15623 /* Jump through a hoop or two for DImode, since the hardware has no
15624 non-popping instruction. We used to do this a different way, but
15625 that was somewhat fragile and broke with post-reload splitters. */
15626 if ((dimode_p || fisttp) && !stack_top_dies)
15627 output_asm_insn ("fld\t%y1", operands);
15628
15629 gcc_assert (STACK_TOP_P (operands[1]));
15630 gcc_assert (MEM_P (operands[0]));
15631 gcc_assert (GET_MODE (operands[1]) != TFmode);
15632
15633 if (fisttp)
15634 output_asm_insn ("fisttp%Z0\t%0", operands);
15635 else
15636 {
15637 if (round_mode != I387_CW_ANY)
15638 output_asm_insn ("fldcw\t%3", operands);
15639 if (stack_top_dies || dimode_p)
15640 output_asm_insn ("fistp%Z0\t%0", operands);
15641 else
15642 output_asm_insn ("fist%Z0\t%0", operands);
15643 if (round_mode != I387_CW_ANY)
15644 output_asm_insn ("fldcw\t%2", operands);
15645 }
15646
15647 return "";
15648 }
15649
15650 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15651 have the values zero or one, indicates the ffreep insn's operand
15652 from the OPERANDS array. */
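/* Without assembler support, ffreep %st(N) is emitted as raw bytes:
   the ASM_SHORT value 0xc<N>df stored little-endian yields the two-byte
   encoding 0xdf 0xc0+N.  */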
15653
15654 static const char *
15655 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15656 {
15657 if (TARGET_USE_FFREEP)
15658 #ifdef HAVE_AS_IX86_FFREEP
15659 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15660 #else
15661 {
15662 static char retval[32];
15663 int regno = REGNO (operands[opno]);
15664
15665 gcc_assert (STACK_REGNO_P (regno));
15666
15667 regno -= FIRST_STACK_REG;
15668
15669 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15670 return retval;
15671 }
15672 #endif
15673
15674 return opno ? "fstp\t%y1" : "fstp\t%y0";
15675 }
15676
15677
15678 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15679 should be used. UNORDERED_P is true when fucom should be used. */
15680
15681 const char *
15682 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15683 {
15684 int stack_top_dies;
15685 rtx cmp_op0, cmp_op1;
15686 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15687
15688 if (eflags_p)
15689 {
15690 cmp_op0 = operands[0];
15691 cmp_op1 = operands[1];
15692 }
15693 else
15694 {
15695 cmp_op0 = operands[1];
15696 cmp_op1 = operands[2];
15697 }
15698
15699 if (is_sse)
15700 {
15701 if (GET_MODE (operands[0]) == SFmode)
15702 if (unordered_p)
15703 return "%vucomiss\t{%1, %0|%0, %1}";
15704 else
15705 return "%vcomiss\t{%1, %0|%0, %1}";
15706 else
15707 if (unordered_p)
15708 return "%vucomisd\t{%1, %0|%0, %1}";
15709 else
15710 return "%vcomisd\t{%1, %0|%0, %1}";
15711 }
15712
15713 gcc_assert (STACK_TOP_P (cmp_op0));
15714
15715 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15716
15717 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15718 {
15719 if (stack_top_dies)
15720 {
15721 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15722 return output_387_ffreep (operands, 1);
15723 }
15724 else
15725 return "ftst\n\tfnstsw\t%0";
15726 }
15727
15728 if (STACK_REG_P (cmp_op1)
15729 && stack_top_dies
15730 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15731 && REGNO (cmp_op1) != FIRST_STACK_REG)
15732 {
15733 /* If both the top of the 387 stack and the other operand, which is
15734 also a stack register, die, then this must be an
15735 `fcompp' float compare. */
15736
15737 if (eflags_p)
15738 {
15739 /* There is no double popping fcomi variant. Fortunately,
15740 eflags is immune from the fstp's cc clobbering. */
15741 if (unordered_p)
15742 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15743 else
15744 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15745 return output_387_ffreep (operands, 0);
15746 }
15747 else
15748 {
15749 if (unordered_p)
15750 return "fucompp\n\tfnstsw\t%0";
15751 else
15752 return "fcompp\n\tfnstsw\t%0";
15753 }
15754 }
15755 else
15756 {
15757 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15758
15759 static const char * const alt[16] =
15760 {
15761 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15762 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15763 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15764 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15765
15766 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15767 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15768 NULL,
15769 NULL,
15770
15771 "fcomi\t{%y1, %0|%0, %y1}",
15772 "fcomip\t{%y1, %0|%0, %y1}",
15773 "fucomi\t{%y1, %0|%0, %y1}",
15774 "fucomip\t{%y1, %0|%0, %y1}",
15775
15776 NULL,
15777 NULL,
15778 NULL,
15779 NULL
15780 };
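/* For example, a popping fcomi on an FP operand whose stack top dies
   gives mask (1 << 3) | 0 | 0 | 1 = 9, selecting "fcomip" above.  */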
15781
15782 int mask;
15783 const char *ret;
15784
15785 mask = eflags_p << 3;
15786 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15787 mask |= unordered_p << 1;
15788 mask |= stack_top_dies;
15789
15790 gcc_assert (mask < 16);
15791 ret = alt[mask];
15792 gcc_assert (ret);
15793
15794 return ret;
15795 }
15796 }
15797
15798 void
15799 ix86_output_addr_vec_elt (FILE *file, int value)
15800 {
15801 const char *directive = ASM_LONG;
15802
15803 #ifdef ASM_QUAD
15804 if (TARGET_LP64)
15805 directive = ASM_QUAD;
15806 #else
15807 gcc_assert (!TARGET_64BIT);
15808 #endif
15809
15810 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15811 }
15812
15813 void
15814 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15815 {
15816 const char *directive = ASM_LONG;
15817
15818 #ifdef ASM_QUAD
15819 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15820 directive = ASM_QUAD;
15821 #else
15822 gcc_assert (!TARGET_64BIT);
15823 #endif
15824 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15825 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15826 fprintf (file, "%s%s%d-%s%d\n",
15827 directive, LPREFIX, value, LPREFIX, rel);
15828 else if (HAVE_AS_GOTOFF_IN_DATA)
15829 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15830 #if TARGET_MACHO
15831 else if (TARGET_MACHO)
15832 {
15833 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15834 machopic_output_function_base_name (file);
15835 putc ('\n', file);
15836 }
15837 #endif
15838 else
15839 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15840 GOT_SYMBOL_NAME, LPREFIX, value);
15841 }
15842 \f
15843 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15844 for the target. */
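/* The xor form is usually preferred because it is shorter, but it
   clobbers the flags, so a CLOBBER of FLAGS_REG is attached below
   whenever that form may be chosen.  */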
15845
15846 void
15847 ix86_expand_clear (rtx dest)
15848 {
15849 rtx tmp;
15850
15851 /* We play register width games, which are only valid after reload. */
15852 gcc_assert (reload_completed);
15853
15854 /* Avoid HImode and its attendant prefix byte. */
15855 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15856 dest = gen_rtx_REG (SImode, REGNO (dest));
15857 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15858
15859 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15860 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15861 {
15862 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15863 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15864 }
15865
15866 emit_insn (tmp);
15867 }
15868
15869 /* X is an unchanging MEM. If it is a constant pool reference, return
15870 the constant pool rtx, else NULL. */
15871
15872 rtx
15873 maybe_get_pool_constant (rtx x)
15874 {
15875 x = ix86_delegitimize_address (XEXP (x, 0));
15876
15877 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15878 return get_pool_constant (x);
15879
15880 return NULL_RTX;
15881 }
15882
15883 void
15884 ix86_expand_move (enum machine_mode mode, rtx operands[])
15885 {
15886 rtx op0, op1;
15887 enum tls_model model;
15888
15889 op0 = operands[0];
15890 op1 = operands[1];
15891
15892 if (GET_CODE (op1) == SYMBOL_REF)
15893 {
15894 model = SYMBOL_REF_TLS_MODEL (op1);
15895 if (model)
15896 {
15897 op1 = legitimize_tls_address (op1, model, true);
15898 op1 = force_operand (op1, op0);
15899 if (op1 == op0)
15900 return;
15901 if (GET_MODE (op1) != mode)
15902 op1 = convert_to_mode (mode, op1, 1);
15903 }
15904 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15905 && SYMBOL_REF_DLLIMPORT_P (op1))
15906 op1 = legitimize_dllimport_symbol (op1, false);
15907 }
15908 else if (GET_CODE (op1) == CONST
15909 && GET_CODE (XEXP (op1, 0)) == PLUS
15910 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15911 {
15912 rtx addend = XEXP (XEXP (op1, 0), 1);
15913 rtx symbol = XEXP (XEXP (op1, 0), 0);
15914 rtx tmp = NULL;
15915
15916 model = SYMBOL_REF_TLS_MODEL (symbol);
15917 if (model)
15918 tmp = legitimize_tls_address (symbol, model, true);
15919 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15920 && SYMBOL_REF_DLLIMPORT_P (symbol))
15921 tmp = legitimize_dllimport_symbol (symbol, true);
15922
15923 if (tmp)
15924 {
15925 tmp = force_operand (tmp, NULL);
15926 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15927 op0, 1, OPTAB_DIRECT);
15928 if (tmp == op0)
15929 return;
15930 if (GET_MODE (tmp) != mode)
15931 op1 = convert_to_mode (mode, tmp, 1);
15932 }
15933 }
15934
15935 if ((flag_pic || MACHOPIC_INDIRECT)
15936 && symbolic_operand (op1, mode))
15937 {
15938 if (TARGET_MACHO && !TARGET_64BIT)
15939 {
15940 #if TARGET_MACHO
15941 /* dynamic-no-pic */
15942 if (MACHOPIC_INDIRECT)
15943 {
15944 rtx temp = ((reload_in_progress
15945 || ((op0 && REG_P (op0))
15946 && mode == Pmode))
15947 ? op0 : gen_reg_rtx (Pmode));
15948 op1 = machopic_indirect_data_reference (op1, temp);
15949 if (MACHOPIC_PURE)
15950 op1 = machopic_legitimize_pic_address (op1, mode,
15951 temp == op1 ? 0 : temp);
15952 }
15953 if (op0 != op1 && GET_CODE (op0) != MEM)
15954 {
15955 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15956 emit_insn (insn);
15957 return;
15958 }
15959 if (GET_CODE (op0) == MEM)
15960 op1 = force_reg (Pmode, op1);
15961 else
15962 {
15963 rtx temp = op0;
15964 if (GET_CODE (temp) != REG)
15965 temp = gen_reg_rtx (Pmode);
15966 temp = legitimize_pic_address (op1, temp);
15967 if (temp == op0)
15968 return;
15969 op1 = temp;
15970 }
15971 /* dynamic-no-pic */
15972 #endif
15973 }
15974 else
15975 {
15976 if (MEM_P (op0))
15977 op1 = force_reg (mode, op1);
15978 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15979 {
15980 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15981 op1 = legitimize_pic_address (op1, reg);
15982 if (op0 == op1)
15983 return;
15984 if (GET_MODE (op1) != mode)
15985 op1 = convert_to_mode (mode, op1, 1);
15986 }
15987 }
15988 }
15989 else
15990 {
15991 if (MEM_P (op0)
15992 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15993 || !push_operand (op0, mode))
15994 && MEM_P (op1))
15995 op1 = force_reg (mode, op1);
15996
15997 if (push_operand (op0, mode)
15998 && ! general_no_elim_operand (op1, mode))
15999 op1 = copy_to_mode_reg (mode, op1);
16000
16001 /* Force large constants in 64bit compilation into a register
16002 to get them CSEed. */
16003 if (can_create_pseudo_p ()
16004 && (mode == DImode) && TARGET_64BIT
16005 && immediate_operand (op1, mode)
16006 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16007 && !register_operand (op0, mode)
16008 && optimize)
16009 op1 = copy_to_mode_reg (mode, op1);
16010
16011 if (can_create_pseudo_p ()
16012 && FLOAT_MODE_P (mode)
16013 && GET_CODE (op1) == CONST_DOUBLE)
16014 {
16015 /* If we are loading a floating point constant to a register,
16016 force the value to memory now, since we'll get better code
16017 out of the back end. */
16018
16019 op1 = validize_mem (force_const_mem (mode, op1));
16020 if (!register_operand (op0, mode))
16021 {
16022 rtx temp = gen_reg_rtx (mode);
16023 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16024 emit_move_insn (op0, temp);
16025 return;
16026 }
16027 }
16028 }
16029
16030 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16031 }
16032
16033 void
16034 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16035 {
16036 rtx op0 = operands[0], op1 = operands[1];
16037 unsigned int align = GET_MODE_ALIGNMENT (mode);
16038
16039 /* Force constants other than zero into memory. We do not know how
16040 the instructions used to build constants modify the upper 64 bits
16041 of the register; once we have that information we may be able
16042 to handle some of them more efficiently. */
16043 if (can_create_pseudo_p ()
16044 && register_operand (op0, mode)
16045 && (CONSTANT_P (op1)
16046 || (GET_CODE (op1) == SUBREG
16047 && CONSTANT_P (SUBREG_REG (op1))))
16048 && !standard_sse_constant_p (op1))
16049 op1 = validize_mem (force_const_mem (mode, op1));
16050
16051 /* We need to check memory alignment for SSE mode since attributes
16052 can make operands unaligned. */
16053 if (can_create_pseudo_p ()
16054 && SSE_REG_MODE_P (mode)
16055 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16056 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16057 {
16058 rtx tmp[2];
16059
16060 /* ix86_expand_vector_move_misalign() does not like constants ... */
16061 if (CONSTANT_P (op1)
16062 || (GET_CODE (op1) == SUBREG
16063 && CONSTANT_P (SUBREG_REG (op1))))
16064 op1 = validize_mem (force_const_mem (mode, op1));
16065
16066 /* ... nor both arguments in memory. */
16067 if (!register_operand (op0, mode)
16068 && !register_operand (op1, mode))
16069 op1 = force_reg (mode, op1);
16070
16071 tmp[0] = op0; tmp[1] = op1;
16072 ix86_expand_vector_move_misalign (mode, tmp);
16073 return;
16074 }
16075
16076 /* Make operand1 a register if it isn't already. */
16077 if (can_create_pseudo_p ()
16078 && !register_operand (op0, mode)
16079 && !register_operand (op1, mode))
16080 {
16081 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16082 return;
16083 }
16084
16085 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16086 }
16087
16088 /* Split 32-byte AVX unaligned load and store if needed. */
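/* When splitting is enabled, an unaligned 256-bit load is done as two
   128-bit loads combined with VEC_CONCAT (typically a vinsertf128), and
   an unaligned store as two vextractf128 stores of the low and high
   halves.  */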
16089
16090 static void
16091 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16092 {
16093 rtx m;
16094 rtx (*extract) (rtx, rtx, rtx);
16095 rtx (*load_unaligned) (rtx, rtx);
16096 rtx (*store_unaligned) (rtx, rtx);
16097 enum machine_mode mode;
16098
16099 switch (GET_MODE (op0))
16100 {
16101 default:
16102 gcc_unreachable ();
16103 case V32QImode:
16104 extract = gen_avx_vextractf128v32qi;
16105 load_unaligned = gen_avx_loaddqu256;
16106 store_unaligned = gen_avx_storedqu256;
16107 mode = V16QImode;
16108 break;
16109 case V8SFmode:
16110 extract = gen_avx_vextractf128v8sf;
16111 load_unaligned = gen_avx_loadups256;
16112 store_unaligned = gen_avx_storeups256;
16113 mode = V4SFmode;
16114 break;
16115 case V4DFmode:
16116 extract = gen_avx_vextractf128v4df;
16117 load_unaligned = gen_avx_loadupd256;
16118 store_unaligned = gen_avx_storeupd256;
16119 mode = V2DFmode;
16120 break;
16121 }
16122
16123 if (MEM_P (op1))
16124 {
16125 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16126 {
16127 rtx r = gen_reg_rtx (mode);
16128 m = adjust_address (op1, mode, 0);
16129 emit_move_insn (r, m);
16130 m = adjust_address (op1, mode, 16);
16131 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16132 emit_move_insn (op0, r);
16133 }
16134 else
16135 emit_insn (load_unaligned (op0, op1));
16136 }
16137 else if (MEM_P (op0))
16138 {
16139 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16140 {
16141 m = adjust_address (op0, mode, 0);
16142 emit_insn (extract (m, op1, const0_rtx));
16143 m = adjust_address (op0, mode, 16);
16144 emit_insn (extract (m, op1, const1_rtx));
16145 }
16146 else
16147 emit_insn (store_unaligned (op0, op1));
16148 }
16149 else
16150 gcc_unreachable ();
16151 }
16152
16153 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16154 straight to ix86_expand_vector_move. */
16155 /* Code generation for scalar reg-reg moves of single and double precision data:
16156 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16157 movaps reg, reg
16158 else
16159 movss reg, reg
16160 if (x86_sse_partial_reg_dependency == true)
16161 movapd reg, reg
16162 else
16163 movsd reg, reg
16164
16165 Code generation for scalar loads of double precision data:
16166 if (x86_sse_split_regs == true)
16167 movlpd mem, reg (gas syntax)
16168 else
16169 movsd mem, reg
16170
16171 Code generation for unaligned packed loads of single precision data
16172 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16173 if (x86_sse_unaligned_move_optimal)
16174 movups mem, reg
16175
16176 if (x86_sse_partial_reg_dependency == true)
16177 {
16178 xorps reg, reg
16179 movlps mem, reg
16180 movhps mem+8, reg
16181 }
16182 else
16183 {
16184 movlps mem, reg
16185 movhps mem+8, reg
16186 }
16187
16188 Code generation for unaligned packed loads of double precision data
16189 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16190 if (x86_sse_unaligned_move_optimal)
16191 movupd mem, reg
16192
16193 if (x86_sse_split_regs == true)
16194 {
16195 movlpd mem, reg
16196 movhpd mem+8, reg
16197 }
16198 else
16199 {
16200 movsd mem, reg
16201 movhpd mem+8, reg
16202 }
16203 */
16204
16205 void
16206 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16207 {
16208 rtx op0, op1, m;
16209
16210 op0 = operands[0];
16211 op1 = operands[1];
16212
16213 if (TARGET_AVX
16214 && GET_MODE_SIZE (mode) == 32)
16215 {
16216 switch (GET_MODE_CLASS (mode))
16217 {
16218 case MODE_VECTOR_INT:
16219 case MODE_INT:
16220 op0 = gen_lowpart (V32QImode, op0);
16221 op1 = gen_lowpart (V32QImode, op1);
16222 /* FALLTHRU */
16223
16224 case MODE_VECTOR_FLOAT:
16225 ix86_avx256_split_vector_move_misalign (op0, op1);
16226 break;
16227
16228 default:
16229 gcc_unreachable ();
16230 }
16231
16232 return;
16233 }
16234
16235 if (MEM_P (op1))
16236 {
16237 /* ??? If we have typed data, then it would appear that using
16238 movdqu is the only way to get unaligned data loaded with
16239 integer type. */
16240 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16241 {
16242 op0 = gen_lowpart (V16QImode, op0);
16243 op1 = gen_lowpart (V16QImode, op1);
16244 /* We will eventually emit movups based on insn attributes. */
16245 emit_insn (gen_sse2_loaddqu (op0, op1));
16246 }
16247 else if (TARGET_SSE2 && mode == V2DFmode)
16248 {
16249 rtx zero;
16250
16251 if (TARGET_AVX
16252 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16253 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16254 || optimize_function_for_size_p (cfun))
16255 {
16256 /* We will eventually emit movups based on insn attributes. */
16257 emit_insn (gen_sse2_loadupd (op0, op1));
16258 return;
16259 }
16260
16261 /* When SSE registers are split into halves, we can avoid
16262 writing to the top half twice. */
16263 if (TARGET_SSE_SPLIT_REGS)
16264 {
16265 emit_clobber (op0);
16266 zero = op0;
16267 }
16268 else
16269 {
16270 /* ??? Not sure about the best option for the Intel chips.
16271 The following would seem to satisfy; the register is
16272 entirely cleared, breaking the dependency chain. We
16273 then store to the upper half, with a dependency depth
16274 of one. A rumor has it that Intel recommends two movsd
16275 followed by an unpacklpd, but this is unconfirmed. And
16276 given that the dependency depth of the unpacklpd would
16277 still be one, I'm not sure why this would be better. */
16278 zero = CONST0_RTX (V2DFmode);
16279 }
16280
16281 m = adjust_address (op1, DFmode, 0);
16282 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16283 m = adjust_address (op1, DFmode, 8);
16284 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16285 }
16286 else
16287 {
16288 if (TARGET_AVX
16289 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16290 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16291 || optimize_function_for_size_p (cfun))
16292 {
16293 op0 = gen_lowpart (V4SFmode, op0);
16294 op1 = gen_lowpart (V4SFmode, op1);
16295 emit_insn (gen_sse_loadups (op0, op1));
16296 return;
16297 }
16298
16299 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16300 emit_move_insn (op0, CONST0_RTX (mode));
16301 else
16302 emit_clobber (op0);
16303
16304 if (mode != V4SFmode)
16305 op0 = gen_lowpart (V4SFmode, op0);
16306
16307 m = adjust_address (op1, V2SFmode, 0);
16308 emit_insn (gen_sse_loadlps (op0, op0, m));
16309 m = adjust_address (op1, V2SFmode, 8);
16310 emit_insn (gen_sse_loadhps (op0, op0, m));
16311 }
16312 }
16313 else if (MEM_P (op0))
16314 {
16315 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16316 {
16317 op0 = gen_lowpart (V16QImode, op0);
16318 op1 = gen_lowpart (V16QImode, op1);
16319 /* We will eventually emit movups based on insn attributes. */
16320 emit_insn (gen_sse2_storedqu (op0, op1));
16321 }
16322 else if (TARGET_SSE2 && mode == V2DFmode)
16323 {
16324 if (TARGET_AVX
16325 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16326 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16327 || optimize_function_for_size_p (cfun))
16328 /* We will eventually emit movups based on insn attributes. */
16329 emit_insn (gen_sse2_storeupd (op0, op1));
16330 else
16331 {
16332 m = adjust_address (op0, DFmode, 0);
16333 emit_insn (gen_sse2_storelpd (m, op1));
16334 m = adjust_address (op0, DFmode, 8);
16335 emit_insn (gen_sse2_storehpd (m, op1));
16336 }
16337 }
16338 else
16339 {
16340 if (mode != V4SFmode)
16341 op1 = gen_lowpart (V4SFmode, op1);
16342
16343 if (TARGET_AVX
16344 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16345 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16346 || optimize_function_for_size_p (cfun))
16347 {
16348 op0 = gen_lowpart (V4SFmode, op0);
16349 emit_insn (gen_sse_storeups (op0, op1));
16350 }
16351 else
16352 {
16353 m = adjust_address (op0, V2SFmode, 0);
16354 emit_insn (gen_sse_storelps (m, op1));
16355 m = adjust_address (op0, V2SFmode, 8);
16356 emit_insn (gen_sse_storehps (m, op1));
16357 }
16358 }
16359 }
16360 else
16361 gcc_unreachable ();
16362 }
16363
16364 /* Expand a push in MODE. This is some mode for which we do not support
16365 proper push instructions, at least from the registers that we expect
16366 the value to live in. */
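/* Roughly speaking, the expansion decrements the stack pointer explicitly
   and then stores through it, e.g. for a 16-byte SSE mode something like

	sub	$16, %esp
	movups	%xmm0, (%esp)

   may result (the exact store instruction depends on the mode).  */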
16367
16368 void
16369 ix86_expand_push (enum machine_mode mode, rtx x)
16370 {
16371 rtx tmp;
16372
16373 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16374 GEN_INT (-GET_MODE_SIZE (mode)),
16375 stack_pointer_rtx, 1, OPTAB_DIRECT);
16376 if (tmp != stack_pointer_rtx)
16377 emit_move_insn (stack_pointer_rtx, tmp);
16378
16379 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16380
16381 /* When we push an operand onto the stack, it has to be aligned at least
16382 at the function argument boundary. However, since we don't have
16383 the argument type, we can't determine the actual argument
16384 boundary. */
16385 emit_move_insn (tmp, x);
16386 }
16387
16388 /* Helper function of ix86_fixup_binary_operands to canonicalize
16389 operand order. Returns true if the operands should be swapped. */
16390
16391 static bool
16392 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16393 rtx operands[])
16394 {
16395 rtx dst = operands[0];
16396 rtx src1 = operands[1];
16397 rtx src2 = operands[2];
16398
16399 /* If the operation is not commutative, we can't do anything. */
16400 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16401 return false;
16402
16403 /* Highest priority is that src1 should match dst. */
16404 if (rtx_equal_p (dst, src1))
16405 return false;
16406 if (rtx_equal_p (dst, src2))
16407 return true;
16408
16409 /* Next highest priority is that immediate constants come second. */
16410 if (immediate_operand (src2, mode))
16411 return false;
16412 if (immediate_operand (src1, mode))
16413 return true;
16414
16415 /* Lowest priority is that memory references should come second. */
16416 if (MEM_P (src2))
16417 return false;
16418 if (MEM_P (src1))
16419 return true;
16420
16421 return false;
16422 }
16423
16424
16425 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16426 destination to use for the operation. If different from the true
16427 destination in operands[0], a copy operation will be required. */
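/* For example, given "mem1 = mem2 + mem3" this forces mem3 (and, since the
   destination then no longer matches the first source, mem2 as well) into
   registers and returns a fresh pseudo as the destination; the caller is
   then expected to copy that pseudo back into mem1.  */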
16428
16429 rtx
16430 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16431 rtx operands[])
16432 {
16433 rtx dst = operands[0];
16434 rtx src1 = operands[1];
16435 rtx src2 = operands[2];
16436
16437 /* Canonicalize operand order. */
16438 if (ix86_swap_binary_operands_p (code, mode, operands))
16439 {
16440 rtx temp;
16441
16442 /* It is invalid to swap operands of different modes. */
16443 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16444
16445 temp = src1;
16446 src1 = src2;
16447 src2 = temp;
16448 }
16449
16450 /* Both source operands cannot be in memory. */
16451 if (MEM_P (src1) && MEM_P (src2))
16452 {
16453 /* Optimization: Only read from memory once. */
16454 if (rtx_equal_p (src1, src2))
16455 {
16456 src2 = force_reg (mode, src2);
16457 src1 = src2;
16458 }
16459 else
16460 src2 = force_reg (mode, src2);
16461 }
16462
16463 /* If the destination is memory, and we do not have matching source
16464 operands, do things in registers. */
16465 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16466 dst = gen_reg_rtx (mode);
16467
16468 /* Source 1 cannot be a constant. */
16469 if (CONSTANT_P (src1))
16470 src1 = force_reg (mode, src1);
16471
16472 /* Source 1 cannot be a non-matching memory. */
16473 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16474 src1 = force_reg (mode, src1);
16475
16476 /* Improve address combine. */
16477 if (code == PLUS
16478 && GET_MODE_CLASS (mode) == MODE_INT
16479 && MEM_P (src2))
16480 src2 = force_reg (mode, src2);
16481
16482 operands[1] = src1;
16483 operands[2] = src2;
16484 return dst;
16485 }
16486
16487 /* Similarly, but assume that the destination has already been
16488 set up properly. */
16489
16490 void
16491 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16492 enum machine_mode mode, rtx operands[])
16493 {
16494 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16495 gcc_assert (dst == operands[0]);
16496 }
16497
16498 /* Attempt to expand a binary operator. Make the expansion closer to the
16499 actual machine, than just general_operand, which would allow 3 separate
16500 memory references (one output, two inputs) in a single insn. */
16501
16502 void
16503 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16504 rtx operands[])
16505 {
16506 rtx src1, src2, dst, op, clob;
16507
16508 dst = ix86_fixup_binary_operands (code, mode, operands);
16509 src1 = operands[1];
16510 src2 = operands[2];
16511
16512 /* Emit the instruction. */
16513
16514 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16515 if (reload_in_progress)
16516 {
16517 /* Reload doesn't know about the flags register, and doesn't know that
16518 it doesn't want to clobber it. We can only do this with PLUS. */
16519 gcc_assert (code == PLUS);
16520 emit_insn (op);
16521 }
16522 else if (reload_completed
16523 && code == PLUS
16524 && !rtx_equal_p (dst, src1))
16525 {
16526 /* This is going to be an LEA; avoid splitting it later. */
16527 emit_insn (op);
16528 }
16529 else
16530 {
16531 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16532 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16533 }
16534
16535 /* Fix up the destination if needed. */
16536 if (dst != operands[0])
16537 emit_move_insn (operands[0], dst);
16538 }
16539
16540 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16541 the given OPERANDS. */
16542
16543 void
16544 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16545 rtx operands[])
16546 {
16547 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16548 if (GET_CODE (operands[1]) == SUBREG)
16549 {
16550 op1 = operands[1];
16551 op2 = operands[2];
16552 }
16553 else if (GET_CODE (operands[2]) == SUBREG)
16554 {
16555 op1 = operands[2];
16556 op2 = operands[1];
16557 }
16558 /* Optimize (__m128i) d | (__m128i) e and similar code
16559 when d and e are float vectors into float vector logical
16560 insn. In C/C++ without using intrinsics there is no other way
16561 to express vector logical operation on float vectors than
16562 to cast them temporarily to integer vectors. */
16563 if (op1
16564 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16565 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16566 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16567 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16568 && SUBREG_BYTE (op1) == 0
16569 && (GET_CODE (op2) == CONST_VECTOR
16570 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16571 && SUBREG_BYTE (op2) == 0))
16572 && can_create_pseudo_p ())
16573 {
16574 rtx dst;
16575 switch (GET_MODE (SUBREG_REG (op1)))
16576 {
16577 case V4SFmode:
16578 case V8SFmode:
16579 case V2DFmode:
16580 case V4DFmode:
16581 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16582 if (GET_CODE (op2) == CONST_VECTOR)
16583 {
16584 op2 = gen_lowpart (GET_MODE (dst), op2);
16585 op2 = force_reg (GET_MODE (dst), op2);
16586 }
16587 else
16588 {
16589 op1 = operands[1];
16590 op2 = SUBREG_REG (operands[2]);
16591 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16592 op2 = force_reg (GET_MODE (dst), op2);
16593 }
16594 op1 = SUBREG_REG (op1);
16595 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16596 op1 = force_reg (GET_MODE (dst), op1);
16597 emit_insn (gen_rtx_SET (VOIDmode, dst,
16598 gen_rtx_fmt_ee (code, GET_MODE (dst),
16599 op1, op2)));
16600 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16601 return;
16602 default:
16603 break;
16604 }
16605 }
16606 if (!nonimmediate_operand (operands[1], mode))
16607 operands[1] = force_reg (mode, operands[1]);
16608 if (!nonimmediate_operand (operands[2], mode))
16609 operands[2] = force_reg (mode, operands[2]);
16610 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16611 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16612 gen_rtx_fmt_ee (code, mode, operands[1],
16613 operands[2])));
16614 }
16615
16616 /* Return TRUE or FALSE depending on whether the binary operator meets the
16617 appropriate constraints. */
16618
16619 bool
16620 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16621 rtx operands[3])
16622 {
16623 rtx dst = operands[0];
16624 rtx src1 = operands[1];
16625 rtx src2 = operands[2];
16626
16627 /* Both source operands cannot be in memory. */
16628 if (MEM_P (src1) && MEM_P (src2))
16629 return false;
16630
16631 /* Canonicalize operand order for commutative operators. */
16632 if (ix86_swap_binary_operands_p (code, mode, operands))
16633 {
16634 rtx temp = src1;
16635 src1 = src2;
16636 src2 = temp;
16637 }
16638
16639 /* If the destination is memory, we must have a matching source operand. */
16640 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16641 return false;
16642
16643 /* Source 1 cannot be a constant. */
16644 if (CONSTANT_P (src1))
16645 return false;
16646
16647 /* Source 1 cannot be a non-matching memory. */
16648 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16649 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16650 return (code == AND
16651 && (mode == HImode
16652 || mode == SImode
16653 || (TARGET_64BIT && mode == DImode))
16654 && satisfies_constraint_L (src2));
16655
16656 return true;
16657 }
16658
16659 /* Attempt to expand a unary operator. Make the expansion closer to the
16660 actual machine, than just general_operand, which would allow 2 separate
16661 memory references (one output, one input) in a single insn. */
16662
16663 void
16664 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16665 rtx operands[])
16666 {
16667 int matching_memory;
16668 rtx src, dst, op, clob;
16669
16670 dst = operands[0];
16671 src = operands[1];
16672
16673 /* If the destination is memory, and we do not have matching source
16674 operands, do things in registers. */
16675 matching_memory = 0;
16676 if (MEM_P (dst))
16677 {
16678 if (rtx_equal_p (dst, src))
16679 matching_memory = 1;
16680 else
16681 dst = gen_reg_rtx (mode);
16682 }
16683
16684 /* When source operand is memory, destination must match. */
16685 if (MEM_P (src) && !matching_memory)
16686 src = force_reg (mode, src);
16687
16688 /* Emit the instruction. */
16689
16690 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16691 if (reload_in_progress || code == NOT)
16692 {
16693 /* Reload doesn't know about the flags register, and doesn't know that
16694 it doesn't want to clobber it. */
16695 gcc_assert (code == NOT);
16696 emit_insn (op);
16697 }
16698 else
16699 {
16700 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16701 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16702 }
16703
16704 /* Fix up the destination if needed. */
16705 if (dst != operands[0])
16706 emit_move_insn (operands[0], dst);
16707 }
16708
16709 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16710 divisor are within the range [0-255]. */
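/* The generated code roughly has the following shape (the exact
   instructions depend on the mode):

	mov	dividend, scratch
	or	divisor, scratch
	test	$-0x100, scratch	; any bits above the low 8 set?
	je	.Lqimode
	<full-width signed/unsigned divmod>
	jmp	.Ldone
   .Lqimode:
	<8-bit unsigned divide on the HImode scratch>
	<quotient zero-extended from AL, remainder extracted from AH>
   .Ldone:  */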
16711
16712 void
16713 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16714 bool signed_p)
16715 {
16716 rtx end_label, qimode_label;
16717 rtx insn, div, mod;
16718 rtx scratch, tmp0, tmp1, tmp2;
16719 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16720 rtx (*gen_zero_extend) (rtx, rtx);
16721 rtx (*gen_test_ccno_1) (rtx, rtx);
16722
16723 switch (mode)
16724 {
16725 case SImode:
16726 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16727 gen_test_ccno_1 = gen_testsi_ccno_1;
16728 gen_zero_extend = gen_zero_extendqisi2;
16729 break;
16730 case DImode:
16731 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16732 gen_test_ccno_1 = gen_testdi_ccno_1;
16733 gen_zero_extend = gen_zero_extendqidi2;
16734 break;
16735 default:
16736 gcc_unreachable ();
16737 }
16738
16739 end_label = gen_label_rtx ();
16740 qimode_label = gen_label_rtx ();
16741
16742 scratch = gen_reg_rtx (mode);
16743
16744 /* Use 8bit unsigned divmod if dividend and divisor are within
16745 the range [0-255]. */
16746 emit_move_insn (scratch, operands[2]);
16747 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16748 scratch, 1, OPTAB_DIRECT);
16749 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16750 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16751 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16752 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16753 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16754 pc_rtx);
16755 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16756 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16757 JUMP_LABEL (insn) = qimode_label;
16758
16759 /* Generate the original signed/unsigned divmod. */
16760 div = gen_divmod4_1 (operands[0], operands[1],
16761 operands[2], operands[3]);
16762 emit_insn (div);
16763
16764 /* Branch to the end. */
16765 emit_jump_insn (gen_jump (end_label));
16766 emit_barrier ();
16767
16768 /* Generate 8bit unsigned divide. */
16769 emit_label (qimode_label);
16770 /* Don't use operands[0] for result of 8bit divide since not all
16771 registers support QImode ZERO_EXTRACT. */
16772 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16773 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16774 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16775 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16776
16777 if (signed_p)
16778 {
16779 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16780 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16781 }
16782 else
16783 {
16784 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16785 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16786 }
16787
16788 /* Extract remainder from AH. */
16789 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16790 if (REG_P (operands[1]))
16791 insn = emit_move_insn (operands[1], tmp1);
16792 else
16793 {
16794 /* Need a new scratch register since the old one has result
16795 of 8bit divide. */
16796 scratch = gen_reg_rtx (mode);
16797 emit_move_insn (scratch, tmp1);
16798 insn = emit_move_insn (operands[1], scratch);
16799 }
16800 set_unique_reg_note (insn, REG_EQUAL, mod);
16801
16802 /* Zero extend quotient from AL. */
16803 tmp1 = gen_lowpart (QImode, tmp0);
16804 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16805 set_unique_reg_note (insn, REG_EQUAL, div);
16806
16807 emit_label (end_label);
16808 }
16809
16810 #define LEA_MAX_STALL (3)
16811 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16812
16813 /* Increase the given DISTANCE in half-cycles according to
16814 dependencies between the PREV and NEXT instructions.
16815 Add 1 half-cycle if there is no dependency and
16816 go to the next cycle if there is a dependency. */
16817
16818 static unsigned int
16819 increase_distance (rtx prev, rtx next, unsigned int distance)
16820 {
16821 df_ref *use_rec;
16822 df_ref *def_rec;
16823
16824 if (!prev || !next)
16825 return distance + (distance & 1) + 2;
16826
16827 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16828 return distance + 1;
16829
16830 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16831 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16832 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16833 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16834 return distance + (distance & 1) + 2;
16835
16836 return distance + 1;
16837 }
16838
16839 /* Function checks if instruction INSN defines register number
16840 REGNO1 or REGNO2. */
16841
16842 static bool
16843 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16844 rtx insn)
16845 {
16846 df_ref *def_rec;
16847
16848 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16849 if (DF_REF_REG_DEF_P (*def_rec)
16850 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16851 && (regno1 == DF_REF_REGNO (*def_rec)
16852 || regno2 == DF_REF_REGNO (*def_rec)))
16853 {
16854 return true;
16855 }
16856
16857 return false;
16858 }
16859
16860 /* Function checks if instruction INSN uses register number
16861 REGNO as a part of address expression. */
16862
16863 static bool
16864 insn_uses_reg_mem (unsigned int regno, rtx insn)
16865 {
16866 df_ref *use_rec;
16867
16868 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16869 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16870 return true;
16871
16872 return false;
16873 }
16874
16875 /* Search backward for non-agu definition of register number REGNO1
16876 or register number REGNO2 in basic block starting from instruction
16877 START up to head of basic block or instruction INSN.
16878
16879 Function puts true value into *FOUND var if definition was found
16880 and false otherwise.
16881
16882 Distance in half-cycles between START and found instruction or head
16883 of BB is added to DISTANCE and returned. */
16884
16885 static int
16886 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16887 rtx insn, int distance,
16888 rtx start, bool *found)
16889 {
16890 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16891 rtx prev = start;
16892 rtx next = NULL;
16893
16894 *found = false;
16895
16896 while (prev
16897 && prev != insn
16898 && distance < LEA_SEARCH_THRESHOLD)
16899 {
16900 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16901 {
16902 distance = increase_distance (prev, next, distance);
16903 if (insn_defines_reg (regno1, regno2, prev))
16904 {
16905 if (recog_memoized (prev) < 0
16906 || get_attr_type (prev) != TYPE_LEA)
16907 {
16908 *found = true;
16909 return distance;
16910 }
16911 }
16912
16913 next = prev;
16914 }
16915 if (prev == BB_HEAD (bb))
16916 break;
16917
16918 prev = PREV_INSN (prev);
16919 }
16920
16921 return distance;
16922 }
16923
16924 /* Search backward for non-agu definition of register number REGNO1
16925 or register number REGNO2 in INSN's basic block until
16926 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16927 2. Reach neighbour BBs boundary, or
16928 3. Reach agu definition.
16929 Returns the distance between the non-agu definition point and INSN.
16930 If no definition point, returns -1. */
16931
16932 static int
16933 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16934 rtx insn)
16935 {
16936 basic_block bb = BLOCK_FOR_INSN (insn);
16937 int distance = 0;
16938 bool found = false;
16939
16940 if (insn != BB_HEAD (bb))
16941 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16942 distance, PREV_INSN (insn),
16943 &found);
16944
16945 if (!found && distance < LEA_SEARCH_THRESHOLD)
16946 {
16947 edge e;
16948 edge_iterator ei;
16949 bool simple_loop = false;
16950
16951 FOR_EACH_EDGE (e, ei, bb->preds)
16952 if (e->src == bb)
16953 {
16954 simple_loop = true;
16955 break;
16956 }
16957
16958 if (simple_loop)
16959 distance = distance_non_agu_define_in_bb (regno1, regno2,
16960 insn, distance,
16961 BB_END (bb), &found);
16962 else
16963 {
16964 int shortest_dist = -1;
16965 bool found_in_bb = false;
16966
16967 FOR_EACH_EDGE (e, ei, bb->preds)
16968 {
16969 int bb_dist
16970 = distance_non_agu_define_in_bb (regno1, regno2,
16971 insn, distance,
16972 BB_END (e->src),
16973 &found_in_bb);
16974 if (found_in_bb)
16975 {
16976 if (shortest_dist < 0)
16977 shortest_dist = bb_dist;
16978 else if (bb_dist > 0)
16979 shortest_dist = MIN (bb_dist, shortest_dist);
16980
16981 found = true;
16982 }
16983 }
16984
16985 distance = shortest_dist;
16986 }
16987 }
16988
16989 /* get_attr_type may modify recog data. We want to make sure
16990 that recog data is valid for instruction INSN, on which
16991 distance_non_agu_define is called. INSN is unchanged here. */
16992 extract_insn_cached (insn);
16993
16994 if (!found)
16995 return -1;
16996
16997 return distance >> 1;
16998 }
16999
17000 /* Return the distance in half-cycles between INSN and the next
17001 insn that uses register number REGNO in a memory address, added
17002 to DISTANCE. Return -1 if REGNO is set.
17003
17004 Put a true value into *FOUND if a register usage was found and
17005 false otherwise.
17006 Put a true value into *REDEFINED if a register redefinition was
17007 found and false otherwise. */
17008
17009 static int
17010 distance_agu_use_in_bb (unsigned int regno,
17011 rtx insn, int distance, rtx start,
17012 bool *found, bool *redefined)
17013 {
17014 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17015 rtx next = start;
17016 rtx prev = NULL;
17017
17018 *found = false;
17019 *redefined = false;
17020
17021 while (next
17022 && next != insn
17023 && distance < LEA_SEARCH_THRESHOLD)
17024 {
17025 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17026 {
17027 distance = increase_distance (prev, next, distance);
17028 if (insn_uses_reg_mem (regno, next))
17029 {
17030 /* Return DISTANCE if OP0 is used in memory
17031 address in NEXT. */
17032 *found = true;
17033 return distance;
17034 }
17035
17036 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17037 {
17038 /* Return -1 if OP0 is set in NEXT. */
17039 *redefined = true;
17040 return -1;
17041 }
17042
17043 prev = next;
17044 }
17045
17046 if (next == BB_END (bb))
17047 break;
17048
17049 next = NEXT_INSN (next);
17050 }
17051
17052 return distance;
17053 }
17054
17055 /* Return the distance between INSN and the next insn that uses
17056 register number REGNO0 in a memory address. Return -1 if no such
17057 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17058
17059 static int
17060 distance_agu_use (unsigned int regno0, rtx insn)
17061 {
17062 basic_block bb = BLOCK_FOR_INSN (insn);
17063 int distance = 0;
17064 bool found = false;
17065 bool redefined = false;
17066
17067 if (insn != BB_END (bb))
17068 distance = distance_agu_use_in_bb (regno0, insn, distance,
17069 NEXT_INSN (insn),
17070 &found, &redefined);
17071
17072 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17073 {
17074 edge e;
17075 edge_iterator ei;
17076 bool simple_loop = false;
17077
17078 FOR_EACH_EDGE (e, ei, bb->succs)
17079 if (e->dest == bb)
17080 {
17081 simple_loop = true;
17082 break;
17083 }
17084
17085 if (simple_loop)
17086 distance = distance_agu_use_in_bb (regno0, insn,
17087 distance, BB_HEAD (bb),
17088 &found, &redefined);
17089 else
17090 {
17091 int shortest_dist = -1;
17092 bool found_in_bb = false;
17093 bool redefined_in_bb = false;
17094
17095 FOR_EACH_EDGE (e, ei, bb->succs)
17096 {
17097 int bb_dist
17098 = distance_agu_use_in_bb (regno0, insn,
17099 distance, BB_HEAD (e->dest),
17100 &found_in_bb, &redefined_in_bb);
17101 if (found_in_bb)
17102 {
17103 if (shortest_dist < 0)
17104 shortest_dist = bb_dist;
17105 else if (bb_dist > 0)
17106 shortest_dist = MIN (bb_dist, shortest_dist);
17107
17108 found = true;
17109 }
17110 }
17111
17112 distance = shortest_dist;
17113 }
17114 }
17115
17116 if (!found || redefined)
17117 return -1;
17118
17119 return distance >> 1;
17120 }
17121
17122 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17123 there is a dilemma of choosing LEA or ADD.
17124 Negative value: ADD is preferred over LEA.
17125 Zero: Neutral.
17126 Positive value: LEA is preferred over ADD. */
17127 #define IX86_LEA_PRIORITY 0
17128
17129 /* Return true if using the lea INSN has a performance advantage
17130 over a sequence of instructions. The instruction sequence has
17131 SPLIT_COST cycles higher latency than the lea itself. */
17132
17133 static bool
17134 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17135 unsigned int regno2, int split_cost)
17136 {
17137 int dist_define, dist_use;
17138
17139 dist_define = distance_non_agu_define (regno1, regno2, insn);
17140 dist_use = distance_agu_use (regno0, insn);
17141
17142 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17143 {
17144 /* If there is no non-AGU operand definition, no AGU
17145 operand usage and the split cost is 0, then both the lea
17146 and non-lea variants have the same priority. Currently
17147 we prefer lea for 64-bit code and non-lea for 32-bit
17148 code. */
17149 if (dist_use < 0 && split_cost == 0)
17150 return TARGET_64BIT || IX86_LEA_PRIORITY;
17151 else
17152 return true;
17153 }
17154
17155 /* The longer the distance to the definition, the more preferable lea is.
17156 Here we adjust it to take into account the splitting cost and
17157 lea priority. */
17158 dist_define += split_cost + IX86_LEA_PRIORITY;
17159
17160 /* If there is no use in a memory address then we just check
17161 that the split cost exceeds the AGU stall. */
17162 if (dist_use < 0)
17163 return dist_define > LEA_MAX_STALL;
17164
17165 /* If this insn has both a backward non-AGU dependence and a forward
17166 AGU dependence, the one with the shorter distance takes effect. */
17167 return dist_define >= dist_use;
17168 }
17169
17170 /* Return true if it is legal to clobber flags by INSN and
17171 false otherwise. */
17172
17173 static bool
17174 ix86_ok_to_clobber_flags (rtx insn)
17175 {
17176 basic_block bb = BLOCK_FOR_INSN (insn);
17177 df_ref *use;
17178 bitmap live;
17179
17180 while (insn)
17181 {
17182 if (NONDEBUG_INSN_P (insn))
17183 {
17184 for (use = DF_INSN_USES (insn); *use; use++)
17185 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17186 return false;
17187
17188 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17189 return true;
17190 }
17191
17192 if (insn == BB_END (bb))
17193 break;
17194
17195 insn = NEXT_INSN (insn);
17196 }
17197
17198 live = df_get_live_out (bb);
17199 return !REGNO_REG_SET_P (live, FLAGS_REG);
17200 }
17201
17202 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17203 move and add to avoid AGU stalls. */
17204
17205 bool
17206 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17207 {
17208 unsigned int regno0, regno1, regno2;
17209
17210 /* Check if we need to optimize. */
17211 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17212 return false;
17213
17214 /* Check it is correct to split here. */
17215 if (!ix86_ok_to_clobber_flags (insn))
17216 return false;
17217
17218 regno0 = true_regnum (operands[0]);
17219 regno1 = true_regnum (operands[1]);
17220 regno2 = true_regnum (operands[2]);
17221
17222 /* We need to split only adds with a non-destructive
17223 destination operand. */
17224 if (regno0 == regno1 || regno0 == regno2)
17225 return false;
17226 else
17227 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17228 }
17229
17230 /* Return true if we should emit a lea instruction instead of a mov
17231 instruction. */
17232
17233 bool
17234 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17235 {
17236 unsigned int regno0, regno1;
17237
17238 /* Check if we need to optimize. */
17239 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17240 return false;
17241
17242 /* Use lea for reg to reg moves only. */
17243 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17244 return false;
17245
17246 regno0 = true_regnum (operands[0]);
17247 regno1 = true_regnum (operands[1]);
17248
17249 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17250 }
17251
17252 /* Return true if we need to split lea into a sequence of
17253 instructions to avoid AGU stalls. */
17254
17255 bool
17256 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17257 {
17258 unsigned int regno0, regno1, regno2;
17259 int split_cost;
17260 struct ix86_address parts;
17261 int ok;
17262
17263 /* Check we need to optimize. */
17264 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17265 return false;
17266
17267 /* Check it is correct to split here. */
17268 if (!ix86_ok_to_clobber_flags (insn))
17269 return false;
17270
17271 ok = ix86_decompose_address (operands[1], &parts);
17272 gcc_assert (ok);
17273
17274 /* There should be at least two components in the address. */
17275 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17276 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17277 return false;
17278
17279 /* We should not split into an add if a non-legitimate PIC
17280 operand is used as the displacement. */
17281 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17282 return false;
17283
17284 regno0 = true_regnum (operands[0]);
17285 regno1 = INVALID_REGNUM;
17286 regno2 = INVALID_REGNUM;
17287
17288 if (parts.base)
17289 regno1 = true_regnum (parts.base);
17290 if (parts.index)
17291 regno2 = true_regnum (parts.index);
17292
17293 split_cost = 0;
17294
17295 /* Compute how many cycles we will add to the execution time
17296 if we split the lea into a sequence of instructions. */
17297 if (parts.base || parts.index)
17298 {
17299 /* Have to use a mov instruction if the non-destructive
17300 destination form is used. */
17301 if (regno1 != regno0 && regno2 != regno0)
17302 split_cost += 1;
17303
17304 /* Have to add index to base if both exist. */
17305 if (parts.base && parts.index)
17306 split_cost += 1;
17307
17308 /* Have to use shift and adds if scale is 2 or greater. */
17309 if (parts.scale > 1)
17310 {
17311 if (regno0 != regno1)
17312 split_cost += 1;
17313 else if (regno2 == regno0)
17314 split_cost += 4;
17315 else
17316 split_cost += parts.scale;
17317 }
17318
17319 /* Have to use add instruction with immediate if
17320 disp is non zero. */
17321 if (parts.disp && parts.disp != const0_rtx)
17322 split_cost += 1;
17323
17324 /* Subtract the price of lea. */
17325 split_cost -= 1;
17326 }
17327
17328 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17329 }
17330
17331 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
17332 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
17333
17334 static void
17335 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17336 rtx dst, rtx src)
17337 {
17338 rtx op, clob;
17339
17340 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17341 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17342
17343 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17344 }
17345
17346 /* Return true if regno1 def is nearest to the insn. */
17347
17348 static bool
17349 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17350 {
17351 rtx prev = insn;
17352 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17353
17354 if (insn == start)
17355 return false;
17356 while (prev && prev != start)
17357 {
17358 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17359 {
17360 prev = PREV_INSN (prev);
17361 continue;
17362 }
17363 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17364 return true;
17365 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17366 return false;
17367 prev = PREV_INSN (prev);
17368 }
17369
17370 /* None of the regs is defined in the bb. */
17371 return false;
17372 }
17373
17374 /* Split a lea instruction into a sequence of instructions
17375 which are executed on the ALU to avoid AGU stalls.
17376 It is assumed that it is allowed to clobber the flags register
17377 at the lea position. */
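/* For example, assuming %eax is neither of the sources, a lea such as

	lea	0x4(%ebx,%ecx,2), %eax

   may be split into roughly

	mov	%ecx, %eax
	shl	$1, %eax
	add	%ebx, %eax
	add	$0x4, %eax

   (when the destination already matches one of the sources, fewer
   instructions are needed).  */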
17378
17379 void
17380 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17381 {
17382 unsigned int regno0, regno1, regno2;
17383 struct ix86_address parts;
17384 rtx target, tmp;
17385 int ok, adds;
17386
17387 ok = ix86_decompose_address (operands[1], &parts);
17388 gcc_assert (ok);
17389
17390 target = gen_lowpart (mode, operands[0]);
17391
17392 regno0 = true_regnum (target);
17393 regno1 = INVALID_REGNUM;
17394 regno2 = INVALID_REGNUM;
17395
17396 if (parts.base)
17397 {
17398 parts.base = gen_lowpart (mode, parts.base);
17399 regno1 = true_regnum (parts.base);
17400 }
17401
17402 if (parts.index)
17403 {
17404 parts.index = gen_lowpart (mode, parts.index);
17405 regno2 = true_regnum (parts.index);
17406 }
17407
17408 if (parts.disp)
17409 parts.disp = gen_lowpart (mode, parts.disp);
17410
17411 if (parts.scale > 1)
17412 {
17413 /* Case r1 = r1 + ... */
17414 if (regno1 == regno0)
17415 {
17416 /* If we have the case r1 = r1 + C * r1 then we
17417 would have to use multiplication, which is very
17418 expensive. Assume the cost model is wrong if we
17419 end up with such a case here. */
17420 gcc_assert (regno2 != regno0);
17421
17422 for (adds = parts.scale; adds > 0; adds--)
17423 ix86_emit_binop (PLUS, mode, target, parts.index);
17424 }
17425 else
17426 {
17427 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17428 if (regno0 != regno2)
17429 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17430
17431 /* Use shift for scaling. */
17432 ix86_emit_binop (ASHIFT, mode, target,
17433 GEN_INT (exact_log2 (parts.scale)));
17434
17435 if (parts.base)
17436 ix86_emit_binop (PLUS, mode, target, parts.base);
17437
17438 if (parts.disp && parts.disp != const0_rtx)
17439 ix86_emit_binop (PLUS, mode, target, parts.disp);
17440 }
17441 }
17442 else if (!parts.base && !parts.index)
17443 {
17444 gcc_assert (parts.disp);
17445 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17446 }
17447 else
17448 {
17449 if (!parts.base)
17450 {
17451 if (regno0 != regno2)
17452 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17453 }
17454 else if (!parts.index)
17455 {
17456 if (regno0 != regno1)
17457 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17458 }
17459 else
17460 {
17461 if (regno0 == regno1)
17462 tmp = parts.index;
17463 else if (regno0 == regno2)
17464 tmp = parts.base;
17465 else
17466 {
17467 rtx tmp1;
17468
17469 /* Find better operand for SET instruction, depending
17470 on which definition is farther from the insn. */
17471 if (find_nearest_reg_def (insn, regno1, regno2))
17472 tmp = parts.index, tmp1 = parts.base;
17473 else
17474 tmp = parts.base, tmp1 = parts.index;
17475
17476 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17477
17478 if (parts.disp && parts.disp != const0_rtx)
17479 ix86_emit_binop (PLUS, mode, target, parts.disp);
17480
17481 ix86_emit_binop (PLUS, mode, target, tmp1);
17482 return;
17483 }
17484
17485 ix86_emit_binop (PLUS, mode, target, tmp);
17486 }
17487
17488 if (parts.disp && parts.disp != const0_rtx)
17489 ix86_emit_binop (PLUS, mode, target, parts.disp);
17490 }
17491 }
17492
17493 /* Return true if it is ok to optimize an ADD operation to a LEA
17494 operation to avoid flag register consumption. For most processors,
17495 ADD is faster than LEA. For processors like ATOM, if the
17496 destination register of the LEA holds an actual address which will be
17497 used soon, LEA is better; otherwise ADD is better. */
17498
17499 bool
17500 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17501 {
17502 unsigned int regno0 = true_regnum (operands[0]);
17503 unsigned int regno1 = true_regnum (operands[1]);
17504 unsigned int regno2 = true_regnum (operands[2]);
17505
17506 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17507 if (regno0 != regno1 && regno0 != regno2)
17508 return true;
17509
17510 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17511 return false;
17512
17513 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17514 }
17515
17516 /* Return true if destination reg of SET_BODY is shift count of
17517 USE_BODY. */
17518
17519 static bool
17520 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17521 {
17522 rtx set_dest;
17523 rtx shift_rtx;
17524 int i;
17525
17526 /* Retrieve destination of SET_BODY. */
17527 switch (GET_CODE (set_body))
17528 {
17529 case SET:
17530 set_dest = SET_DEST (set_body);
17531 if (!set_dest || !REG_P (set_dest))
17532 return false;
17533 break;
17534 case PARALLEL:
17535 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17536 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17537 use_body))
17538 return true;
17539 default:
17540 return false;
17541 break;
17542 }
17543
17544 /* Retrieve shift count of USE_BODY. */
17545 switch (GET_CODE (use_body))
17546 {
17547 case SET:
17548 shift_rtx = XEXP (use_body, 1);
17549 break;
17550 case PARALLEL:
17551 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17552 if (ix86_dep_by_shift_count_body (set_body,
17553 XVECEXP (use_body, 0, i)))
17554 return true;
17555 default:
17556 return false;
17557 break;
17558 }
17559
17560 if (shift_rtx
17561 && (GET_CODE (shift_rtx) == ASHIFT
17562 || GET_CODE (shift_rtx) == LSHIFTRT
17563 || GET_CODE (shift_rtx) == ASHIFTRT
17564 || GET_CODE (shift_rtx) == ROTATE
17565 || GET_CODE (shift_rtx) == ROTATERT))
17566 {
17567 rtx shift_count = XEXP (shift_rtx, 1);
17568
17569 /* Return true if shift count is dest of SET_BODY. */
17570 if (REG_P (shift_count))
17571 {
17572 /* Add this check since it can be invoked before register
17573 allocation in the pre-reload scheduler. */
17574 if (reload_completed
17575 && true_regnum (set_dest) == true_regnum (shift_count))
17576 return true;
17577 else if (REGNO(set_dest) == REGNO(shift_count))
17578 return true;
17579 }
17580 }
17581
17582 return false;
17583 }
17584
17585 /* Return true if destination reg of SET_INSN is shift count of
17586 USE_INSN. */
17587
17588 bool
17589 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17590 {
17591 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17592 PATTERN (use_insn));
17593 }
17594
17595 /* Return TRUE or FALSE depending on whether the unary operator meets the
17596 appropriate constraints. */
17597
17598 bool
17599 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17600 enum machine_mode mode ATTRIBUTE_UNUSED,
17601 rtx operands[2] ATTRIBUTE_UNUSED)
17602 {
17603 /* If one of operands is memory, source and destination must match. */
17604 if ((MEM_P (operands[0])
17605 || MEM_P (operands[1]))
17606 && ! rtx_equal_p (operands[0], operands[1]))
17607 return false;
17608 return true;
17609 }
17610
17611 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17612 are ok, keeping in mind the possible movddup alternative. */
17613
17614 bool
17615 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17616 {
17617 if (MEM_P (operands[0]))
17618 return rtx_equal_p (operands[0], operands[1 + high]);
17619 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17620 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17621 return true;
17622 }
17623
17624 /* Post-reload splitter for converting an SF or DFmode value in an
17625 SSE register into an unsigned SImode. */
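/* The trick used below is, roughly,

	result = (unsigned) (int) (x - (x >= 0x1p31 ? 0x1p31 : 0.0))
		 ^ (x >= 0x1p31 ? 0x80000000u : 0);

   i.e. values >= 2^31 are biased into signed range before the truncating
   conversion and the sign bit is patched back in with an xor; the compare
   mask computed below is used both to select the bias and to build the
   xor constant.  */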
17626
17627 void
17628 ix86_split_convert_uns_si_sse (rtx operands[])
17629 {
17630 enum machine_mode vecmode;
17631 rtx value, large, zero_or_two31, input, two31, x;
17632
17633 large = operands[1];
17634 zero_or_two31 = operands[2];
17635 input = operands[3];
17636 two31 = operands[4];
17637 vecmode = GET_MODE (large);
17638 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17639
17640 /* Load up the value into the low element. We must ensure that the other
17641 elements are valid floats -- zero is the easiest such value. */
17642 if (MEM_P (input))
17643 {
17644 if (vecmode == V4SFmode)
17645 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17646 else
17647 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17648 }
17649 else
17650 {
17651 input = gen_rtx_REG (vecmode, REGNO (input));
17652 emit_move_insn (value, CONST0_RTX (vecmode));
17653 if (vecmode == V4SFmode)
17654 emit_insn (gen_sse_movss (value, value, input));
17655 else
17656 emit_insn (gen_sse2_movsd (value, value, input));
17657 }
17658
17659 emit_move_insn (large, two31);
17660 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17661
17662 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17663 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17664
17665 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17666 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17667
17668 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17669 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17670
17671 large = gen_rtx_REG (V4SImode, REGNO (large));
17672 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17673
17674 x = gen_rtx_REG (V4SImode, REGNO (value));
17675 if (vecmode == V4SFmode)
17676 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17677 else
17678 emit_insn (gen_sse2_cvttpd2dq (x, value));
17679 value = x;
17680
17681 emit_insn (gen_xorv4si3 (value, value, large));
17682 }
17683
17684 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17685 Expects the 64-bit DImode to be supplied in a pair of integral
17686 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17687 -mfpmath=sse, !optimize_size only. */
17688
17689 void
17690 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17691 {
17692 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17693 rtx int_xmm, fp_xmm;
17694 rtx biases, exponents;
17695 rtx x;
17696
17697 int_xmm = gen_reg_rtx (V4SImode);
17698 if (TARGET_INTER_UNIT_MOVES)
17699 emit_insn (gen_movdi_to_sse (int_xmm, input));
17700 else if (TARGET_SSE_SPLIT_REGS)
17701 {
17702 emit_clobber (int_xmm);
17703 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17704 }
17705 else
17706 {
17707 x = gen_reg_rtx (V2DImode);
17708 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17709 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17710 }
17711
17712 x = gen_rtx_CONST_VECTOR (V4SImode,
17713 gen_rtvec (4, GEN_INT (0x43300000UL),
17714 GEN_INT (0x45300000UL),
17715 const0_rtx, const0_rtx));
17716 exponents = validize_mem (force_const_mem (V4SImode, x));
17717
17718 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17719 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17720
17721 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17722 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17723 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17724 (0x1.0p84 + double(fp_value_hi_xmm)).
17725 Note these exponents differ by 32. */
17726
17727 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17728
17729 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17730 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17731 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17732 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17733 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17734 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17735 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17736 biases = validize_mem (force_const_mem (V2DFmode, biases));
17737 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17738
17739 /* Add the upper and lower DFmode values together. */
17740 if (TARGET_SSE3)
17741 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17742 else
17743 {
17744 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17745 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17746 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17747 }
17748
17749 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17750 }
17751
17752 /* Not used, but eases macroization of patterns. */
17753 void
17754 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17755 rtx input ATTRIBUTE_UNUSED)
17756 {
17757 gcc_unreachable ();
17758 }
17759
17760 /* Convert an unsigned SImode value into a DFmode. Only currently used
17761 for SSE, but applicable anywhere. */
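/* The sequence below relies on the identity

	(double) (int) (x - 0x80000000u) + 0x1p31 == (double) x

   for any unsigned 32-bit x: the biased value fits in a signed int, the
   signed int-to-double conversion is exact, and so is the final add.  */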
17762
17763 void
17764 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17765 {
17766 REAL_VALUE_TYPE TWO31r;
17767 rtx x, fp;
17768
17769 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17770 NULL, 1, OPTAB_DIRECT);
17771
17772 fp = gen_reg_rtx (DFmode);
17773 emit_insn (gen_floatsidf2 (fp, x));
17774
17775 real_ldexp (&TWO31r, &dconst1, 31);
17776 x = const_double_from_real_value (TWO31r, DFmode);
17777
17778 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17779 if (x != target)
17780 emit_move_insn (target, x);
17781 }
17782
17783 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17784 32-bit mode; otherwise we have a direct convert instruction. */
17785
17786 void
17787 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17788 {
17789 REAL_VALUE_TYPE TWO32r;
17790 rtx fp_lo, fp_hi, x;
17791
17792 fp_lo = gen_reg_rtx (DFmode);
17793 fp_hi = gen_reg_rtx (DFmode);
17794
17795 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17796
17797 real_ldexp (&TWO32r, &dconst1, 32);
17798 x = const_double_from_real_value (TWO32r, DFmode);
17799 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17800
17801 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17802
17803 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17804 0, OPTAB_DIRECT);
17805 if (x != target)
17806 emit_move_insn (target, x);
17807 }
17808
17809 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17810 For x86_32, -mfpmath=sse, !optimize_size only. */
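/* The input is split into its low and high 16-bit halves, each half is
   converted exactly (both fit in SFmode's 24-bit significand), and the
   result is recombined as hi * 0x1p16 + lo, so only the final addition
   can round.  */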
17811 void
17812 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17813 {
17814 REAL_VALUE_TYPE ONE16r;
17815 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17816
17817 real_ldexp (&ONE16r, &dconst1, 16);
17818 x = const_double_from_real_value (ONE16r, SFmode);
17819 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17820 NULL, 0, OPTAB_DIRECT);
17821 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17822 NULL, 0, OPTAB_DIRECT);
17823 fp_hi = gen_reg_rtx (SFmode);
17824 fp_lo = gen_reg_rtx (SFmode);
17825 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17826 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17827 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17828 0, OPTAB_DIRECT);
17829 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17830 0, OPTAB_DIRECT);
17831 if (!rtx_equal_p (target, fp_hi))
17832 emit_move_insn (target, fp_hi);
17833 }
17834
17835 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17836 a vector of unsigned ints VAL to a vector of floats TARGET. */
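/* This is the same 16-bit split used by ix86_expand_convert_uns_sisf_sse,
   applied element-wise: the low and high halves are converted separately
   and recombined as hi * 0x1p16 + lo.  */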
17837
17838 void
17839 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17840 {
17841 rtx tmp[8];
17842 REAL_VALUE_TYPE TWO16r;
17843 enum machine_mode intmode = GET_MODE (val);
17844 enum machine_mode fltmode = GET_MODE (target);
17845 rtx (*cvt) (rtx, rtx);
17846
17847 if (intmode == V4SImode)
17848 cvt = gen_floatv4siv4sf2;
17849 else
17850 cvt = gen_floatv8siv8sf2;
17851 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17852 tmp[0] = force_reg (intmode, tmp[0]);
17853 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17854 OPTAB_DIRECT);
17855 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17856 NULL_RTX, 1, OPTAB_DIRECT);
17857 tmp[3] = gen_reg_rtx (fltmode);
17858 emit_insn (cvt (tmp[3], tmp[1]));
17859 tmp[4] = gen_reg_rtx (fltmode);
17860 emit_insn (cvt (tmp[4], tmp[2]));
17861 real_ldexp (&TWO16r, &dconst1, 16);
17862 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17863 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17864 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17865 OPTAB_DIRECT);
17866 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17867 OPTAB_DIRECT);
17868 if (tmp[7] != target)
17869 emit_move_insn (target, tmp[7]);
17870 }
17871
17872 /* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp. fix_trunc*
17873 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17874 This is done with just a signed conversion if the value is < 0x1p31; otherwise
17875 0x1p31 is subtracted first and 0x80000000 is xored in from *XORP afterwards. */
17876
17877 rtx
17878 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17879 {
17880 REAL_VALUE_TYPE TWO31r;
17881 rtx two31r, tmp[4];
17882 enum machine_mode mode = GET_MODE (val);
17883 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17884 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17885 rtx (*cmp) (rtx, rtx, rtx, rtx);
17886 int i;
17887
17888 for (i = 0; i < 3; i++)
17889 tmp[i] = gen_reg_rtx (mode);
17890 real_ldexp (&TWO31r, &dconst1, 31);
17891 two31r = const_double_from_real_value (TWO31r, scalarmode);
17892 two31r = ix86_build_const_vector (mode, 1, two31r);
17893 two31r = force_reg (mode, two31r);
17894 switch (mode)
17895 {
17896 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17897 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17898 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17899 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17900 default: gcc_unreachable ();
17901 }
17902 tmp[3] = gen_rtx_LE (mode, two31r, val);
17903 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17904 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17905 0, OPTAB_DIRECT);
17906 if (intmode == V4SImode || TARGET_AVX2)
17907 *xorp = expand_simple_binop (intmode, ASHIFT,
17908 gen_lowpart (intmode, tmp[0]),
17909 GEN_INT (31), NULL_RTX, 0,
17910 OPTAB_DIRECT);
17911 else
17912 {
17913 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17914 two31 = ix86_build_const_vector (intmode, 1, two31);
17915 *xorp = expand_simple_binop (intmode, AND,
17916 gen_lowpart (intmode, tmp[0]),
17917 two31, NULL_RTX, 0,
17918 OPTAB_DIRECT);
17919 }
17920 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17921 0, OPTAB_DIRECT);
17922 }
17923
17924 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17925 then replicate the value for all elements of the vector
17926 register. */
17927
17928 rtx
17929 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17930 {
17931 int i, n_elt;
17932 rtvec v;
17933 enum machine_mode scalar_mode;
17934
17935 switch (mode)
17936 {
17937 case V32QImode:
17938 case V16QImode:
17939 case V16HImode:
17940 case V8HImode:
17941 case V8SImode:
17942 case V4SImode:
17943 case V4DImode:
17944 case V2DImode:
17945 gcc_assert (vect);
17946 case V8SFmode:
17947 case V4SFmode:
17948 case V4DFmode:
17949 case V2DFmode:
17950 n_elt = GET_MODE_NUNITS (mode);
17951 v = rtvec_alloc (n_elt);
17952 scalar_mode = GET_MODE_INNER (mode);
17953
17954 RTVEC_ELT (v, 0) = value;
17955
17956 for (i = 1; i < n_elt; ++i)
17957 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17958
17959 return gen_rtx_CONST_VECTOR (mode, v);
17960
17961 default:
17962 gcc_unreachable ();
17963 }
17964 }
17965
17966 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17967 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17968 for an SSE register. If VECT is true, then replicate the mask for
17969 all elements of the vector register. If INVERT is true, then create
17970 a mask excluding the sign bit. */
17971
17972 rtx
17973 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17974 {
17975 enum machine_mode vec_mode, imode;
17976 HOST_WIDE_INT hi, lo;
17977 int shift = 63;
17978 rtx v;
17979 rtx mask;
17980
17981 /* Find the sign bit, sign extended to 2*HWI. */
17982 switch (mode)
17983 {
17984 case V8SImode:
17985 case V4SImode:
17986 case V8SFmode:
17987 case V4SFmode:
17988 vec_mode = mode;
17989 mode = GET_MODE_INNER (mode);
17990 imode = SImode;
17991 lo = 0x80000000, hi = lo < 0;
17992 break;
17993
17994 case V4DImode:
17995 case V2DImode:
17996 case V4DFmode:
17997 case V2DFmode:
17998 vec_mode = mode;
17999 mode = GET_MODE_INNER (mode);
18000 imode = DImode;
18001 if (HOST_BITS_PER_WIDE_INT >= 64)
18002 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18003 else
18004 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18005 break;
18006
18007 case TImode:
18008 case TFmode:
18009 vec_mode = VOIDmode;
18010 if (HOST_BITS_PER_WIDE_INT >= 64)
18011 {
18012 imode = TImode;
18013 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18014 }
18015 else
18016 {
18017 rtvec vec;
18018
18019 imode = DImode;
18020 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18021
18022 if (invert)
18023 {
18024 lo = ~lo, hi = ~hi;
18025 v = constm1_rtx;
18026 }
18027 else
18028 v = const0_rtx;
18029
18030 mask = immed_double_const (lo, hi, imode);
18031
18032 vec = gen_rtvec (2, v, mask);
18033 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18034 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18035
18036 return v;
18037 }
18038 break;
18039
18040 default:
18041 gcc_unreachable ();
18042 }
18043
18044 if (invert)
18045 lo = ~lo, hi = ~hi;
18046
18047 /* Force this value into the low part of a fp vector constant. */
18048 mask = immed_double_const (lo, hi, imode);
18049 mask = gen_lowpart (mode, mask);
18050
18051 if (vec_mode == VOIDmode)
18052 return force_reg (mode, mask);
18053
18054 v = ix86_build_const_vector (vec_mode, vect, mask);
18055 return force_reg (vec_mode, v);
18056 }
18057
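/* For instance, for V2DFmode with INVERT false this builds a vector whose
   elements have only bit 63 set (the DFmode bit pattern of -0.0); with
   INVERT true each element is the 0x7fffffffffffffff pattern instead.  The
   SFmode vector variants use 0x80000000 resp. 0x7fffffff per element.  */
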
18058 /* Generate code for floating point ABS or NEG. */
18059
18060 void
18061 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18062 rtx operands[])
18063 {
18064 rtx mask, set, dst, src;
18065 bool use_sse = false;
18066 bool vector_mode = VECTOR_MODE_P (mode);
18067 enum machine_mode vmode = mode;
18068
18069 if (vector_mode)
18070 use_sse = true;
18071 else if (mode == TFmode)
18072 use_sse = true;
18073 else if (TARGET_SSE_MATH)
18074 {
18075 use_sse = SSE_FLOAT_MODE_P (mode);
18076 if (mode == SFmode)
18077 vmode = V4SFmode;
18078 else if (mode == DFmode)
18079 vmode = V2DFmode;
18080 }
18081
18082 /* NEG and ABS performed with SSE use bitwise mask operations.
18083 Create the appropriate mask now. */
18084 if (use_sse)
18085 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18086 else
18087 mask = NULL_RTX;
18088
18089 dst = operands[0];
18090 src = operands[1];
18091
18092 set = gen_rtx_fmt_e (code, mode, src);
18093 set = gen_rtx_SET (VOIDmode, dst, set);
18094
18095 if (mask)
18096 {
18097 rtx use, clob;
18098 rtvec par;
18099
18100 use = gen_rtx_USE (VOIDmode, mask);
18101 if (vector_mode)
18102 par = gen_rtvec (2, set, use);
18103 else
18104 {
18105 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18106 par = gen_rtvec (3, set, use, clob);
18107 }
18108 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18109 }
18110 else
18111 emit_insn (set);
18112 }
18113
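/* With SSE math the pattern emitted above typically splits into a single
   bitwise instruction against the mask built by ix86_build_signbit_mask,
   e.g. roughly "andps MASK, %xmm0" for ABS (mask with the sign bit clear)
   and "xorps MASK, %xmm0" for NEG (mask with only the sign bit set); when
   no mask is used the plain set is matched by the x87 fabs/fchs patterns.  */
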
18114 /* Expand a copysign operation. Special case operand 0 being a constant. */
18115
18116 void
18117 ix86_expand_copysign (rtx operands[])
18118 {
18119 enum machine_mode mode, vmode;
18120 rtx dest, op0, op1, mask, nmask;
18121
18122 dest = operands[0];
18123 op0 = operands[1];
18124 op1 = operands[2];
18125
18126 mode = GET_MODE (dest);
18127
18128 if (mode == SFmode)
18129 vmode = V4SFmode;
18130 else if (mode == DFmode)
18131 vmode = V2DFmode;
18132 else
18133 vmode = mode;
18134
18135 if (GET_CODE (op0) == CONST_DOUBLE)
18136 {
18137 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18138
18139 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18140 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18141
18142 if (mode == SFmode || mode == DFmode)
18143 {
18144 if (op0 == CONST0_RTX (mode))
18145 op0 = CONST0_RTX (vmode);
18146 else
18147 {
18148 rtx v = ix86_build_const_vector (vmode, false, op0);
18149
18150 op0 = force_reg (vmode, v);
18151 }
18152 }
18153 else if (op0 != CONST0_RTX (mode))
18154 op0 = force_reg (mode, op0);
18155
18156 mask = ix86_build_signbit_mask (vmode, 0, 0);
18157
18158 if (mode == SFmode)
18159 copysign_insn = gen_copysignsf3_const;
18160 else if (mode == DFmode)
18161 copysign_insn = gen_copysigndf3_const;
18162 else
18163 copysign_insn = gen_copysigntf3_const;
18164
18165 emit_insn (copysign_insn (dest, op0, op1, mask));
18166 }
18167 else
18168 {
18169 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18170
18171 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18172 mask = ix86_build_signbit_mask (vmode, 0, 0);
18173
18174 if (mode == SFmode)
18175 copysign_insn = gen_copysignsf3_var;
18176 else if (mode == DFmode)
18177 copysign_insn = gen_copysigndf3_var;
18178 else
18179 copysign_insn = gen_copysigntf3_var;
18180
18181 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18182 }
18183 }
18184
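/* In essence the expansion above and the two splitters below implement
   copysign (x, y) as (x & ~SIGNMASK) | (y & SIGNMASK) on the vector
   representation, with SIGNMASK built by ix86_build_signbit_mask; when x
   is a constant its absolute value is folded in beforehand, so only the
   single AND/IOR pair of the _const splitter remains.  */
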
18185 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18186 be a constant, and so has already been expanded into a vector constant. */
18187
18188 void
18189 ix86_split_copysign_const (rtx operands[])
18190 {
18191 enum machine_mode mode, vmode;
18192 rtx dest, op0, mask, x;
18193
18194 dest = operands[0];
18195 op0 = operands[1];
18196 mask = operands[3];
18197
18198 mode = GET_MODE (dest);
18199 vmode = GET_MODE (mask);
18200
18201 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18202 x = gen_rtx_AND (vmode, dest, mask);
18203 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18204
18205 if (op0 != CONST0_RTX (vmode))
18206 {
18207 x = gen_rtx_IOR (vmode, dest, op0);
18208 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18209 }
18210 }
18211
18212 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18213 so we have to do two masks. */
18214
18215 void
18216 ix86_split_copysign_var (rtx operands[])
18217 {
18218 enum machine_mode mode, vmode;
18219 rtx dest, scratch, op0, op1, mask, nmask, x;
18220
18221 dest = operands[0];
18222 scratch = operands[1];
18223 op0 = operands[2];
18224 op1 = operands[3];
18225 nmask = operands[4];
18226 mask = operands[5];
18227
18228 mode = GET_MODE (dest);
18229 vmode = GET_MODE (mask);
18230
18231 if (rtx_equal_p (op0, op1))
18232 {
18233 /* Shouldn't happen often (it's useless, obviously), but when it does
18234 we'd generate incorrect code if we continue below. */
18235 emit_move_insn (dest, op0);
18236 return;
18237 }
18238
18239 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18240 {
18241 gcc_assert (REGNO (op1) == REGNO (scratch));
18242
18243 x = gen_rtx_AND (vmode, scratch, mask);
18244 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18245
18246 dest = mask;
18247 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18248 x = gen_rtx_NOT (vmode, dest);
18249 x = gen_rtx_AND (vmode, x, op0);
18250 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18251 }
18252 else
18253 {
18254 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18255 {
18256 x = gen_rtx_AND (vmode, scratch, mask);
18257 }
18258 else /* alternative 2,4 */
18259 {
18260 gcc_assert (REGNO (mask) == REGNO (scratch));
18261 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18262 x = gen_rtx_AND (vmode, scratch, op1);
18263 }
18264 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18265
18266 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18267 {
18268 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18269 x = gen_rtx_AND (vmode, dest, nmask);
18270 }
18271 else /* alternative 3,4 */
18272 {
18273 gcc_assert (REGNO (nmask) == REGNO (dest));
18274 dest = nmask;
18275 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18276 x = gen_rtx_AND (vmode, dest, op0);
18277 }
18278 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18279 }
18280
18281 x = gen_rtx_IOR (vmode, dest, scratch);
18282 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18283 }
18284
18285 /* Return TRUE or FALSE depending on whether the first SET in INSN
18286 has source and destination with matching CC modes, and whether the
18287 CC mode is at least as constrained as REQ_MODE. */
18288
18289 bool
18290 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18291 {
18292 rtx set;
18293 enum machine_mode set_mode;
18294
18295 set = PATTERN (insn);
18296 if (GET_CODE (set) == PARALLEL)
18297 set = XVECEXP (set, 0, 0);
18298 gcc_assert (GET_CODE (set) == SET);
18299 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18300
18301 set_mode = GET_MODE (SET_DEST (set));
18302 switch (set_mode)
18303 {
18304 case CCNOmode:
18305 if (req_mode != CCNOmode
18306 && (req_mode != CCmode
18307 || XEXP (SET_SRC (set), 1) != const0_rtx))
18308 return false;
18309 break;
18310 case CCmode:
18311 if (req_mode == CCGCmode)
18312 return false;
18313 /* FALLTHRU */
18314 case CCGCmode:
18315 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18316 return false;
18317 /* FALLTHRU */
18318 case CCGOCmode:
18319 if (req_mode == CCZmode)
18320 return false;
18321 /* FALLTHRU */
18322 case CCZmode:
18323 break;
18324
18325 case CCAmode:
18326 case CCCmode:
18327 case CCOmode:
18328 case CCSmode:
18329 if (set_mode != req_mode)
18330 return false;
18331 break;
18332
18333 default:
18334 gcc_unreachable ();
18335 }
18336
18337 return GET_MODE (SET_SRC (set)) == set_mode;
18338 }
18339
18340 /* Generate insn patterns to do an integer compare of OPERANDS. */
18341
18342 static rtx
18343 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18344 {
18345 enum machine_mode cmpmode;
18346 rtx tmp, flags;
18347
18348 cmpmode = SELECT_CC_MODE (code, op0, op1);
18349 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18350
18351 /* This is very simple, but making the interface the same as in the
18352 FP case makes the rest of the code easier. */
18353 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18354 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18355
18356 /* Return the test that should be put into the flags user, i.e.
18357 the bcc, scc, or cmov instruction. */
18358 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18359 }
18360
18361 /* Figure out whether to use ordered or unordered fp comparisons.
18362 Return the appropriate mode to use. */
18363
18364 enum machine_mode
18365 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18366 {
18367 /* ??? In order to make all comparisons reversible, we do all comparisons
18368 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18369 all forms of trapping and nontrapping comparisons, we can make inequality
18370 comparisons trapping again, since that results in better code when using
18371 FCOM based compares. */
18372 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18373 }
18374
18375 enum machine_mode
18376 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18377 {
18378 enum machine_mode mode = GET_MODE (op0);
18379
18380 if (SCALAR_FLOAT_MODE_P (mode))
18381 {
18382 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18383 return ix86_fp_compare_mode (code);
18384 }
18385
18386 switch (code)
18387 {
18388 /* Only zero flag is needed. */
18389 case EQ: /* ZF=0 */
18390 case NE: /* ZF!=0 */
18391 return CCZmode;
18392 /* Codes needing carry flag. */
18393 case GEU: /* CF=0 */
18394 case LTU: /* CF=1 */
18395 /* Detect overflow checks. They need just the carry flag. */
18396 if (GET_CODE (op0) == PLUS
18397 && rtx_equal_p (op1, XEXP (op0, 0)))
18398 return CCCmode;
18399 else
18400 return CCmode;
18401 case GTU: /* CF=0 & ZF=0 */
18402 case LEU: /* CF=1 | ZF=1 */
18403 /* Detect overflow checks. They need just the carry flag. */
18404 if (GET_CODE (op0) == MINUS
18405 && rtx_equal_p (op1, XEXP (op0, 0)))
18406 return CCCmode;
18407 else
18408 return CCmode;
18409 /* Codes possibly doable only with sign flag when
18410 comparing against zero. */
18411 case GE: /* SF=OF or SF=0 */
18412 case LT: /* SF<>OF or SF=1 */
18413 if (op1 == const0_rtx)
18414 return CCGOCmode;
18415 else
18416 /* For other cases Carry flag is not required. */
18417 return CCGCmode;
18418 /* Codes doable only with the sign flag when comparing
18419 against zero, but we miss the jump instruction for it,
18420 so we need to use relational tests against overflow
18421 that thus needs to be zero. */
18422 case GT: /* ZF=0 & SF=OF */
18423 case LE: /* ZF=1 | SF<>OF */
18424 if (op1 == const0_rtx)
18425 return CCNOmode;
18426 else
18427 return CCGCmode;
18428 /* The strcmp pattern does a (use flags), and combine may ask us for
18429 the proper mode. */
18430 case USE:
18431 return CCmode;
18432 default:
18433 gcc_unreachable ();
18434 }
18435 }
18436
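/* As an example of the CCCmode cases above, an unsigned overflow test
   written as "a + b < a" compares (plus a b) against a with LTU, so only
   the carry flag is needed and the check can typically be carried out by
   the addition itself followed by a setc/jc style instruction.  */
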
18437 /* Return the fixed registers used for condition codes. */
18438
18439 static bool
18440 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18441 {
18442 *p1 = FLAGS_REG;
18443 *p2 = FPSR_REG;
18444 return true;
18445 }
18446
18447 /* If two condition code modes are compatible, return a condition code
18448 mode which is compatible with both. Otherwise, return
18449 VOIDmode. */
18450
18451 static enum machine_mode
18452 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18453 {
18454 if (m1 == m2)
18455 return m1;
18456
18457 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18458 return VOIDmode;
18459
18460 if ((m1 == CCGCmode && m2 == CCGOCmode)
18461 || (m1 == CCGOCmode && m2 == CCGCmode))
18462 return CCGCmode;
18463
18464 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18465 return m2;
18466 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18467 return m1;
18468
18469 switch (m1)
18470 {
18471 default:
18472 gcc_unreachable ();
18473
18474 case CCmode:
18475 case CCGCmode:
18476 case CCGOCmode:
18477 case CCNOmode:
18478 case CCAmode:
18479 case CCCmode:
18480 case CCOmode:
18481 case CCSmode:
18482 case CCZmode:
18483 switch (m2)
18484 {
18485 default:
18486 return VOIDmode;
18487
18488 case CCmode:
18489 case CCGCmode:
18490 case CCGOCmode:
18491 case CCNOmode:
18492 case CCAmode:
18493 case CCCmode:
18494 case CCOmode:
18495 case CCSmode:
18496 case CCZmode:
18497 return CCmode;
18498 }
18499
18500 case CCFPmode:
18501 case CCFPUmode:
18502 /* These are only compatible with themselves, which we already
18503 checked above. */
18504 return VOIDmode;
18505 }
18506 }
18507
18508
18509 /* Return a comparison we can do that is equivalent to
18510 swap_condition (code), apart possibly from orderedness.
18511 But never change orderedness if TARGET_IEEE_FP, returning
18512 UNKNOWN in that case if necessary. */
18513
18514 static enum rtx_code
18515 ix86_fp_swap_condition (enum rtx_code code)
18516 {
18517 switch (code)
18518 {
18519 case GT: /* GTU - CF=0 & ZF=0 */
18520 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18521 case GE: /* GEU - CF=0 */
18522 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18523 case UNLT: /* LTU - CF=1 */
18524 return TARGET_IEEE_FP ? UNKNOWN : GT;
18525 case UNLE: /* LEU - CF=1 | ZF=1 */
18526 return TARGET_IEEE_FP ? UNKNOWN : GE;
18527 default:
18528 return swap_condition (code);
18529 }
18530 }
18531
18532 /* Return the cost of comparison CODE using the best strategy for performance.
18533 All following functions use the number of instructions as the cost metric.
18534 In the future this should be tweaked to compute bytes for optimize_size and
18535 take into account the performance of various instructions on various CPUs. */
18536
18537 static int
18538 ix86_fp_comparison_cost (enum rtx_code code)
18539 {
18540 int arith_cost;
18541
18542 /* The cost of code using bit-twiddling on %ah. */
18543 switch (code)
18544 {
18545 case UNLE:
18546 case UNLT:
18547 case LTGT:
18548 case GT:
18549 case GE:
18550 case UNORDERED:
18551 case ORDERED:
18552 case UNEQ:
18553 arith_cost = 4;
18554 break;
18555 case LT:
18556 case NE:
18557 case EQ:
18558 case UNGE:
18559 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18560 break;
18561 case LE:
18562 case UNGT:
18563 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18564 break;
18565 default:
18566 gcc_unreachable ();
18567 }
18568
18569 switch (ix86_fp_comparison_strategy (code))
18570 {
18571 case IX86_FPCMP_COMI:
18572 return arith_cost > 4 ? 3 : 2;
18573 case IX86_FPCMP_SAHF:
18574 return arith_cost > 4 ? 4 : 3;
18575 default:
18576 return arith_cost;
18577 }
18578 }
18579
18580 /* Return the strategy to use for floating-point comparisons. We assume that
18581 fcomi is always preferable where available, since that is also true when
18582 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18583
18584 enum ix86_fpcmp_strategy
18585 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18586 {
18587 /* Do fcomi/sahf based test when profitable. */
18588
18589 if (TARGET_CMOVE)
18590 return IX86_FPCMP_COMI;
18591
18592 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18593 return IX86_FPCMP_SAHF;
18594
18595 return IX86_FPCMP_ARITH;
18596 }
18597
18598 /* Swap, force into registers, or otherwise massage the two operands
18599 to a fp comparison. The operands are updated in place; the new
18600 comparison code is returned. */
18601
18602 static enum rtx_code
18603 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18604 {
18605 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18606 rtx op0 = *pop0, op1 = *pop1;
18607 enum machine_mode op_mode = GET_MODE (op0);
18608 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18609
18610 /* All of the unordered compare instructions only work on registers.
18611 The same is true of the fcomi compare instructions. The XFmode
18612 compare instructions require registers except when comparing
18613 against zero or when converting operand 1 from fixed point to
18614 floating point. */
18615
18616 if (!is_sse
18617 && (fpcmp_mode == CCFPUmode
18618 || (op_mode == XFmode
18619 && ! (standard_80387_constant_p (op0) == 1
18620 || standard_80387_constant_p (op1) == 1)
18621 && GET_CODE (op1) != FLOAT)
18622 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18623 {
18624 op0 = force_reg (op_mode, op0);
18625 op1 = force_reg (op_mode, op1);
18626 }
18627 else
18628 {
18629 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18630 things around if they appear profitable, otherwise force op0
18631 into a register. */
18632
18633 if (standard_80387_constant_p (op0) == 0
18634 || (MEM_P (op0)
18635 && ! (standard_80387_constant_p (op1) == 0
18636 || MEM_P (op1))))
18637 {
18638 enum rtx_code new_code = ix86_fp_swap_condition (code);
18639 if (new_code != UNKNOWN)
18640 {
18641 rtx tmp;
18642 tmp = op0, op0 = op1, op1 = tmp;
18643 code = new_code;
18644 }
18645 }
18646
18647 if (!REG_P (op0))
18648 op0 = force_reg (op_mode, op0);
18649
18650 if (CONSTANT_P (op1))
18651 {
18652 int tmp = standard_80387_constant_p (op1);
18653 if (tmp == 0)
18654 op1 = validize_mem (force_const_mem (op_mode, op1));
18655 else if (tmp == 1)
18656 {
18657 if (TARGET_CMOVE)
18658 op1 = force_reg (op_mode, op1);
18659 }
18660 else
18661 op1 = force_reg (op_mode, op1);
18662 }
18663 }
18664
18665 /* Try to rearrange the comparison to make it cheaper. */
18666 if (ix86_fp_comparison_cost (code)
18667 > ix86_fp_comparison_cost (swap_condition (code))
18668 && (REG_P (op1) || can_create_pseudo_p ()))
18669 {
18670 rtx tmp;
18671 tmp = op0, op0 = op1, op1 = tmp;
18672 code = swap_condition (code);
18673 if (!REG_P (op0))
18674 op0 = force_reg (op_mode, op0);
18675 }
18676
18677 *pop0 = op0;
18678 *pop1 = op1;
18679 return code;
18680 }
18681
18682 /* Convert the comparison codes we use to represent an FP comparison into the
18683 integer code that will result in a proper branch. Return UNKNOWN if no such
18684 code is available. */
18685
18686 enum rtx_code
18687 ix86_fp_compare_code_to_integer (enum rtx_code code)
18688 {
18689 switch (code)
18690 {
18691 case GT:
18692 return GTU;
18693 case GE:
18694 return GEU;
18695 case ORDERED:
18696 case UNORDERED:
18697 return code;
18699 case UNEQ:
18700 return EQ;
18702 case UNLT:
18703 return LTU;
18705 case UNLE:
18706 return LEU;
18708 case LTGT:
18709 return NE;
18711 default:
18712 return UNKNOWN;
18713 }
18714 }
18715
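/* The mapping above reflects how fcomi/fucomi and (u)comiss/(u)comisd set
   the flags: ZF/PF/CF are used like the flags of an unsigned integer
   comparison, so e.g. an FP "a > b" is tested with ja (GTU) and "a >= b"
   with jae (GEU) after the compare.  */
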
18716 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18717
18718 static rtx
18719 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18720 {
18721 enum machine_mode fpcmp_mode, intcmp_mode;
18722 rtx tmp, tmp2;
18723
18724 fpcmp_mode = ix86_fp_compare_mode (code);
18725 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18726
18727 /* Do fcomi/sahf based test when profitable. */
18728 switch (ix86_fp_comparison_strategy (code))
18729 {
18730 case IX86_FPCMP_COMI:
18731 intcmp_mode = fpcmp_mode;
18732 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18733 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18734 tmp);
18735 emit_insn (tmp);
18736 break;
18737
18738 case IX86_FPCMP_SAHF:
18739 intcmp_mode = fpcmp_mode;
18740 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18741 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18742 tmp);
18743
18744 if (!scratch)
18745 scratch = gen_reg_rtx (HImode);
18746 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18747 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18748 break;
18749
18750 case IX86_FPCMP_ARITH:
18751 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18752 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18753 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18754 if (!scratch)
18755 scratch = gen_reg_rtx (HImode);
18756 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18757
18758 /* In the unordered case, we have to check C2 for NaNs, which
18759 doesn't happen to work out to anything nice combination-wise.
18760 So do some bit twiddling on the value we've got in AH to come
18761 up with an appropriate set of condition codes. */
18762
18763 intcmp_mode = CCNOmode;
18764 switch (code)
18765 {
18766 case GT:
18767 case UNGT:
18768 if (code == GT || !TARGET_IEEE_FP)
18769 {
18770 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18771 code = EQ;
18772 }
18773 else
18774 {
18775 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18776 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18777 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18778 intcmp_mode = CCmode;
18779 code = GEU;
18780 }
18781 break;
18782 case LT:
18783 case UNLT:
18784 if (code == LT && TARGET_IEEE_FP)
18785 {
18786 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18787 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18788 intcmp_mode = CCmode;
18789 code = EQ;
18790 }
18791 else
18792 {
18793 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18794 code = NE;
18795 }
18796 break;
18797 case GE:
18798 case UNGE:
18799 if (code == GE || !TARGET_IEEE_FP)
18800 {
18801 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18802 code = EQ;
18803 }
18804 else
18805 {
18806 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18807 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18808 code = NE;
18809 }
18810 break;
18811 case LE:
18812 case UNLE:
18813 if (code == LE && TARGET_IEEE_FP)
18814 {
18815 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18816 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18817 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18818 intcmp_mode = CCmode;
18819 code = LTU;
18820 }
18821 else
18822 {
18823 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18824 code = NE;
18825 }
18826 break;
18827 case EQ:
18828 case UNEQ:
18829 if (code == EQ && TARGET_IEEE_FP)
18830 {
18831 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18832 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18833 intcmp_mode = CCmode;
18834 code = EQ;
18835 }
18836 else
18837 {
18838 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18839 code = NE;
18840 }
18841 break;
18842 case NE:
18843 case LTGT:
18844 if (code == NE && TARGET_IEEE_FP)
18845 {
18846 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18847 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18848 GEN_INT (0x40)));
18849 code = NE;
18850 }
18851 else
18852 {
18853 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18854 code = EQ;
18855 }
18856 break;
18857
18858 case UNORDERED:
18859 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18860 code = NE;
18861 break;
18862 case ORDERED:
18863 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18864 code = EQ;
18865 break;
18866
18867 default:
18868 gcc_unreachable ();
18869 }
18870 break;
18871
18872 default:
18873 gcc_unreachable();
18874 }
18875
18876 /* Return the test that should be put into the flags user, i.e.
18877 the bcc, scc, or cmov instruction. */
18878 return gen_rtx_fmt_ee (code, VOIDmode,
18879 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18880 const0_rtx);
18881 }
18882
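/* For reference on the magic constants in the FNSTSW path above: after
   "fnstsw %ax" the relevant condition bits land in %ah as C0 = 0x01,
   C2 = 0x04 and C3 = 0x40, so 0x45 tests C3|C2|C0 together.  E.g. the
   non-IEEE GT case becomes roughly "fnstsw; test $0x45, %ah; sete ...",
   since all three bits are clear exactly when op0 > op1.  */
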
18883 static rtx
18884 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18885 {
18886 rtx ret;
18887
18888 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18889 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18890
18891 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18892 {
18893 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18894 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18895 }
18896 else
18897 ret = ix86_expand_int_compare (code, op0, op1);
18898
18899 return ret;
18900 }
18901
18902 void
18903 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18904 {
18905 enum machine_mode mode = GET_MODE (op0);
18906 rtx tmp;
18907
18908 switch (mode)
18909 {
18910 case SFmode:
18911 case DFmode:
18912 case XFmode:
18913 case QImode:
18914 case HImode:
18915 case SImode:
18916 simple:
18917 tmp = ix86_expand_compare (code, op0, op1);
18918 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18919 gen_rtx_LABEL_REF (VOIDmode, label),
18920 pc_rtx);
18921 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18922 return;
18923
18924 case DImode:
18925 if (TARGET_64BIT)
18926 goto simple;
18927 case TImode:
18928 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
18929 {
18930 rtx lo[2], hi[2], label2;
18931 enum rtx_code code1, code2, code3;
18932 enum machine_mode submode;
18933
18934 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18935 {
18936 tmp = op0, op0 = op1, op1 = tmp;
18937 code = swap_condition (code);
18938 }
18939
18940 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18941 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18942
18943 submode = mode == DImode ? SImode : DImode;
18944
18945 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18946 avoid two branches. This costs one extra insn, so disable when
18947 optimizing for size. */
18948
18949 if ((code == EQ || code == NE)
18950 && (!optimize_insn_for_size_p ()
18951 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18952 {
18953 rtx xor0, xor1;
18954
18955 xor1 = hi[0];
18956 if (hi[1] != const0_rtx)
18957 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18958 NULL_RTX, 0, OPTAB_WIDEN);
18959
18960 xor0 = lo[0];
18961 if (lo[1] != const0_rtx)
18962 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18963 NULL_RTX, 0, OPTAB_WIDEN);
18964
18965 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18966 NULL_RTX, 0, OPTAB_WIDEN);
18967
18968 ix86_expand_branch (code, tmp, const0_rtx, label);
18969 return;
18970 }
18971
18972 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18973 op1 is a constant and its low word is zero, then we can just
18974 examine the high word. Similarly for a low word of -1 and
18975 less-or-equal or greater-than. */
18976
18977 if (CONST_INT_P (hi[1]))
18978 switch (code)
18979 {
18980 case LT: case LTU: case GE: case GEU:
18981 if (lo[1] == const0_rtx)
18982 {
18983 ix86_expand_branch (code, hi[0], hi[1], label);
18984 return;
18985 }
18986 break;
18987 case LE: case LEU: case GT: case GTU:
18988 if (lo[1] == constm1_rtx)
18989 {
18990 ix86_expand_branch (code, hi[0], hi[1], label);
18991 return;
18992 }
18993 break;
18994 default:
18995 break;
18996 }
18997
18998 /* Otherwise, we need two or three jumps. */
18999
19000 label2 = gen_label_rtx ();
19001
19002 code1 = code;
19003 code2 = swap_condition (code);
19004 code3 = unsigned_condition (code);
19005
19006 switch (code)
19007 {
19008 case LT: case GT: case LTU: case GTU:
19009 break;
19010
19011 case LE: code1 = LT; code2 = GT; break;
19012 case GE: code1 = GT; code2 = LT; break;
19013 case LEU: code1 = LTU; code2 = GTU; break;
19014 case GEU: code1 = GTU; code2 = LTU; break;
19015
19016 case EQ: code1 = UNKNOWN; code2 = NE; break;
19017 case NE: code2 = UNKNOWN; break;
19018
19019 default:
19020 gcc_unreachable ();
19021 }
19022
19023 /*
19024 * a < b =>
19025 * if (hi(a) < hi(b)) goto true;
19026 * if (hi(a) > hi(b)) goto false;
19027 * if (lo(a) < lo(b)) goto true;
19028 * false:
19029 */
19030
19031 if (code1 != UNKNOWN)
19032 ix86_expand_branch (code1, hi[0], hi[1], label);
19033 if (code2 != UNKNOWN)
19034 ix86_expand_branch (code2, hi[0], hi[1], label2);
19035
19036 ix86_expand_branch (code3, lo[0], lo[1], label);
19037
19038 if (code2 != UNKNOWN)
19039 emit_label (label2);
19040 return;
19041 }
19042
19043 default:
19044 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19045 goto simple;
19046 }
19047 }
19048
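/* For instance, a DImode equality test on -m32 goes through the xor/ior
   shortcut above and becomes, roughly,
       xorl hi(b), hi(a) ; xorl lo(b), lo(a) ; orl lo(a), hi(a) ; je label
   while the relational cases fall back to the two-or-three-jump sequence
   sketched in the comment inside the function.  */
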
19049 /* Split branch based on floating point condition. */
19050 void
19051 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19052 rtx target1, rtx target2, rtx tmp, rtx pushed)
19053 {
19054 rtx condition;
19055 rtx i;
19056
19057 if (target2 != pc_rtx)
19058 {
19059 rtx tmp = target2;
19060 code = reverse_condition_maybe_unordered (code);
19061 target2 = target1;
19062 target1 = tmp;
19063 }
19064
19065 condition = ix86_expand_fp_compare (code, op1, op2,
19066 tmp);
19067
19068 /* Remove pushed operand from stack. */
19069 if (pushed)
19070 ix86_free_from_memory (GET_MODE (pushed));
19071
19072 i = emit_jump_insn (gen_rtx_SET
19073 (VOIDmode, pc_rtx,
19074 gen_rtx_IF_THEN_ELSE (VOIDmode,
19075 condition, target1, target2)));
19076 if (split_branch_probability >= 0)
19077 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19078 }
19079
19080 void
19081 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19082 {
19083 rtx ret;
19084
19085 gcc_assert (GET_MODE (dest) == QImode);
19086
19087 ret = ix86_expand_compare (code, op0, op1);
19088 PUT_MODE (ret, QImode);
19089 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19090 }
19091
19092 /* Expand a comparison setting or clearing the carry flag. Return true when
19093 successful and set *POP to the comparison operation. */
19094 static bool
19095 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19096 {
19097 enum machine_mode mode =
19098 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19099
19100 /* Do not handle double-mode compares that go through the special path. */
19101 if (mode == (TARGET_64BIT ? TImode : DImode))
19102 return false;
19103
19104 if (SCALAR_FLOAT_MODE_P (mode))
19105 {
19106 rtx compare_op, compare_seq;
19107
19108 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19109
19110 /* Shortcut: the following common codes never translate
19111 into carry flag compares. */
19112 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19113 || code == ORDERED || code == UNORDERED)
19114 return false;
19115
19116 /* These comparisons require the zero flag; swap the operands so they won't. */
19117 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19118 && !TARGET_IEEE_FP)
19119 {
19120 rtx tmp = op0;
19121 op0 = op1;
19122 op1 = tmp;
19123 code = swap_condition (code);
19124 }
19125
19126 /* Try to expand the comparison and verify that we end up with
19127 a carry-flag-based comparison. This fails to be true only when
19128 we decide to expand the comparison using arithmetic, which is not
19129 a very common scenario. */
19130 start_sequence ();
19131 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19132 compare_seq = get_insns ();
19133 end_sequence ();
19134
19135 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19136 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19137 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19138 else
19139 code = GET_CODE (compare_op);
19140
19141 if (code != LTU && code != GEU)
19142 return false;
19143
19144 emit_insn (compare_seq);
19145 *pop = compare_op;
19146 return true;
19147 }
19148
19149 if (!INTEGRAL_MODE_P (mode))
19150 return false;
19151
19152 switch (code)
19153 {
19154 case LTU:
19155 case GEU:
19156 break;
19157
19158 /* Convert a==0 into (unsigned)a<1. */
19159 case EQ:
19160 case NE:
19161 if (op1 != const0_rtx)
19162 return false;
19163 op1 = const1_rtx;
19164 code = (code == EQ ? LTU : GEU);
19165 break;
19166
19167 /* Convert a>b into b<a or a>=b+1. */
19168 case GTU:
19169 case LEU:
19170 if (CONST_INT_P (op1))
19171 {
19172 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19173 /* Bail out on overflow. We could still swap the operands, but that
19174 would force loading the constant into a register. */
19175 if (op1 == const0_rtx
19176 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19177 return false;
19178 code = (code == GTU ? GEU : LTU);
19179 }
19180 else
19181 {
19182 rtx tmp = op1;
19183 op1 = op0;
19184 op0 = tmp;
19185 code = (code == GTU ? LTU : GEU);
19186 }
19187 break;
19188
19189 /* Convert a>=0 into (unsigned)a<0x80000000. */
19190 case LT:
19191 case GE:
19192 if (mode == DImode || op1 != const0_rtx)
19193 return false;
19194 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19195 code = (code == LT ? GEU : LTU);
19196 break;
19197 case LE:
19198 case GT:
19199 if (mode == DImode || op1 != constm1_rtx)
19200 return false;
19201 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19202 code = (code == LE ? GEU : LTU);
19203 break;
19204
19205 default:
19206 return false;
19207 }
19208 /* Swapping operands may cause a constant to appear as the first operand. */
19209 if (!nonimmediate_operand (op0, VOIDmode))
19210 {
19211 if (!can_create_pseudo_p ())
19212 return false;
19213 op0 = force_reg (mode, op0);
19214 }
19215 *pop = ix86_expand_compare (code, op0, op1);
19216 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19217 return true;
19218 }
19219
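/* For example, "a == 0" is rewritten above as (unsigned) a < 1, i.e. a
   "cmpl $1, a" whose carry flag is set exactly when a is zero; the caller
   can then consume that flag directly with an sbb/adc based sequence
   instead of materializing a setcc result.  */
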
19220 bool
19221 ix86_expand_int_movcc (rtx operands[])
19222 {
19223 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19224 rtx compare_seq, compare_op;
19225 enum machine_mode mode = GET_MODE (operands[0]);
19226 bool sign_bit_compare_p = false;
19227 rtx op0 = XEXP (operands[1], 0);
19228 rtx op1 = XEXP (operands[1], 1);
19229
19230 if (GET_MODE (op0) == TImode
19231 || (GET_MODE (op0) == DImode
19232 && !TARGET_64BIT))
19233 return false;
19234
19235 start_sequence ();
19236 compare_op = ix86_expand_compare (code, op0, op1);
19237 compare_seq = get_insns ();
19238 end_sequence ();
19239
19240 compare_code = GET_CODE (compare_op);
19241
19242 if ((op1 == const0_rtx && (code == GE || code == LT))
19243 || (op1 == constm1_rtx && (code == GT || code == LE)))
19244 sign_bit_compare_p = true;
19245
19246 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19247 HImode insns, we'd be swallowed in word prefix ops. */
19248
19249 if ((mode != HImode || TARGET_FAST_PREFIX)
19250 && (mode != (TARGET_64BIT ? TImode : DImode))
19251 && CONST_INT_P (operands[2])
19252 && CONST_INT_P (operands[3]))
19253 {
19254 rtx out = operands[0];
19255 HOST_WIDE_INT ct = INTVAL (operands[2]);
19256 HOST_WIDE_INT cf = INTVAL (operands[3]);
19257 HOST_WIDE_INT diff;
19258
19259 diff = ct - cf;
19260 /* Sign bit compares are better done using shifts than by using
19261 sbb. */
19262 if (sign_bit_compare_p
19263 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19264 {
19265 /* Detect overlap between destination and compare sources. */
19266 rtx tmp = out;
19267
19268 if (!sign_bit_compare_p)
19269 {
19270 rtx flags;
19271 bool fpcmp = false;
19272
19273 compare_code = GET_CODE (compare_op);
19274
19275 flags = XEXP (compare_op, 0);
19276
19277 if (GET_MODE (flags) == CCFPmode
19278 || GET_MODE (flags) == CCFPUmode)
19279 {
19280 fpcmp = true;
19281 compare_code
19282 = ix86_fp_compare_code_to_integer (compare_code);
19283 }
19284
19285 /* To simplify the rest of the code, restrict to the GEU case. */
19286 if (compare_code == LTU)
19287 {
19288 HOST_WIDE_INT tmp = ct;
19289 ct = cf;
19290 cf = tmp;
19291 compare_code = reverse_condition (compare_code);
19292 code = reverse_condition (code);
19293 }
19294 else
19295 {
19296 if (fpcmp)
19297 PUT_CODE (compare_op,
19298 reverse_condition_maybe_unordered
19299 (GET_CODE (compare_op)));
19300 else
19301 PUT_CODE (compare_op,
19302 reverse_condition (GET_CODE (compare_op)));
19303 }
19304 diff = ct - cf;
19305
19306 if (reg_overlap_mentioned_p (out, op0)
19307 || reg_overlap_mentioned_p (out, op1))
19308 tmp = gen_reg_rtx (mode);
19309
19310 if (mode == DImode)
19311 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19312 else
19313 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19314 flags, compare_op));
19315 }
19316 else
19317 {
19318 if (code == GT || code == GE)
19319 code = reverse_condition (code);
19320 else
19321 {
19322 HOST_WIDE_INT tmp = ct;
19323 ct = cf;
19324 cf = tmp;
19325 diff = ct - cf;
19326 }
19327 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19328 }
19329
19330 if (diff == 1)
19331 {
19332 /*
19333 * cmpl op0,op1
19334 * sbbl dest,dest
19335 * [addl dest, ct]
19336 *
19337 * Size 5 - 8.
19338 */
19339 if (ct)
19340 tmp = expand_simple_binop (mode, PLUS,
19341 tmp, GEN_INT (ct),
19342 copy_rtx (tmp), 1, OPTAB_DIRECT);
19343 }
19344 else if (cf == -1)
19345 {
19346 /*
19347 * cmpl op0,op1
19348 * sbbl dest,dest
19349 * orl $ct, dest
19350 *
19351 * Size 8.
19352 */
19353 tmp = expand_simple_binop (mode, IOR,
19354 tmp, GEN_INT (ct),
19355 copy_rtx (tmp), 1, OPTAB_DIRECT);
19356 }
19357 else if (diff == -1 && ct)
19358 {
19359 /*
19360 * cmpl op0,op1
19361 * sbbl dest,dest
19362 * notl dest
19363 * [addl dest, cf]
19364 *
19365 * Size 8 - 11.
19366 */
19367 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19368 if (cf)
19369 tmp = expand_simple_binop (mode, PLUS,
19370 copy_rtx (tmp), GEN_INT (cf),
19371 copy_rtx (tmp), 1, OPTAB_DIRECT);
19372 }
19373 else
19374 {
19375 /*
19376 * cmpl op0,op1
19377 * sbbl dest,dest
19378 * [notl dest]
19379 * andl cf - ct, dest
19380 * [addl dest, ct]
19381 *
19382 * Size 8 - 11.
19383 */
19384
19385 if (cf == 0)
19386 {
19387 cf = ct;
19388 ct = 0;
19389 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19390 }
19391
19392 tmp = expand_simple_binop (mode, AND,
19393 copy_rtx (tmp),
19394 gen_int_mode (cf - ct, mode),
19395 copy_rtx (tmp), 1, OPTAB_DIRECT);
19396 if (ct)
19397 tmp = expand_simple_binop (mode, PLUS,
19398 copy_rtx (tmp), GEN_INT (ct),
19399 copy_rtx (tmp), 1, OPTAB_DIRECT);
19400 }
19401
19402 if (!rtx_equal_p (tmp, out))
19403 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19404
19405 return true;
19406 }
19407
19408 if (diff < 0)
19409 {
19410 enum machine_mode cmp_mode = GET_MODE (op0);
19411
19412 HOST_WIDE_INT tmp;
19413 tmp = ct, ct = cf, cf = tmp;
19414 diff = -diff;
19415
19416 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19417 {
19418 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19419
19420 /* We may be reversing an unordered compare to a normal compare, which
19421 is not valid in general (we may convert a non-trapping condition
19422 to a trapping one); however, on i386 we currently emit all
19423 comparisons unordered. */
19424 compare_code = reverse_condition_maybe_unordered (compare_code);
19425 code = reverse_condition_maybe_unordered (code);
19426 }
19427 else
19428 {
19429 compare_code = reverse_condition (compare_code);
19430 code = reverse_condition (code);
19431 }
19432 }
19433
19434 compare_code = UNKNOWN;
19435 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19436 && CONST_INT_P (op1))
19437 {
19438 if (op1 == const0_rtx
19439 && (code == LT || code == GE))
19440 compare_code = code;
19441 else if (op1 == constm1_rtx)
19442 {
19443 if (code == LE)
19444 compare_code = LT;
19445 else if (code == GT)
19446 compare_code = GE;
19447 }
19448 }
19449
19450 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19451 if (compare_code != UNKNOWN
19452 && GET_MODE (op0) == GET_MODE (out)
19453 && (cf == -1 || ct == -1))
19454 {
19455 /* If the lea code below could be used, only optimize
19456 if it results in a 2-insn sequence. */
19457
19458 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19459 || diff == 3 || diff == 5 || diff == 9)
19460 || (compare_code == LT && ct == -1)
19461 || (compare_code == GE && cf == -1))
19462 {
19463 /*
19464 * notl op1 (if necessary)
19465 * sarl $31, op1
19466 * orl cf, op1
19467 */
19468 if (ct != -1)
19469 {
19470 cf = ct;
19471 ct = -1;
19472 code = reverse_condition (code);
19473 }
19474
19475 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19476
19477 out = expand_simple_binop (mode, IOR,
19478 out, GEN_INT (cf),
19479 out, 1, OPTAB_DIRECT);
19480 if (out != operands[0])
19481 emit_move_insn (operands[0], out);
19482
19483 return true;
19484 }
19485 }
19486
19487
19488 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19489 || diff == 3 || diff == 5 || diff == 9)
19490 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19491 && (mode != DImode
19492 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19493 {
19494 /*
19495 * xorl dest,dest
19496 * cmpl op1,op2
19497 * setcc dest
19498 * lea cf(dest*(ct-cf)),dest
19499 *
19500 * Size 14.
19501 *
19502 * This also catches the degenerate setcc-only case.
19503 */
19504
19505 rtx tmp;
19506 int nops;
19507
19508 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19509
19510 nops = 0;
19511 /* On x86_64 the lea instruction operates on Pmode, so we need
19512 to do the arithmetic in the proper mode to match. */
19513 if (diff == 1)
19514 tmp = copy_rtx (out);
19515 else
19516 {
19517 rtx out1;
19518 out1 = copy_rtx (out);
19519 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19520 nops++;
19521 if (diff & 1)
19522 {
19523 tmp = gen_rtx_PLUS (mode, tmp, out1);
19524 nops++;
19525 }
19526 }
19527 if (cf != 0)
19528 {
19529 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19530 nops++;
19531 }
19532 if (!rtx_equal_p (tmp, out))
19533 {
19534 if (nops == 1)
19535 out = force_operand (tmp, copy_rtx (out));
19536 else
19537 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19538 }
19539 if (!rtx_equal_p (out, operands[0]))
19540 emit_move_insn (operands[0], copy_rtx (out));
19541
19542 return true;
19543 }
19544
19545 /*
19546 * General case: Jumpful:
19547 * xorl dest,dest cmpl op1, op2
19548 * cmpl op1, op2 movl ct, dest
19549 * setcc dest jcc 1f
19550 * decl dest movl cf, dest
19551 * andl (cf-ct),dest 1:
19552 * addl ct,dest
19553 *
19554 * Size 20. Size 14.
19555 *
19556 * This is reasonably steep, but branch mispredict costs are
19557 * high on modern CPUs, so consider failing only if optimizing
19558 * for space.
19559 */
19560
19561 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19562 && BRANCH_COST (optimize_insn_for_speed_p (),
19563 false) >= 2)
19564 {
19565 if (cf == 0)
19566 {
19567 enum machine_mode cmp_mode = GET_MODE (op0);
19568
19569 cf = ct;
19570 ct = 0;
19571
19572 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19573 {
19574 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19575
19576 /* We may be reversing an unordered compare to a normal compare,
19577 which is not valid in general (we may convert a non-trapping
19578 condition to a trapping one); however, on i386 we currently
19579 emit all comparisons unordered. */
19580 code = reverse_condition_maybe_unordered (code);
19581 }
19582 else
19583 {
19584 code = reverse_condition (code);
19585 if (compare_code != UNKNOWN)
19586 compare_code = reverse_condition (compare_code);
19587 }
19588 }
19589
19590 if (compare_code != UNKNOWN)
19591 {
19592 /* notl op1 (if needed)
19593 sarl $31, op1
19594 andl (cf-ct), op1
19595 addl ct, op1
19596
19597 For x < 0 (resp. x <= -1) there will be no notl,
19598 so if possible swap the constants to get rid of the
19599 complement.
19600 True/false will be -1/0 while code below (store flag
19601 followed by decrement) is 0/-1, so the constants need
19602 to be exchanged once more. */
19603
19604 if (compare_code == GE || !cf)
19605 {
19606 code = reverse_condition (code);
19607 compare_code = LT;
19608 }
19609 else
19610 {
19611 HOST_WIDE_INT tmp = cf;
19612 cf = ct;
19613 ct = tmp;
19614 }
19615
19616 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19617 }
19618 else
19619 {
19620 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19621
19622 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19623 constm1_rtx,
19624 copy_rtx (out), 1, OPTAB_DIRECT);
19625 }
19626
19627 out = expand_simple_binop (mode, AND, copy_rtx (out),
19628 gen_int_mode (cf - ct, mode),
19629 copy_rtx (out), 1, OPTAB_DIRECT);
19630 if (ct)
19631 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19632 copy_rtx (out), 1, OPTAB_DIRECT);
19633 if (!rtx_equal_p (out, operands[0]))
19634 emit_move_insn (operands[0], copy_rtx (out));
19635
19636 return true;
19637 }
19638 }
19639
19640 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19641 {
19642 /* Try a few more things with specific constants and a variable. */
19643
19644 optab op;
19645 rtx var, orig_out, out, tmp;
19646
19647 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19648 return false;
19649
19650 /* If one of the two operands is an interesting constant, load a constant
19651 with the code above and mask the variable in with a logical operation. */
19652
19653 if (CONST_INT_P (operands[2]))
19654 {
19655 var = operands[3];
19656 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19657 operands[3] = constm1_rtx, op = and_optab;
19658 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19659 operands[3] = const0_rtx, op = ior_optab;
19660 else
19661 return false;
19662 }
19663 else if (CONST_INT_P (operands[3]))
19664 {
19665 var = operands[2];
19666 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19667 operands[2] = constm1_rtx, op = and_optab;
19668 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19669 operands[2] = const0_rtx, op = ior_optab;
19670 else
19671 return false;
19672 }
19673 else
19674 return false;
19675
19676 orig_out = operands[0];
19677 tmp = gen_reg_rtx (mode);
19678 operands[0] = tmp;
19679
19680 /* Recurse to get the constant loaded. */
19681 if (ix86_expand_int_movcc (operands) == 0)
19682 return false;
19683
19684 /* Mask in the interesting variable. */
19685 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19686 OPTAB_WIDEN);
19687 if (!rtx_equal_p (out, orig_out))
19688 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19689
19690 return true;
19691 }
19692
19693 /*
19694 * For comparison with above,
19695 *
19696 * movl cf,dest
19697 * movl ct,tmp
19698 * cmpl op1,op2
19699 * cmovcc tmp,dest
19700 *
19701 * Size 15.
19702 */
19703
19704 if (! nonimmediate_operand (operands[2], mode))
19705 operands[2] = force_reg (mode, operands[2]);
19706 if (! nonimmediate_operand (operands[3], mode))
19707 operands[3] = force_reg (mode, operands[3]);
19708
19709 if (! register_operand (operands[2], VOIDmode)
19710 && (mode == QImode
19711 || ! register_operand (operands[3], VOIDmode)))
19712 operands[2] = force_reg (mode, operands[2]);
19713
19714 if (mode == QImode
19715 && ! register_operand (operands[3], VOIDmode))
19716 operands[3] = force_reg (mode, operands[3]);
19717
19718 emit_insn (compare_seq);
19719 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19720 gen_rtx_IF_THEN_ELSE (mode,
19721 compare_op, operands[2],
19722 operands[3])));
19723 return true;
19724 }
19725
19726 /* Swap, force into registers, or otherwise massage the two operands
19727 to an sse comparison with a mask result. Thus we differ a bit from
19728 ix86_prepare_fp_compare_args which expects to produce a flags result.
19729
19730 The DEST operand exists to help determine whether to commute commutative
19731 operators. The POP0/POP1 operands are updated in place. The new
19732 comparison code is returned, or UNKNOWN if not implementable. */
19733
19734 static enum rtx_code
19735 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19736 rtx *pop0, rtx *pop1)
19737 {
19738 rtx tmp;
19739
19740 switch (code)
19741 {
19742 case LTGT:
19743 case UNEQ:
19744 /* AVX supports all the needed comparisons. */
19745 if (TARGET_AVX)
19746 break;
19747 /* We have no LTGT as an operator. We could implement it with
19748 NE & ORDERED, but this requires an extra temporary. It's
19749 not clear that it's worth it. */
19750 return UNKNOWN;
19751
19752 case LT:
19753 case LE:
19754 case UNGT:
19755 case UNGE:
19756 /* These are supported directly. */
19757 break;
19758
19759 case EQ:
19760 case NE:
19761 case UNORDERED:
19762 case ORDERED:
19763 /* AVX has 3-operand comparisons; no need to swap anything. */
19764 if (TARGET_AVX)
19765 break;
19766 /* For commutative operators, try to canonicalize the destination
19767 operand to be first in the comparison - this helps reload to
19768 avoid extra moves. */
19769 if (!dest || !rtx_equal_p (dest, *pop1))
19770 break;
19771 /* FALLTHRU */
19772
19773 case GE:
19774 case GT:
19775 case UNLE:
19776 case UNLT:
19777 /* These are not supported directly before AVX, and furthermore
19778 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19779 comparison operands to transform into something that is
19780 supported. */
19781 tmp = *pop0;
19782 *pop0 = *pop1;
19783 *pop1 = tmp;
19784 code = swap_condition (code);
19785 break;
19786
19787 default:
19788 gcc_unreachable ();
19789 }
19790
19791 return code;
19792 }
19793
19794 /* Detect conditional moves that exactly match min/max operational
19795 semantics. Note that this is IEEE safe, as long as we don't
19796 interchange the operands.
19797
19798 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19799 and TRUE if the operation is successful and instructions are emitted. */
19800
19801 static bool
19802 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19803 rtx cmp_op1, rtx if_true, rtx if_false)
19804 {
19805 enum machine_mode mode;
19806 bool is_min;
19807 rtx tmp;
19808
19809 if (code == LT)
19810 ;
19811 else if (code == UNGE)
19812 {
19813 tmp = if_true;
19814 if_true = if_false;
19815 if_false = tmp;
19816 }
19817 else
19818 return false;
19819
19820 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19821 is_min = true;
19822 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19823 is_min = false;
19824 else
19825 return false;
19826
19827 mode = GET_MODE (dest);
19828
19829 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19830 but MODE may be a vector mode and thus not appropriate. */
19831 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19832 {
19833 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19834 rtvec v;
19835
19836 if_true = force_reg (mode, if_true);
19837 v = gen_rtvec (2, if_true, if_false);
19838 tmp = gen_rtx_UNSPEC (mode, v, u);
19839 }
19840 else
19841 {
19842 code = is_min ? SMIN : SMAX;
19843 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19844 }
19845
19846 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19847 return true;
19848 }
19849
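/* Note that the UNSPEC_IEEE_MIN/UNSPEC_IEEE_MAX path above keeps the
   original operand order, which is what lets "a < b ? a : b" map safely
   onto minss/minps (those return the second source operand when either
   input is a NaN or both inputs are zero); the plain SMIN/SMAX path is
   only used when both -ffinite-math-only and -funsafe-math-optimizations
   are in effect.  */
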
19850 /* Expand an sse vector comparison. Return the register with the result. */
19851
19852 static rtx
19853 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19854 rtx op_true, rtx op_false)
19855 {
19856 enum machine_mode mode = GET_MODE (dest);
19857 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19858 rtx x;
19859
19860 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19861 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19862 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19863
19864 if (optimize
19865 || reg_overlap_mentioned_p (dest, op_true)
19866 || reg_overlap_mentioned_p (dest, op_false))
19867 dest = gen_reg_rtx (mode);
19868
19869 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19870 if (cmp_mode != mode)
19871 {
19872 x = force_reg (cmp_mode, x);
19873 convert_move (dest, x, false);
19874 }
19875 else
19876 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19877
19878 return dest;
19879 }
19880
19881 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19882 operations. This is used for both scalar and vector conditional moves. */
19883
19884 static void
19885 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19886 {
19887 enum machine_mode mode = GET_MODE (dest);
19888 rtx t2, t3, x;
19889
19890 if (vector_all_ones_operand (op_true, mode)
19891 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19892 {
19893 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19894 }
19895 else if (op_false == CONST0_RTX (mode))
19896 {
19897 op_true = force_reg (mode, op_true);
19898 x = gen_rtx_AND (mode, cmp, op_true);
19899 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19900 }
19901 else if (op_true == CONST0_RTX (mode))
19902 {
19903 op_false = force_reg (mode, op_false);
19904 x = gen_rtx_NOT (mode, cmp);
19905 x = gen_rtx_AND (mode, x, op_false);
19906 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19907 }
19908 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19909 {
19910 op_false = force_reg (mode, op_false);
19911 x = gen_rtx_IOR (mode, cmp, op_false);
19912 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19913 }
19914 else if (TARGET_XOP)
19915 {
19916 op_true = force_reg (mode, op_true);
19917
19918 if (!nonimmediate_operand (op_false, mode))
19919 op_false = force_reg (mode, op_false);
19920
19921 emit_insn (gen_rtx_SET (mode, dest,
19922 gen_rtx_IF_THEN_ELSE (mode, cmp,
19923 op_true,
19924 op_false)));
19925 }
19926 else
19927 {
19928 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19929
19930 if (!nonimmediate_operand (op_true, mode))
19931 op_true = force_reg (mode, op_true);
19932
19933 op_false = force_reg (mode, op_false);
19934
19935 switch (mode)
19936 {
19937 case V4SFmode:
19938 if (TARGET_SSE4_1)
19939 gen = gen_sse4_1_blendvps;
19940 break;
19941 case V2DFmode:
19942 if (TARGET_SSE4_1)
19943 gen = gen_sse4_1_blendvpd;
19944 break;
19945 case V16QImode:
19946 case V8HImode:
19947 case V4SImode:
19948 case V2DImode:
19949 if (TARGET_SSE4_1)
19950 {
19951 gen = gen_sse4_1_pblendvb;
19952 dest = gen_lowpart (V16QImode, dest);
19953 op_false = gen_lowpart (V16QImode, op_false);
19954 op_true = gen_lowpart (V16QImode, op_true);
19955 cmp = gen_lowpart (V16QImode, cmp);
19956 }
19957 break;
19958 case V8SFmode:
19959 if (TARGET_AVX)
19960 gen = gen_avx_blendvps256;
19961 break;
19962 case V4DFmode:
19963 if (TARGET_AVX)
19964 gen = gen_avx_blendvpd256;
19965 break;
19966 case V32QImode:
19967 case V16HImode:
19968 case V8SImode:
19969 case V4DImode:
19970 if (TARGET_AVX2)
19971 {
19972 gen = gen_avx2_pblendvb;
19973 dest = gen_lowpart (V32QImode, dest);
19974 op_false = gen_lowpart (V32QImode, op_false);
19975 op_true = gen_lowpart (V32QImode, op_true);
19976 cmp = gen_lowpart (V32QImode, cmp);
19977 }
19978 break;
19979 default:
19980 break;
19981 }
19982
19983 if (gen != NULL)
19984 emit_insn (gen (dest, op_false, op_true, cmp));
19985 else
19986 {
19987 op_true = force_reg (mode, op_true);
19988
19989 t2 = gen_reg_rtx (mode);
19990 if (optimize)
19991 t3 = gen_reg_rtx (mode);
19992 else
19993 t3 = dest;
19994
19995 x = gen_rtx_AND (mode, op_true, cmp);
19996 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19997
19998 x = gen_rtx_NOT (mode, cmp);
19999 x = gen_rtx_AND (mode, x, op_false);
20000 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20001
20002 x = gen_rtx_IOR (mode, t3, t2);
20003 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20004 }
20005 }
20006 }
20007
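/* In the general case this computes dest = (cmp & op_true) | (~cmp &
   op_false), i.e. an and/andnot/ior triple on the mask produced by
   ix86_expand_sse_cmp; when SSE4.1 or AVX/AVX2 blend instructions are
   available the whole select collapses into a single
   blendvps/blendvpd/pblendvb (or their 256-bit forms) as chosen above.  */
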
20008 /* Expand a floating-point conditional move. Return true if successful. */
20009
20010 bool
20011 ix86_expand_fp_movcc (rtx operands[])
20012 {
20013 enum machine_mode mode = GET_MODE (operands[0]);
20014 enum rtx_code code = GET_CODE (operands[1]);
20015 rtx tmp, compare_op;
20016 rtx op0 = XEXP (operands[1], 0);
20017 rtx op1 = XEXP (operands[1], 1);
20018
20019 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20020 {
20021 enum machine_mode cmode;
20022
20023 /* Since we have no cmove for sse registers, don't force bad register
20024 allocation just to gain access to it. Deny movcc when the
20025 comparison mode doesn't match the move mode. */
20026 cmode = GET_MODE (op0);
20027 if (cmode == VOIDmode)
20028 cmode = GET_MODE (op1);
20029 if (cmode != mode)
20030 return false;
20031
20032 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20033 if (code == UNKNOWN)
20034 return false;
20035
20036 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20037 operands[2], operands[3]))
20038 return true;
20039
20040 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20041 operands[2], operands[3]);
20042 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20043 return true;
20044 }
20045
20046 /* The floating point conditional move instructions don't directly
20047 support conditions resulting from a signed integer comparison. */
20048
20049 compare_op = ix86_expand_compare (code, op0, op1);
20050 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20051 {
20052 tmp = gen_reg_rtx (QImode);
20053 ix86_expand_setcc (tmp, code, op0, op1);
20054
20055 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20056 }
20057
20058 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20059 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20060 operands[2], operands[3])));
20061
20062 return true;
20063 }
20064
20065 /* Expand a floating-point vector conditional move; a vcond operation
20066 rather than a movcc operation. */
20067
20068 bool
20069 ix86_expand_fp_vcond (rtx operands[])
20070 {
20071 enum rtx_code code = GET_CODE (operands[3]);
20072 rtx cmp;
20073
20074 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20075 &operands[4], &operands[5]);
20076 if (code == UNKNOWN)
20077 {
20078 rtx temp;
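	  /* LTGT and UNEQ have no direct SSE comparison; synthesize them
	     as ORDERED & NE and UNORDERED | EQ respectively.  */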
20079 switch (GET_CODE (operands[3]))
20080 {
20081 case LTGT:
20082 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20083 operands[5], operands[0], operands[0]);
20084 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20085 operands[5], operands[1], operands[2]);
20086 code = AND;
20087 break;
20088 case UNEQ:
20089 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20090 operands[5], operands[0], operands[0]);
20091 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20092 operands[5], operands[1], operands[2]);
20093 code = IOR;
20094 break;
20095 default:
20096 gcc_unreachable ();
20097 }
20098 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20099 OPTAB_DIRECT);
20100 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20101 return true;
20102 }
20103
20104 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20105 operands[5], operands[1], operands[2]))
20106 return true;
20107
20108 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20109 operands[1], operands[2]);
20110 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20111 return true;
20112 }
20113
20114 /* Expand a signed/unsigned integral vector conditional move. */
20115
20116 bool
20117 ix86_expand_int_vcond (rtx operands[])
20118 {
20119 enum machine_mode data_mode = GET_MODE (operands[0]);
20120 enum machine_mode mode = GET_MODE (operands[4]);
20121 enum rtx_code code = GET_CODE (operands[3]);
20122 bool negate = false;
20123 rtx x, cop0, cop1;
20124
20125 cop0 = operands[4];
20126 cop1 = operands[5];
20127
20128 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20129 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
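  /* The arithmetic right shift broadcasts the sign bit (giving -1 or 0),
     while the logical right shift leaves it in the low bit (giving 1 or 0).  */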
20130 if ((code == LT || code == GE)
20131 && data_mode == mode
20132 && cop1 == CONST0_RTX (mode)
20133 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20134 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20135 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20136 && (GET_MODE_SIZE (data_mode) == 16
20137 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20138 {
20139 rtx negop = operands[2 - (code == LT)];
20140 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20141 if (negop == CONST1_RTX (data_mode))
20142 {
20143 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20144 operands[0], 1, OPTAB_DIRECT);
20145 if (res != operands[0])
20146 emit_move_insn (operands[0], res);
20147 return true;
20148 }
20149 else if (GET_MODE_INNER (data_mode) != DImode
20150 && vector_all_ones_operand (negop, data_mode))
20151 {
20152 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20153 operands[0], 0, OPTAB_DIRECT);
20154 if (res != operands[0])
20155 emit_move_insn (operands[0], res);
20156 return true;
20157 }
20158 }
20159
20160 if (!nonimmediate_operand (cop1, mode))
20161 cop1 = force_reg (mode, cop1);
20162 if (!general_operand (operands[1], data_mode))
20163 operands[1] = force_reg (data_mode, operands[1]);
20164 if (!general_operand (operands[2], data_mode))
20165 operands[2] = force_reg (data_mode, operands[2]);
20166
20167 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20168 if (TARGET_XOP
20169 && (mode == V16QImode || mode == V8HImode
20170 || mode == V4SImode || mode == V2DImode))
20171 ;
20172 else
20173 {
20174 /* Canonicalize the comparison to EQ, GT, GTU. */
20175 switch (code)
20176 {
20177 case EQ:
20178 case GT:
20179 case GTU:
20180 break;
20181
20182 case NE:
20183 case LE:
20184 case LEU:
20185 code = reverse_condition (code);
20186 negate = true;
20187 break;
20188
20189 case GE:
20190 case GEU:
20191 code = reverse_condition (code);
20192 negate = true;
20193 /* FALLTHRU */
20194
20195 case LT:
20196 case LTU:
20197 code = swap_condition (code);
20198 x = cop0, cop0 = cop1, cop1 = x;
20199 break;
20200
20201 default:
20202 gcc_unreachable ();
20203 }
20204
20205 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20206 if (mode == V2DImode)
20207 {
20208 switch (code)
20209 {
20210 case EQ:
20211 /* SSE4.1 supports EQ. */
20212 if (!TARGET_SSE4_1)
20213 return false;
20214 break;
20215
20216 case GT:
20217 case GTU:
20218 /* SSE4.2 supports GT/GTU. */
20219 if (!TARGET_SSE4_2)
20220 return false;
20221 break;
20222
20223 default:
20224 gcc_unreachable ();
20225 }
20226 }
20227
20228 /* Unsigned parallel compare is not supported by the hardware.
20229 Play some tricks to turn this into a signed comparison
20230 against 0. */
20231 if (code == GTU)
20232 {
20233 cop0 = force_reg (mode, cop0);
20234
20235 switch (mode)
20236 {
20237 case V8SImode:
20238 case V4DImode:
20239 case V4SImode:
20240 case V2DImode:
20241 {
20242 rtx t1, t2, mask;
20243 rtx (*gen_sub3) (rtx, rtx, rtx);
20244
20245 switch (mode)
20246 {
20247 case V8SImode: gen_sub3 = gen_subv8si3; break;
20248 case V4DImode: gen_sub3 = gen_subv4di3; break;
20249 case V4SImode: gen_sub3 = gen_subv4si3; break;
20250 case V2DImode: gen_sub3 = gen_subv2di3; break;
20251 default:
20252 gcc_unreachable ();
20253 }
20254 /* Subtract (-(INT MAX) - 1) from both operands to make
20255 them signed. */
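		/* (Subtracting INT_MIN just flips the sign bit of every
		   element, so a >u b becomes (a ^ msb) >s (b ^ msb).)  */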
20256 mask = ix86_build_signbit_mask (mode, true, false);
20257 t1 = gen_reg_rtx (mode);
20258 emit_insn (gen_sub3 (t1, cop0, mask));
20259
20260 t2 = gen_reg_rtx (mode);
20261 emit_insn (gen_sub3 (t2, cop1, mask));
20262
20263 cop0 = t1;
20264 cop1 = t2;
20265 code = GT;
20266 }
20267 break;
20268
20269 case V32QImode:
20270 case V16HImode:
20271 case V16QImode:
20272 case V8HImode:
20273 /* Perform a parallel unsigned saturating subtraction. */
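	      /* a >u b iff the saturating difference a -us b is nonzero,
		 so compare it against zero with EQ and flip NEGATE.  */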
20274 x = gen_reg_rtx (mode);
20275 emit_insn (gen_rtx_SET (VOIDmode, x,
20276 gen_rtx_US_MINUS (mode, cop0, cop1)));
20277
20278 cop0 = x;
20279 cop1 = CONST0_RTX (mode);
20280 code = EQ;
20281 negate = !negate;
20282 break;
20283
20284 default:
20285 gcc_unreachable ();
20286 }
20287 }
20288 }
20289
20290 /* Allow the comparison to be done in one mode, but the movcc to
20291 happen in another mode. */
20292 if (data_mode == mode)
20293 {
20294 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20295 operands[1+negate], operands[2-negate]);
20296 }
20297 else
20298 {
20299 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20300 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20301 code, cop0, cop1,
20302 operands[1+negate], operands[2-negate]);
20303 x = gen_lowpart (data_mode, x);
20304 }
20305
20306 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20307 operands[2-negate]);
20308 return true;
20309 }
20310
20311 /* Expand a variable vector permutation. */
20312
20313 void
20314 ix86_expand_vec_perm (rtx operands[])
20315 {
20316 rtx target = operands[0];
20317 rtx op0 = operands[1];
20318 rtx op1 = operands[2];
20319 rtx mask = operands[3];
20320 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20321 enum machine_mode mode = GET_MODE (op0);
20322 enum machine_mode maskmode = GET_MODE (mask);
20323 int w, e, i;
20324 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20325
20326 /* Number of elements in the vector. */
20327 w = GET_MODE_NUNITS (mode);
20328 e = GET_MODE_UNIT_SIZE (mode);
20329 gcc_assert (w <= 32);
20330
20331 if (TARGET_AVX2)
20332 {
20333 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20334 {
20335 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20336 	     a constant shuffle operand.  With a tiny bit of effort we can
20337 use VPERMD instead. A re-interpretation stall for V4DFmode is
20338 unfortunate but there's no avoiding it.
20339 Similarly for V16HImode we don't have instructions for variable
20340 	     shuffling, while for V32QImode we can, after preparing suitable
20341 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
20342
20343 if (mode == V16HImode)
20344 {
20345 maskmode = mode = V32QImode;
20346 w = 32;
20347 e = 1;
20348 }
20349 else
20350 {
20351 maskmode = mode = V8SImode;
20352 w = 8;
20353 e = 4;
20354 }
20355 t1 = gen_reg_rtx (maskmode);
20356
20357 /* Replicate the low bits of the V4DImode mask into V8SImode:
20358 mask = { A B C D }
20359 t1 = { A A B B C C D D }. */
20360 for (i = 0; i < w / 2; ++i)
20361 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20362 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20363 vt = force_reg (maskmode, vt);
20364 mask = gen_lowpart (maskmode, mask);
20365 if (maskmode == V8SImode)
20366 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20367 else
20368 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20369
20370 	  /* Multiply the shuffle indices by two.  */
20371 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20372 OPTAB_DIRECT);
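	  /* (T1 + T1 doubles each index; addition also works for the
	     byte-element V32QImode mask, which has no per-element shift.)  */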
20373
20374 	  /* Add one to the odd shuffle indices:
20375 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20376 for (i = 0; i < w / 2; ++i)
20377 {
20378 vec[i * 2] = const0_rtx;
20379 vec[i * 2 + 1] = const1_rtx;
20380 }
20381 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20382 vt = force_const_mem (maskmode, vt);
20383 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20384 OPTAB_DIRECT);
20385
20386 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20387 operands[3] = mask = t1;
20388 target = gen_lowpart (mode, target);
20389 op0 = gen_lowpart (mode, op0);
20390 op1 = gen_lowpart (mode, op1);
20391 }
20392
20393 switch (mode)
20394 {
20395 case V8SImode:
20396 /* The VPERMD and VPERMPS instructions already properly ignore
20397 the high bits of the shuffle elements. No need for us to
20398 perform an AND ourselves. */
20399 if (one_operand_shuffle)
20400 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20401 else
20402 {
20403 t1 = gen_reg_rtx (V8SImode);
20404 t2 = gen_reg_rtx (V8SImode);
20405 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20406 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20407 goto merge_two;
20408 }
20409 return;
20410
20411 case V8SFmode:
20412 mask = gen_lowpart (V8SFmode, mask);
20413 if (one_operand_shuffle)
20414 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20415 else
20416 {
20417 t1 = gen_reg_rtx (V8SFmode);
20418 t2 = gen_reg_rtx (V8SFmode);
20419 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20420 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20421 goto merge_two;
20422 }
20423 return;
20424
20425 case V4SImode:
20426 /* By combining the two 128-bit input vectors into one 256-bit
20427 input vector, we can use VPERMD and VPERMPS for the full
20428 two-operand shuffle. */
20429 t1 = gen_reg_rtx (V8SImode);
20430 t2 = gen_reg_rtx (V8SImode);
20431 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20432 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20433 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20434 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20435 return;
20436
20437 case V4SFmode:
20438 t1 = gen_reg_rtx (V8SFmode);
20439 t2 = gen_reg_rtx (V8SImode);
20440 mask = gen_lowpart (V4SImode, mask);
20441 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20442 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20443 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20444 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20445 return;
20446
20447 case V32QImode:
20448 t1 = gen_reg_rtx (V32QImode);
20449 t2 = gen_reg_rtx (V32QImode);
20450 t3 = gen_reg_rtx (V32QImode);
20451 vt2 = GEN_INT (128);
20452 for (i = 0; i < 32; i++)
20453 vec[i] = vt2;
20454 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20455 vt = force_reg (V32QImode, vt);
20456 for (i = 0; i < 32; i++)
20457 vec[i] = i < 16 ? vt2 : const0_rtx;
20458 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20459 vt2 = force_reg (V32QImode, vt2);
20460 /* From mask create two adjusted masks, which contain the same
20461 bits as mask in the low 7 bits of each vector element.
20462 The first mask will have the most significant bit clear
20463 if it requests element from the same 128-bit lane
20464 and MSB set if it requests element from the other 128-bit lane.
20465 The second mask will have the opposite values of the MSB,
20466 and additionally will have its 128-bit lanes swapped.
20467 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20468 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20469 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20470 stands for other 12 bytes. */
20471 /* The bit whether element is from the same lane or the other
20472 lane is bit 4, so shift it up by 3 to the MSB position. */
20473 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20474 gen_lowpart (V4DImode, mask),
20475 GEN_INT (3)));
20476 /* Clear MSB bits from the mask just in case it had them set. */
20477 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20478 /* After this t1 will have MSB set for elements from other lane. */
20479 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20480 /* Clear bits other than MSB. */
20481 emit_insn (gen_andv32qi3 (t1, t1, vt));
20482 /* Or in the lower bits from mask into t3. */
20483 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20484 /* And invert MSB bits in t1, so MSB is set for elements from the same
20485 lane. */
20486 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20487 /* Swap 128-bit lanes in t3. */
20488 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20489 gen_lowpart (V4DImode, t3),
20490 const2_rtx, GEN_INT (3),
20491 const0_rtx, const1_rtx));
20492 /* And or in the lower bits from mask into t1. */
20493 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20494 if (one_operand_shuffle)
20495 {
20496 /* Each of these shuffles will put 0s in places where
20497 element from the other 128-bit lane is needed, otherwise
20498 will shuffle in the requested value. */
20499 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20500 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20501 /* For t3 the 128-bit lanes are swapped again. */
20502 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20503 gen_lowpart (V4DImode, t3),
20504 const2_rtx, GEN_INT (3),
20505 const0_rtx, const1_rtx));
20506 /* And oring both together leads to the result. */
20507 emit_insn (gen_iorv32qi3 (target, t1, t3));
20508 return;
20509 }
20510
20511 t4 = gen_reg_rtx (V32QImode);
20512       /* Similarly to the one_operand_shuffle code above,
20513 	 just repeated twice for each operand.  The merge_two:
20514 	 code will merge the two results together.  */
20515 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20516 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20517 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20518 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20519 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20520 gen_lowpart (V4DImode, t4),
20521 const2_rtx, GEN_INT (3),
20522 const0_rtx, const1_rtx));
20523 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20524 gen_lowpart (V4DImode, t3),
20525 const2_rtx, GEN_INT (3),
20526 const0_rtx, const1_rtx));
20527 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20528 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20529 t1 = t4;
20530 t2 = t3;
20531 goto merge_two;
20532
20533 default:
20534 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20535 break;
20536 }
20537 }
20538
20539 if (TARGET_XOP)
20540 {
20541 /* The XOP VPPERM insn supports three inputs. By ignoring the
20542 one_operand_shuffle special case, we avoid creating another
20543 set of constant vectors in memory. */
20544 one_operand_shuffle = false;
20545
20546 /* mask = mask & {2*w-1, ...} */
20547 vt = GEN_INT (2*w - 1);
20548 }
20549 else
20550 {
20551 /* mask = mask & {w-1, ...} */
20552 vt = GEN_INT (w - 1);
20553 }
20554
20555 for (i = 0; i < w; i++)
20556 vec[i] = vt;
20557 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20558 mask = expand_simple_binop (maskmode, AND, mask, vt,
20559 NULL_RTX, 0, OPTAB_DIRECT);
20560
20561 /* For non-QImode operations, convert the word permutation control
20562 into a byte permutation control. */
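  /* For example, with V4SImode a word index of 2 is scaled to 2*4 = 8,
     replicated by the pshufb below to { 8,8,8,8 }, and { 0,1,2,3 } is then
     added, giving the byte indices { 8,9,10,11 } of element 2.  */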
20563 if (mode != V16QImode)
20564 {
20565 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20566 GEN_INT (exact_log2 (e)),
20567 NULL_RTX, 0, OPTAB_DIRECT);
20568
20569 /* Convert mask to vector of chars. */
20570 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20571
20572 /* Replicate each of the input bytes into byte positions:
20573 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20574 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20575 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20576 for (i = 0; i < 16; ++i)
20577 vec[i] = GEN_INT (i/e * e);
20578 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20579 vt = force_const_mem (V16QImode, vt);
20580 if (TARGET_XOP)
20581 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20582 else
20583 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20584
20585 /* Convert it into the byte positions by doing
20586 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20587 for (i = 0; i < 16; ++i)
20588 vec[i] = GEN_INT (i % e);
20589 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20590 vt = force_const_mem (V16QImode, vt);
20591 emit_insn (gen_addv16qi3 (mask, mask, vt));
20592 }
20593
20594 /* The actual shuffle operations all operate on V16QImode. */
20595 op0 = gen_lowpart (V16QImode, op0);
20596 op1 = gen_lowpart (V16QImode, op1);
20597 target = gen_lowpart (V16QImode, target);
20598
20599 if (TARGET_XOP)
20600 {
20601 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20602 }
20603 else if (one_operand_shuffle)
20604 {
20605 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20606 }
20607 else
20608 {
20609 rtx xops[6];
20610 bool ok;
20611
20612 /* Shuffle the two input vectors independently. */
20613 t1 = gen_reg_rtx (V16QImode);
20614 t2 = gen_reg_rtx (V16QImode);
20615 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20616 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20617
20618 merge_two:
20619 /* Then merge them together. The key is whether any given control
20620 element contained a bit set that indicates the second word. */
20621 mask = operands[3];
20622 vt = GEN_INT (w);
20623 if (maskmode == V2DImode && !TARGET_SSE4_1)
20624 {
20625 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20626 more shuffle to convert the V2DI input mask into a V4SI
20627 input mask. At which point the masking that expand_int_vcond
20628 will work as desired. */
20629 rtx t3 = gen_reg_rtx (V4SImode);
20630 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20631 const0_rtx, const0_rtx,
20632 const2_rtx, const2_rtx));
20633 mask = t3;
20634 maskmode = V4SImode;
20635 e = w = 4;
20636 }
20637
20638 for (i = 0; i < w; i++)
20639 vec[i] = vt;
20640 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20641 vt = force_reg (maskmode, vt);
20642 mask = expand_simple_binop (maskmode, AND, mask, vt,
20643 NULL_RTX, 0, OPTAB_DIRECT);
20644
20645 xops[0] = gen_lowpart (mode, operands[0]);
20646 xops[1] = gen_lowpart (mode, t2);
20647 xops[2] = gen_lowpart (mode, t1);
20648 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20649 xops[4] = mask;
20650 xops[5] = vt;
20651 ok = ix86_expand_int_vcond (xops);
20652 gcc_assert (ok);
20653 }
20654 }
20655
20656 /* Unpack SRC into DEST, the next wider integer vector type.  UNSIGNED_P is
20657    true if we should do zero extension, else sign extension.  HIGH_P is
20658    true if we want the N/2 high elements, else the low elements.  */
20659
20660 void
20661 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20662 {
20663 enum machine_mode imode = GET_MODE (src);
20664 rtx tmp;
20665
20666 if (TARGET_SSE4_1)
20667 {
20668 rtx (*unpack)(rtx, rtx);
20669 rtx (*extract)(rtx, rtx) = NULL;
20670 enum machine_mode halfmode = BLKmode;
20671
20672 switch (imode)
20673 {
20674 case V32QImode:
20675 if (unsigned_p)
20676 unpack = gen_avx2_zero_extendv16qiv16hi2;
20677 else
20678 unpack = gen_avx2_sign_extendv16qiv16hi2;
20679 halfmode = V16QImode;
20680 extract
20681 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20682 break;
20683 case V16HImode:
20684 if (unsigned_p)
20685 unpack = gen_avx2_zero_extendv8hiv8si2;
20686 else
20687 unpack = gen_avx2_sign_extendv8hiv8si2;
20688 halfmode = V8HImode;
20689 extract
20690 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20691 break;
20692 case V8SImode:
20693 if (unsigned_p)
20694 unpack = gen_avx2_zero_extendv4siv4di2;
20695 else
20696 unpack = gen_avx2_sign_extendv4siv4di2;
20697 halfmode = V4SImode;
20698 extract
20699 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20700 break;
20701 case V16QImode:
20702 if (unsigned_p)
20703 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20704 else
20705 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20706 break;
20707 case V8HImode:
20708 if (unsigned_p)
20709 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20710 else
20711 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20712 break;
20713 case V4SImode:
20714 if (unsigned_p)
20715 unpack = gen_sse4_1_zero_extendv2siv2di2;
20716 else
20717 unpack = gen_sse4_1_sign_extendv2siv2di2;
20718 break;
20719 default:
20720 gcc_unreachable ();
20721 }
20722
20723 if (GET_MODE_SIZE (imode) == 32)
20724 {
20725 tmp = gen_reg_rtx (halfmode);
20726 emit_insn (extract (tmp, src));
20727 }
20728 else if (high_p)
20729 {
20730 /* Shift higher 8 bytes to lower 8 bytes. */
20731 tmp = gen_reg_rtx (imode);
20732 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20733 gen_lowpart (V1TImode, src),
20734 GEN_INT (64)));
20735 }
20736 else
20737 tmp = src;
20738
20739 emit_insn (unpack (dest, tmp));
20740 }
20741 else
20742 {
20743 rtx (*unpack)(rtx, rtx, rtx);
20744
20745 switch (imode)
20746 {
20747 case V16QImode:
20748 if (high_p)
20749 unpack = gen_vec_interleave_highv16qi;
20750 else
20751 unpack = gen_vec_interleave_lowv16qi;
20752 break;
20753 case V8HImode:
20754 if (high_p)
20755 unpack = gen_vec_interleave_highv8hi;
20756 else
20757 unpack = gen_vec_interleave_lowv8hi;
20758 break;
20759 case V4SImode:
20760 if (high_p)
20761 unpack = gen_vec_interleave_highv4si;
20762 else
20763 unpack = gen_vec_interleave_lowv4si;
20764 break;
20765 default:
20766 gcc_unreachable ();
20767 }
20768
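      /* For signed input, TMP = (0 > SRC) is all-ones exactly in the
	 negative elements, so interleaving SRC with TMP pairs each element
	 with its sign extension; for unsigned input we interleave with
	 zero.  */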
20769 if (unsigned_p)
20770 tmp = force_reg (imode, CONST0_RTX (imode));
20771 else
20772 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20773 src, pc_rtx, pc_rtx);
20774
20775 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20776 }
20777 }
20778
20779 /* Expand conditional increment or decrement using adc/sbb instructions.
20780 The default case using setcc followed by the conditional move can be
20781 done by generic code. */
20782 bool
20783 ix86_expand_int_addcc (rtx operands[])
20784 {
20785 enum rtx_code code = GET_CODE (operands[1]);
20786 rtx flags;
20787 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20788 rtx compare_op;
20789 rtx val = const0_rtx;
20790 bool fpcmp = false;
20791 enum machine_mode mode;
20792 rtx op0 = XEXP (operands[1], 0);
20793 rtx op1 = XEXP (operands[1], 1);
20794
20795 if (operands[3] != const1_rtx
20796 && operands[3] != constm1_rtx)
20797 return false;
20798 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20799 return false;
20800 code = GET_CODE (compare_op);
20801
20802 flags = XEXP (compare_op, 0);
20803
20804 if (GET_MODE (flags) == CCFPmode
20805 || GET_MODE (flags) == CCFPUmode)
20806 {
20807 fpcmp = true;
20808 code = ix86_fp_compare_code_to_integer (code);
20809 }
20810
20811 if (code != LTU)
20812 {
20813 val = constm1_rtx;
20814 if (fpcmp)
20815 PUT_CODE (compare_op,
20816 reverse_condition_maybe_unordered
20817 (GET_CODE (compare_op)));
20818 else
20819 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20820 }
20821
20822 mode = GET_MODE (operands[0]);
20823
20824 /* Construct either adc or sbb insn. */
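  /* For example, x = y + (a <u b) becomes, roughly, cmp a, b; adc x, 0:
     the compare leaves the carry flag set exactly when a <u b, and adc
     folds that carry into the addition.  */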
20825 if ((code == LTU) == (operands[3] == constm1_rtx))
20826 {
20827 switch (mode)
20828 {
20829 case QImode:
20830 insn = gen_subqi3_carry;
20831 break;
20832 case HImode:
20833 insn = gen_subhi3_carry;
20834 break;
20835 case SImode:
20836 insn = gen_subsi3_carry;
20837 break;
20838 case DImode:
20839 insn = gen_subdi3_carry;
20840 break;
20841 default:
20842 gcc_unreachable ();
20843 }
20844 }
20845 else
20846 {
20847 switch (mode)
20848 {
20849 case QImode:
20850 insn = gen_addqi3_carry;
20851 break;
20852 case HImode:
20853 insn = gen_addhi3_carry;
20854 break;
20855 case SImode:
20856 insn = gen_addsi3_carry;
20857 break;
20858 case DImode:
20859 insn = gen_adddi3_carry;
20860 break;
20861 default:
20862 gcc_unreachable ();
20863 }
20864 }
20865 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20866
20867 return true;
20868 }
20869
20870
20871 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
20872    but works for floating point parameters and non-offsettable memories.
20873    For pushes, it returns just stack offsets; the values will be saved
20874    in the right order.  Maximally four parts are generated.  */
20875
20876 static int
20877 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20878 {
20879 int size;
20880
20881 if (!TARGET_64BIT)
20882 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20883 else
20884 size = (GET_MODE_SIZE (mode) + 4) / 8;
20885
20886 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20887 gcc_assert (size >= 2 && size <= 4);
20888
20889 /* Optimize constant pool reference to immediates. This is used by fp
20890 moves, that force all constants to memory to allow combining. */
20891 if (MEM_P (operand) && MEM_READONLY_P (operand))
20892 {
20893 rtx tmp = maybe_get_pool_constant (operand);
20894 if (tmp)
20895 operand = tmp;
20896 }
20897
20898 if (MEM_P (operand) && !offsettable_memref_p (operand))
20899 {
20900 /* The only non-offsetable memories we handle are pushes. */
20901 int ok = push_operand (operand, VOIDmode);
20902
20903 gcc_assert (ok);
20904
20905 operand = copy_rtx (operand);
20906 PUT_MODE (operand, word_mode);
20907 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20908 return size;
20909 }
20910
20911 if (GET_CODE (operand) == CONST_VECTOR)
20912 {
20913 enum machine_mode imode = int_mode_for_mode (mode);
20914 /* Caution: if we looked through a constant pool memory above,
20915 the operand may actually have a different mode now. That's
20916 ok, since we want to pun this all the way back to an integer. */
20917 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20918 gcc_assert (operand != NULL);
20919 mode = imode;
20920 }
20921
20922 if (!TARGET_64BIT)
20923 {
20924 if (mode == DImode)
20925 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20926 else
20927 {
20928 int i;
20929
20930 if (REG_P (operand))
20931 {
20932 gcc_assert (reload_completed);
20933 for (i = 0; i < size; i++)
20934 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20935 }
20936 else if (offsettable_memref_p (operand))
20937 {
20938 operand = adjust_address (operand, SImode, 0);
20939 parts[0] = operand;
20940 for (i = 1; i < size; i++)
20941 parts[i] = adjust_address (operand, SImode, 4 * i);
20942 }
20943 else if (GET_CODE (operand) == CONST_DOUBLE)
20944 {
20945 REAL_VALUE_TYPE r;
20946 long l[4];
20947
20948 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20949 switch (mode)
20950 {
20951 case TFmode:
20952 real_to_target (l, &r, mode);
20953 parts[3] = gen_int_mode (l[3], SImode);
20954 parts[2] = gen_int_mode (l[2], SImode);
20955 break;
20956 case XFmode:
20957 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20958 long double may not be 80-bit. */
20959 real_to_target (l, &r, mode);
20960 parts[2] = gen_int_mode (l[2], SImode);
20961 break;
20962 case DFmode:
20963 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20964 break;
20965 default:
20966 gcc_unreachable ();
20967 }
20968 parts[1] = gen_int_mode (l[1], SImode);
20969 parts[0] = gen_int_mode (l[0], SImode);
20970 }
20971 else
20972 gcc_unreachable ();
20973 }
20974 }
20975 else
20976 {
20977 if (mode == TImode)
20978 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20979 if (mode == XFmode || mode == TFmode)
20980 {
20981 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20982 if (REG_P (operand))
20983 {
20984 gcc_assert (reload_completed);
20985 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20986 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20987 }
20988 else if (offsettable_memref_p (operand))
20989 {
20990 operand = adjust_address (operand, DImode, 0);
20991 parts[0] = operand;
20992 parts[1] = adjust_address (operand, upper_mode, 8);
20993 }
20994 else if (GET_CODE (operand) == CONST_DOUBLE)
20995 {
20996 REAL_VALUE_TYPE r;
20997 long l[4];
20998
20999 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21000 real_to_target (l, &r, mode);
21001
21002 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21003 if (HOST_BITS_PER_WIDE_INT >= 64)
21004 parts[0]
21005 = gen_int_mode
21006 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21007 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21008 DImode);
21009 else
21010 parts[0] = immed_double_const (l[0], l[1], DImode);
21011
21012 if (upper_mode == SImode)
21013 parts[1] = gen_int_mode (l[2], SImode);
21014 else if (HOST_BITS_PER_WIDE_INT >= 64)
21015 parts[1]
21016 = gen_int_mode
21017 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21018 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21019 DImode);
21020 else
21021 parts[1] = immed_double_const (l[2], l[3], DImode);
21022 }
21023 else
21024 gcc_unreachable ();
21025 }
21026 }
21027
21028 return size;
21029 }
21030
21031 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21032    All required insns are emitted here; the caller does not need to emit
21033    any normal moves afterwards.  Operands 2-5 receive the destination parts
21034    in the correct order; operands 6-9 supply the source parts.  */
21035
21036 void
21037 ix86_split_long_move (rtx operands[])
21038 {
21039 rtx part[2][4];
21040 int nparts, i, j;
21041 int push = 0;
21042 int collisions = 0;
21043 enum machine_mode mode = GET_MODE (operands[0]);
21044 bool collisionparts[4];
21045
21046   /* The DFmode expanders may ask us to move a double.
21047      For a 64bit target this is a single move.  By hiding that fact
21048      here we simplify the i386.md splitters.  */
21049 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21050 {
21051 /* Optimize constant pool reference to immediates. This is used by
21052 fp moves, that force all constants to memory to allow combining. */
21053
21054 if (MEM_P (operands[1])
21055 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21056 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21057 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21058 if (push_operand (operands[0], VOIDmode))
21059 {
21060 operands[0] = copy_rtx (operands[0]);
21061 PUT_MODE (operands[0], word_mode);
21062 }
21063 else
21064 operands[0] = gen_lowpart (DImode, operands[0]);
21065 operands[1] = gen_lowpart (DImode, operands[1]);
21066 emit_move_insn (operands[0], operands[1]);
21067 return;
21068 }
21069
21070 /* The only non-offsettable memory we handle is push. */
21071 if (push_operand (operands[0], VOIDmode))
21072 push = 1;
21073 else
21074 gcc_assert (!MEM_P (operands[0])
21075 || offsettable_memref_p (operands[0]));
21076
21077 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21078 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21079
21080 /* When emitting push, take care for source operands on the stack. */
21081 if (push && MEM_P (operands[1])
21082 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21083 {
21084 rtx src_base = XEXP (part[1][nparts - 1], 0);
21085
21086 /* Compensate for the stack decrement by 4. */
21087 if (!TARGET_64BIT && nparts == 3
21088 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21089 src_base = plus_constant (Pmode, src_base, 4);
21090
21091 /* src_base refers to the stack pointer and is
21092 automatically decreased by emitted push. */
21093 for (i = 0; i < nparts; i++)
21094 part[1][i] = change_address (part[1][i],
21095 GET_MODE (part[1][i]), src_base);
21096 }
21097
21098   /* We need to do the copy in the right order in case an address register
21099 of the source overlaps the destination. */
21100 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21101 {
21102 rtx tmp;
21103
21104 for (i = 0; i < nparts; i++)
21105 {
21106 collisionparts[i]
21107 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21108 if (collisionparts[i])
21109 collisions++;
21110 }
21111
21112 /* Collision in the middle part can be handled by reordering. */
21113 if (collisions == 1 && nparts == 3 && collisionparts [1])
21114 {
21115 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21116 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21117 }
21118 else if (collisions == 1
21119 && nparts == 4
21120 && (collisionparts [1] || collisionparts [2]))
21121 {
21122 if (collisionparts [1])
21123 {
21124 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21125 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21126 }
21127 else
21128 {
21129 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21130 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21131 }
21132 }
21133
21134 /* If there are more collisions, we can't handle it by reordering.
21135 Do an lea to the last part and use only one colliding move. */
21136 else if (collisions > 1)
21137 {
21138 rtx base;
21139
21140 collisions = 1;
21141
21142 base = part[0][nparts - 1];
21143
21144 /* Handle the case when the last part isn't valid for lea.
21145 Happens in 64-bit mode storing the 12-byte XFmode. */
21146 if (GET_MODE (base) != Pmode)
21147 base = gen_rtx_REG (Pmode, REGNO (base));
21148
21149 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21150 part[1][0] = replace_equiv_address (part[1][0], base);
21151 for (i = 1; i < nparts; i++)
21152 {
21153 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21154 part[1][i] = replace_equiv_address (part[1][i], tmp);
21155 }
21156 }
21157 }
21158
21159 if (push)
21160 {
21161 if (!TARGET_64BIT)
21162 {
21163 if (nparts == 3)
21164 {
21165 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21166 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21167 stack_pointer_rtx, GEN_INT (-4)));
21168 emit_move_insn (part[0][2], part[1][2]);
21169 }
21170 else if (nparts == 4)
21171 {
21172 emit_move_insn (part[0][3], part[1][3]);
21173 emit_move_insn (part[0][2], part[1][2]);
21174 }
21175 }
21176 else
21177 {
21178 	  /* In 64bit mode we don't have a 32bit push available.  If the part is
21179 	     a register, that is OK - we will just use the larger counterpart.
21180 	     We also retype memory - this comes from an attempt to avoid a REX
21181 	     prefix on moving the second half of a TFmode value.  */
21182 if (GET_MODE (part[1][1]) == SImode)
21183 {
21184 switch (GET_CODE (part[1][1]))
21185 {
21186 case MEM:
21187 part[1][1] = adjust_address (part[1][1], DImode, 0);
21188 break;
21189
21190 case REG:
21191 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21192 break;
21193
21194 default:
21195 gcc_unreachable ();
21196 }
21197
21198 if (GET_MODE (part[1][0]) == SImode)
21199 part[1][0] = part[1][1];
21200 }
21201 }
21202 emit_move_insn (part[0][1], part[1][1]);
21203 emit_move_insn (part[0][0], part[1][0]);
21204 return;
21205 }
21206
21207 /* Choose correct order to not overwrite the source before it is copied. */
21208 if ((REG_P (part[0][0])
21209 && REG_P (part[1][1])
21210 && (REGNO (part[0][0]) == REGNO (part[1][1])
21211 || (nparts == 3
21212 && REGNO (part[0][0]) == REGNO (part[1][2]))
21213 || (nparts == 4
21214 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21215 || (collisions > 0
21216 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21217 {
21218 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21219 {
21220 operands[2 + i] = part[0][j];
21221 operands[6 + i] = part[1][j];
21222 }
21223 }
21224 else
21225 {
21226 for (i = 0; i < nparts; i++)
21227 {
21228 operands[2 + i] = part[0][i];
21229 operands[6 + i] = part[1][i];
21230 }
21231 }
21232
21233 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21234 if (optimize_insn_for_size_p ())
21235 {
21236 for (j = 0; j < nparts - 1; j++)
21237 if (CONST_INT_P (operands[6 + j])
21238 && operands[6 + j] != const0_rtx
21239 && REG_P (operands[2 + j]))
21240 for (i = j; i < nparts - 1; i++)
21241 if (CONST_INT_P (operands[7 + i])
21242 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21243 operands[7 + i] = operands[2 + j];
21244 }
21245
21246 for (i = 0; i < nparts; i++)
21247 emit_move_insn (operands[2 + i], operands[6 + i]);
21248
21249 return;
21250 }
21251
21252 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21253 left shift by a constant, either using a single shift or
21254 a sequence of add instructions. */
21255
21256 static void
21257 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21258 {
21259 rtx (*insn)(rtx, rtx, rtx);
21260
21261 if (count == 1
21262 || (count * ix86_cost->add <= ix86_cost->shift_const
21263 && !optimize_insn_for_size_p ()))
21264 {
21265 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21266 while (count-- > 0)
21267 emit_insn (insn (operand, operand, operand));
21268 }
21269 else
21270 {
21271 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21272 emit_insn (insn (operand, operand, GEN_INT (count)));
21273 }
21274 }
21275
21276 void
21277 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21278 {
21279 rtx (*gen_ashl3)(rtx, rtx, rtx);
21280 rtx (*gen_shld)(rtx, rtx, rtx);
21281 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21282
21283 rtx low[2], high[2];
21284 int count;
21285
21286 if (CONST_INT_P (operands[2]))
21287 {
21288 split_double_mode (mode, operands, 2, low, high);
21289 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21290
21291 if (count >= half_width)
21292 {
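	  /* The low word moves entirely into the high word and the low
	     word becomes zero; any remaining count is applied to the high
	     word alone.  */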
21293 emit_move_insn (high[0], low[1]);
21294 emit_move_insn (low[0], const0_rtx);
21295
21296 if (count > half_width)
21297 ix86_expand_ashl_const (high[0], count - half_width, mode);
21298 }
21299 else
21300 {
21301 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21302
21303 if (!rtx_equal_p (operands[0], operands[1]))
21304 emit_move_insn (operands[0], operands[1]);
21305
21306 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21307 ix86_expand_ashl_const (low[0], count, mode);
21308 }
21309 return;
21310 }
21311
21312 split_double_mode (mode, operands, 1, low, high);
21313
21314 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21315
21316 if (operands[1] == const1_rtx)
21317 {
21318       /* Assuming we've chosen QImode-capable registers, 1 << N
21319 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21320 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21321 {
21322 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21323
21324 ix86_expand_clear (low[0]);
21325 ix86_expand_clear (high[0]);
21326 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21327
21328 d = gen_lowpart (QImode, low[0]);
21329 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21330 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21331 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21332
21333 d = gen_lowpart (QImode, high[0]);
21334 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21335 s = gen_rtx_NE (QImode, flags, const0_rtx);
21336 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21337 }
21338
21339 /* Otherwise, we can get the same results by manually performing
21340 a bit extract operation on bit 5/6, and then performing the two
21341 shifts. The two methods of getting 0/1 into low/high are exactly
21342 the same size. Avoiding the shift in the bit extract case helps
21343 pentium4 a bit; no one else seems to care much either way. */
21344 else
21345 {
21346 enum machine_mode half_mode;
21347 rtx (*gen_lshr3)(rtx, rtx, rtx);
21348 rtx (*gen_and3)(rtx, rtx, rtx);
21349 rtx (*gen_xor3)(rtx, rtx, rtx);
21350 HOST_WIDE_INT bits;
21351 rtx x;
21352
21353 if (mode == DImode)
21354 {
21355 half_mode = SImode;
21356 gen_lshr3 = gen_lshrsi3;
21357 gen_and3 = gen_andsi3;
21358 gen_xor3 = gen_xorsi3;
21359 bits = 5;
21360 }
21361 else
21362 {
21363 half_mode = DImode;
21364 gen_lshr3 = gen_lshrdi3;
21365 gen_and3 = gen_anddi3;
21366 gen_xor3 = gen_xordi3;
21367 bits = 6;
21368 }
21369
21370 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21371 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21372 else
21373 x = gen_lowpart (half_mode, operands[2]);
21374 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21375
21376 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21377 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21378 emit_move_insn (low[0], high[0]);
21379 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21380 }
21381
21382 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21383 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21384 return;
21385 }
21386
21387 if (operands[1] == constm1_rtx)
21388 {
21389 /* For -1 << N, we can avoid the shld instruction, because we
21390 know that we're shifting 0...31/63 ones into a -1. */
21391 emit_move_insn (low[0], constm1_rtx);
21392 if (optimize_insn_for_size_p ())
21393 emit_move_insn (high[0], low[0]);
21394 else
21395 emit_move_insn (high[0], constm1_rtx);
21396 }
21397 else
21398 {
21399 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21400
21401 if (!rtx_equal_p (operands[0], operands[1]))
21402 emit_move_insn (operands[0], operands[1]);
21403
21404 split_double_mode (mode, operands, 1, low, high);
21405 emit_insn (gen_shld (high[0], low[0], operands[2]));
21406 }
21407
21408 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21409
21410 if (TARGET_CMOVE && scratch)
21411 {
21412 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21413 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21414
21415 ix86_expand_clear (scratch);
21416 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21417 }
21418 else
21419 {
21420 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21421 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21422
21423 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21424 }
21425 }
21426
21427 void
21428 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21429 {
21430 rtx (*gen_ashr3)(rtx, rtx, rtx)
21431 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21432 rtx (*gen_shrd)(rtx, rtx, rtx);
21433 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21434
21435 rtx low[2], high[2];
21436 int count;
21437
21438 if (CONST_INT_P (operands[2]))
21439 {
21440 split_double_mode (mode, operands, 2, low, high);
21441 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21442
21443 if (count == GET_MODE_BITSIZE (mode) - 1)
21444 {
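	  /* Shifting right by all bits but one leaves only copies of the
	     sign bit, so both result words are the sign-extended high
	     word.  */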
21445 emit_move_insn (high[0], high[1]);
21446 emit_insn (gen_ashr3 (high[0], high[0],
21447 GEN_INT (half_width - 1)));
21448 emit_move_insn (low[0], high[0]);
21449
21450 }
21451 else if (count >= half_width)
21452 {
21453 emit_move_insn (low[0], high[1]);
21454 emit_move_insn (high[0], low[0]);
21455 emit_insn (gen_ashr3 (high[0], high[0],
21456 GEN_INT (half_width - 1)));
21457
21458 if (count > half_width)
21459 emit_insn (gen_ashr3 (low[0], low[0],
21460 GEN_INT (count - half_width)));
21461 }
21462 else
21463 {
21464 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21465
21466 if (!rtx_equal_p (operands[0], operands[1]))
21467 emit_move_insn (operands[0], operands[1]);
21468
21469 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21470 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21471 }
21472 }
21473 else
21474 {
21475 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21476
21477 if (!rtx_equal_p (operands[0], operands[1]))
21478 emit_move_insn (operands[0], operands[1]);
21479
21480 split_double_mode (mode, operands, 1, low, high);
21481
21482 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21483 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21484
21485 if (TARGET_CMOVE && scratch)
21486 {
21487 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21488 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21489
21490 emit_move_insn (scratch, high[0]);
21491 emit_insn (gen_ashr3 (scratch, scratch,
21492 GEN_INT (half_width - 1)));
21493 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21494 scratch));
21495 }
21496 else
21497 {
21498 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21499 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21500
21501 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21502 }
21503 }
21504 }
21505
21506 void
21507 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21508 {
21509 rtx (*gen_lshr3)(rtx, rtx, rtx)
21510 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21511 rtx (*gen_shrd)(rtx, rtx, rtx);
21512 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21513
21514 rtx low[2], high[2];
21515 int count;
21516
21517 if (CONST_INT_P (operands[2]))
21518 {
21519 split_double_mode (mode, operands, 2, low, high);
21520 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21521
21522 if (count >= half_width)
21523 {
21524 emit_move_insn (low[0], high[1]);
21525 ix86_expand_clear (high[0]);
21526
21527 if (count > half_width)
21528 emit_insn (gen_lshr3 (low[0], low[0],
21529 GEN_INT (count - half_width)));
21530 }
21531 else
21532 {
21533 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21534
21535 if (!rtx_equal_p (operands[0], operands[1]))
21536 emit_move_insn (operands[0], operands[1]);
21537
21538 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21539 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21540 }
21541 }
21542 else
21543 {
21544 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21545
21546 if (!rtx_equal_p (operands[0], operands[1]))
21547 emit_move_insn (operands[0], operands[1]);
21548
21549 split_double_mode (mode, operands, 1, low, high);
21550
21551 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21552 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21553
21554 if (TARGET_CMOVE && scratch)
21555 {
21556 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21557 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21558
21559 ix86_expand_clear (scratch);
21560 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21561 scratch));
21562 }
21563 else
21564 {
21565 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21566 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21567
21568 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21569 }
21570 }
21571 }
21572
21573 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21574 static void
21575 predict_jump (int prob)
21576 {
21577 rtx insn = get_last_insn ();
21578 gcc_assert (JUMP_P (insn));
21579 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21580 }
21581
21582 /* Helper function for the string operations below.  Test whether VARIABLE
21583    is aligned to VALUE bytes.  If so, jump to the returned label.  */
21584 static rtx
21585 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21586 {
21587 rtx label = gen_label_rtx ();
21588 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21589 if (GET_MODE (variable) == DImode)
21590 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21591 else
21592 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21593 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21594 1, label);
21595 if (epilogue)
21596 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21597 else
21598 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21599 return label;
21600 }
21601
21602 /* Decrease COUNTREG by VALUE.  */
21603 static void
21604 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21605 {
21606 rtx (*gen_add)(rtx, rtx, rtx)
21607 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21608
21609 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21610 }
21611
21612 /* Zero extend possibly SImode EXP to Pmode register. */
21613 rtx
21614 ix86_zero_extend_to_Pmode (rtx exp)
21615 {
21616 if (GET_MODE (exp) != Pmode)
21617 exp = convert_to_mode (Pmode, exp, 1);
21618 return force_reg (Pmode, exp);
21619 }
21620
21621 /* Divide COUNTREG by SCALE. */
21622 static rtx
21623 scale_counter (rtx countreg, int scale)
21624 {
21625 rtx sc;
21626
21627 if (scale == 1)
21628 return countreg;
21629 if (CONST_INT_P (countreg))
21630 return GEN_INT (INTVAL (countreg) / scale);
21631 gcc_assert (REG_P (countreg));
21632
21633 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21634 GEN_INT (exact_log2 (scale)),
21635 NULL, 1, OPTAB_DIRECT);
21636 return sc;
21637 }
21638
21639 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21640 DImode for constant loop counts. */
21641
21642 static enum machine_mode
21643 counter_mode (rtx count_exp)
21644 {
21645 if (GET_MODE (count_exp) != VOIDmode)
21646 return GET_MODE (count_exp);
21647 if (!CONST_INT_P (count_exp))
21648 return Pmode;
21649 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21650 return DImode;
21651 return SImode;
21652 }
21653
21654 /* When SRCPTR is non-NULL, output a simple loop to move memory
21655    pointed to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times;
21656    the overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output
21657    the equivalent loop to set memory by VALUE (supposed to be in MODE).
21658
21659    The size is rounded down to a whole number of chunks moved at once.
21660    SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
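/* Roughly, the emitted code has this shape (a sketch, not literal RTL):

     size = count & -(piece_size);
     if (piece_size == 1 && size == 0) goto out;
     iter = 0;
   top:
     copy or set one unrolled group of chunks at dest + iter (and src + iter);
     iter += piece_size;
     if (iter < size) goto top;
     destptr += iter;  srcptr += iter;   -- when copying
   out:  */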
21661
21662
21663 static void
21664 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21665 rtx destptr, rtx srcptr, rtx value,
21666 rtx count, enum machine_mode mode, int unroll,
21667 int expected_size)
21668 {
21669 rtx out_label, top_label, iter, tmp;
21670 enum machine_mode iter_mode = counter_mode (count);
21671 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21672 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21673 rtx size;
21674 rtx x_addr;
21675 rtx y_addr;
21676 int i;
21677
21678 top_label = gen_label_rtx ();
21679 out_label = gen_label_rtx ();
21680 iter = gen_reg_rtx (iter_mode);
21681
21682 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21683 NULL, 1, OPTAB_DIRECT);
21684 /* Those two should combine. */
21685 if (piece_size == const1_rtx)
21686 {
21687 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21688 true, out_label);
21689 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21690 }
21691 emit_move_insn (iter, const0_rtx);
21692
21693 emit_label (top_label);
21694
21695 tmp = convert_modes (Pmode, iter_mode, iter, true);
21696 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21697 destmem = change_address (destmem, mode, x_addr);
21698
21699 if (srcmem)
21700 {
21701 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21702 srcmem = change_address (srcmem, mode, y_addr);
21703
21704 /* When unrolling for chips that reorder memory reads and writes,
21705 	 we can save registers by using a single temporary.
21706 	 Also, using 4 temporaries is overkill in 32bit mode.  */
21707 if (!TARGET_64BIT && 0)
21708 {
21709 for (i = 0; i < unroll; i++)
21710 {
21711 if (i)
21712 {
21713 destmem =
21714 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21715 srcmem =
21716 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21717 }
21718 emit_move_insn (destmem, srcmem);
21719 }
21720 }
21721 else
21722 {
21723 rtx tmpreg[4];
21724 gcc_assert (unroll <= 4);
21725 for (i = 0; i < unroll; i++)
21726 {
21727 tmpreg[i] = gen_reg_rtx (mode);
21728 if (i)
21729 {
21730 srcmem =
21731 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21732 }
21733 emit_move_insn (tmpreg[i], srcmem);
21734 }
21735 for (i = 0; i < unroll; i++)
21736 {
21737 if (i)
21738 {
21739 destmem =
21740 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21741 }
21742 emit_move_insn (destmem, tmpreg[i]);
21743 }
21744 }
21745 }
21746 else
21747 for (i = 0; i < unroll; i++)
21748 {
21749 if (i)
21750 destmem =
21751 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21752 emit_move_insn (destmem, value);
21753 }
21754
21755 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21756 true, OPTAB_LIB_WIDEN);
21757 if (tmp != iter)
21758 emit_move_insn (iter, tmp);
21759
21760 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21761 true, top_label);
21762 if (expected_size != -1)
21763 {
21764 expected_size /= GET_MODE_SIZE (mode) * unroll;
21765 if (expected_size == 0)
21766 predict_jump (0);
21767 else if (expected_size > REG_BR_PROB_BASE)
21768 predict_jump (REG_BR_PROB_BASE - 1);
21769 else
21770 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21771 }
21772 else
21773 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21774 iter = ix86_zero_extend_to_Pmode (iter);
21775 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21776 true, OPTAB_LIB_WIDEN);
21777 if (tmp != destptr)
21778 emit_move_insn (destptr, tmp);
21779 if (srcptr)
21780 {
21781 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21782 true, OPTAB_LIB_WIDEN);
21783 if (tmp != srcptr)
21784 emit_move_insn (srcptr, tmp);
21785 }
21786 emit_label (out_label);
21787 }
21788
21789 /* Output "rep; mov" instruction.
21790    Arguments have the same meaning as for the previous function.  */
21791 static void
21792 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21793 rtx destptr, rtx srcptr,
21794 rtx count,
21795 enum machine_mode mode)
21796 {
21797 rtx destexp;
21798 rtx srcexp;
21799 rtx countreg;
21800 HOST_WIDE_INT rounded_count;
21801
21802 /* If the size is known, it is shorter to use rep movs. */
21803 if (mode == QImode && CONST_INT_P (count)
21804 && !(INTVAL (count) & 3))
21805 mode = SImode;
21806
21807 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21808 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21809 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21810 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21811 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21812 if (mode != QImode)
21813 {
21814 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21815 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21816 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21817 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21818 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21819 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21820 }
21821 else
21822 {
21823 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21824 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21825 }
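  /* DESTEXP and SRCEXP give the pointer values left behind by the rep
     move: the original pointers advanced by the element count scaled back
     to bytes.  */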
21826 if (CONST_INT_P (count))
21827 {
21828 rounded_count = (INTVAL (count)
21829 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21830 destmem = shallow_copy_rtx (destmem);
21831 srcmem = shallow_copy_rtx (srcmem);
21832 set_mem_size (destmem, rounded_count);
21833 set_mem_size (srcmem, rounded_count);
21834 }
21835 else
21836 {
21837 if (MEM_SIZE_KNOWN_P (destmem))
21838 clear_mem_size (destmem);
21839 if (MEM_SIZE_KNOWN_P (srcmem))
21840 clear_mem_size (srcmem);
21841 }
21842 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21843 destexp, srcexp));
21844 }
21845
21846 /* Output "rep; stos" instruction.
21847    Arguments have the same meaning as for the previous function.  */
21848 static void
21849 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21850 rtx count, enum machine_mode mode,
21851 rtx orig_value)
21852 {
21853 rtx destexp;
21854 rtx countreg;
21855 HOST_WIDE_INT rounded_count;
21856
21857 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21858 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21859 value = force_reg (mode, gen_lowpart (mode, value));
21860 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21861 if (mode != QImode)
21862 {
21863 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21864 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21865 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21866 }
21867 else
21868 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21869 if (orig_value == const0_rtx && CONST_INT_P (count))
21870 {
21871 rounded_count = (INTVAL (count)
21872 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21873 destmem = shallow_copy_rtx (destmem);
21874 set_mem_size (destmem, rounded_count);
21875 }
21876 else if (MEM_SIZE_KNOWN_P (destmem))
21877 clear_mem_size (destmem);
21878 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21879 }
21880
21881 static void
21882 emit_strmov (rtx destmem, rtx srcmem,
21883 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21884 {
21885 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21886 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21887 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21888 }
21889
21890 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
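 /* For example (illustrative only), a constant COUNT of 23 with MAX_SIZE == 16
    makes this emit moves for the low COUNT & 15 == 7 bytes: one SImode, one
    HImode and one QImode move; the 16-byte part is assumed to have already
    been handled by the main loop.  */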
21891 static void
21892 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21893 rtx destptr, rtx srcptr, rtx count, int max_size)
21894 {
21895 rtx src, dest;
21896 if (CONST_INT_P (count))
21897 {
21898 HOST_WIDE_INT countval = INTVAL (count);
21899 int offset = 0;
21900
21901 if ((countval & 0x10) && max_size > 16)
21902 {
21903 if (TARGET_64BIT)
21904 {
21905 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21906 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21907 }
21908 else
21909 gcc_unreachable ();
21910 offset += 16;
21911 }
21912 if ((countval & 0x08) && max_size > 8)
21913 {
21914 if (TARGET_64BIT)
21915 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21916 else
21917 {
21918 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21919 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21920 }
21921 offset += 8;
21922 }
21923 if ((countval & 0x04) && max_size > 4)
21924 {
21925 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21926 offset += 4;
21927 }
21928 if ((countval & 0x02) && max_size > 2)
21929 {
21930 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21931 offset += 2;
21932 }
21933 if ((countval & 0x01) && max_size > 1)
21934 {
21935 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21936 offset += 1;
21937 }
21938 return;
21939 }
21940 if (max_size > 8)
21941 {
21942 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21943 count, 1, OPTAB_DIRECT);
21944 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21945 count, QImode, 1, 4);
21946 return;
21947 }
21948
21949 /* When single-instruction stringops are available (TARGET_SINGLE_STRINGOP),
21950 we can cheaply advance the dest and src pointers.  Otherwise we save code
21951 size by maintaining an offset (zero is readily available from the preceding
21952 rep operation) and using x86 addressing modes.  */
21953 if (TARGET_SINGLE_STRINGOP)
21954 {
21955 if (max_size > 4)
21956 {
21957 rtx label = ix86_expand_aligntest (count, 4, true);
21958 src = change_address (srcmem, SImode, srcptr);
21959 dest = change_address (destmem, SImode, destptr);
21960 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21961 emit_label (label);
21962 LABEL_NUSES (label) = 1;
21963 }
21964 if (max_size > 2)
21965 {
21966 rtx label = ix86_expand_aligntest (count, 2, true);
21967 src = change_address (srcmem, HImode, srcptr);
21968 dest = change_address (destmem, HImode, destptr);
21969 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21970 emit_label (label);
21971 LABEL_NUSES (label) = 1;
21972 }
21973 if (max_size > 1)
21974 {
21975 rtx label = ix86_expand_aligntest (count, 1, true);
21976 src = change_address (srcmem, QImode, srcptr);
21977 dest = change_address (destmem, QImode, destptr);
21978 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21979 emit_label (label);
21980 LABEL_NUSES (label) = 1;
21981 }
21982 }
21983 else
21984 {
21985 rtx offset = force_reg (Pmode, const0_rtx);
21986 rtx tmp;
21987
21988 if (max_size > 4)
21989 {
21990 rtx label = ix86_expand_aligntest (count, 4, true);
21991 src = change_address (srcmem, SImode, srcptr);
21992 dest = change_address (destmem, SImode, destptr);
21993 emit_move_insn (dest, src);
21994 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21995 true, OPTAB_LIB_WIDEN);
21996 if (tmp != offset)
21997 emit_move_insn (offset, tmp);
21998 emit_label (label);
21999 LABEL_NUSES (label) = 1;
22000 }
22001 if (max_size > 2)
22002 {
22003 rtx label = ix86_expand_aligntest (count, 2, true);
22004 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22005 src = change_address (srcmem, HImode, tmp);
22006 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22007 dest = change_address (destmem, HImode, tmp);
22008 emit_move_insn (dest, src);
22009 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22010 true, OPTAB_LIB_WIDEN);
22011 if (tmp != offset)
22012 emit_move_insn (offset, tmp);
22013 emit_label (label);
22014 LABEL_NUSES (label) = 1;
22015 }
22016 if (max_size > 1)
22017 {
22018 rtx label = ix86_expand_aligntest (count, 1, true);
22019 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22020 src = change_address (srcmem, QImode, tmp);
22021 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22022 dest = change_address (destmem, QImode, tmp);
22023 emit_move_insn (dest, src);
22024 emit_label (label);
22025 LABEL_NUSES (label) = 1;
22026 }
22027 }
22028 }
22029
22030 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22031 static void
22032 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22033 rtx count, int max_size)
22034 {
22035 count =
22036 expand_simple_binop (counter_mode (count), AND, count,
22037 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22038 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22039 gen_lowpart (QImode, value), count, QImode,
22040 1, max_size / 2);
22041 }
22042
22043 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22044 static void
22045 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22046 {
22047 rtx dest;
22048
22049 if (CONST_INT_P (count))
22050 {
22051 HOST_WIDE_INT countval = INTVAL (count);
22052 int offset = 0;
22053
22054 if ((countval & 0x10) && max_size > 16)
22055 {
22056 if (TARGET_64BIT)
22057 {
22058 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22059 emit_insn (gen_strset (destptr, dest, value));
22060 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22061 emit_insn (gen_strset (destptr, dest, value));
22062 }
22063 else
22064 gcc_unreachable ();
22065 offset += 16;
22066 }
22067 if ((countval & 0x08) && max_size > 8)
22068 {
22069 if (TARGET_64BIT)
22070 {
22071 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22072 emit_insn (gen_strset (destptr, dest, value));
22073 }
22074 else
22075 {
22076 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22077 emit_insn (gen_strset (destptr, dest, value));
22078 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22079 emit_insn (gen_strset (destptr, dest, value));
22080 }
22081 offset += 8;
22082 }
22083 if ((countval & 0x04) && max_size > 4)
22084 {
22085 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22086 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22087 offset += 4;
22088 }
22089 if ((countval & 0x02) && max_size > 2)
22090 {
22091 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22092 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22093 offset += 2;
22094 }
22095 if ((countval & 0x01) && max_size > 1)
22096 {
22097 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22098 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22099 offset += 1;
22100 }
22101 return;
22102 }
22103 if (max_size > 32)
22104 {
22105 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22106 return;
22107 }
22108 if (max_size > 16)
22109 {
22110 rtx label = ix86_expand_aligntest (count, 16, true);
22111 if (TARGET_64BIT)
22112 {
22113 dest = change_address (destmem, DImode, destptr);
22114 emit_insn (gen_strset (destptr, dest, value));
22115 emit_insn (gen_strset (destptr, dest, value));
22116 }
22117 else
22118 {
22119 dest = change_address (destmem, SImode, destptr);
22120 emit_insn (gen_strset (destptr, dest, value));
22121 emit_insn (gen_strset (destptr, dest, value));
22122 emit_insn (gen_strset (destptr, dest, value));
22123 emit_insn (gen_strset (destptr, dest, value));
22124 }
22125 emit_label (label);
22126 LABEL_NUSES (label) = 1;
22127 }
22128 if (max_size > 8)
22129 {
22130 rtx label = ix86_expand_aligntest (count, 8, true);
22131 if (TARGET_64BIT)
22132 {
22133 dest = change_address (destmem, DImode, destptr);
22134 emit_insn (gen_strset (destptr, dest, value));
22135 }
22136 else
22137 {
22138 dest = change_address (destmem, SImode, destptr);
22139 emit_insn (gen_strset (destptr, dest, value));
22140 emit_insn (gen_strset (destptr, dest, value));
22141 }
22142 emit_label (label);
22143 LABEL_NUSES (label) = 1;
22144 }
22145 if (max_size > 4)
22146 {
22147 rtx label = ix86_expand_aligntest (count, 4, true);
22148 dest = change_address (destmem, SImode, destptr);
22149 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22150 emit_label (label);
22151 LABEL_NUSES (label) = 1;
22152 }
22153 if (max_size > 2)
22154 {
22155 rtx label = ix86_expand_aligntest (count, 2, true);
22156 dest = change_address (destmem, HImode, destptr);
22157 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22158 emit_label (label);
22159 LABEL_NUSES (label) = 1;
22160 }
22161 if (max_size > 1)
22162 {
22163 rtx label = ix86_expand_aligntest (count, 1, true);
22164 dest = change_address (destmem, QImode, destptr);
22165 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22166 emit_label (label);
22167 LABEL_NUSES (label) = 1;
22168 }
22169 }
22170
22171 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned by ALIGN,
22172 to DESIRED_ALIGNMENT. */
22173 static void
22174 expand_movmem_prologue (rtx destmem, rtx srcmem,
22175 rtx destptr, rtx srcptr, rtx count,
22176 int align, int desired_alignment)
22177 {
22178 if (align <= 1 && desired_alignment > 1)
22179 {
22180 rtx label = ix86_expand_aligntest (destptr, 1, false);
22181 srcmem = change_address (srcmem, QImode, srcptr);
22182 destmem = change_address (destmem, QImode, destptr);
22183 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22184 ix86_adjust_counter (count, 1);
22185 emit_label (label);
22186 LABEL_NUSES (label) = 1;
22187 }
22188 if (align <= 2 && desired_alignment > 2)
22189 {
22190 rtx label = ix86_expand_aligntest (destptr, 2, false);
22191 srcmem = change_address (srcmem, HImode, srcptr);
22192 destmem = change_address (destmem, HImode, destptr);
22193 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22194 ix86_adjust_counter (count, 2);
22195 emit_label (label);
22196 LABEL_NUSES (label) = 1;
22197 }
22198 if (align <= 4 && desired_alignment > 4)
22199 {
22200 rtx label = ix86_expand_aligntest (destptr, 4, false);
22201 srcmem = change_address (srcmem, SImode, srcptr);
22202 destmem = change_address (destmem, SImode, destptr);
22203 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22204 ix86_adjust_counter (count, 4);
22205 emit_label (label);
22206 LABEL_NUSES (label) = 1;
22207 }
22208 gcc_assert (desired_alignment <= 8);
22209 }
22210
22211 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
22212 ALIGN_BYTES is how many bytes need to be copied. */
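 /* For example (illustrative only), with DESIRED_ALIGN == 8 and
    ALIGN_BYTES == 7 this emits a QImode, an HImode and an SImode move and then
    re-declares the remaining BLKmode destination as 8-byte aligned, keeping
    the MEM size and alignment info accurate for the main loop.  */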
22213 static rtx
22214 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22215 int desired_align, int align_bytes)
22216 {
22217 rtx src = *srcp;
22218 rtx orig_dst = dst;
22219 rtx orig_src = src;
22220 int off = 0;
22221 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22222 if (src_align_bytes >= 0)
22223 src_align_bytes = desired_align - src_align_bytes;
22224 if (align_bytes & 1)
22225 {
22226 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22227 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22228 off = 1;
22229 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22230 }
22231 if (align_bytes & 2)
22232 {
22233 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22234 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22235 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22236 set_mem_align (dst, 2 * BITS_PER_UNIT);
22237 if (src_align_bytes >= 0
22238 && (src_align_bytes & 1) == (align_bytes & 1)
22239 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22240 set_mem_align (src, 2 * BITS_PER_UNIT);
22241 off = 2;
22242 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22243 }
22244 if (align_bytes & 4)
22245 {
22246 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22247 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22248 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22249 set_mem_align (dst, 4 * BITS_PER_UNIT);
22250 if (src_align_bytes >= 0)
22251 {
22252 unsigned int src_align = 0;
22253 if ((src_align_bytes & 3) == (align_bytes & 3))
22254 src_align = 4;
22255 else if ((src_align_bytes & 1) == (align_bytes & 1))
22256 src_align = 2;
22257 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22258 set_mem_align (src, src_align * BITS_PER_UNIT);
22259 }
22260 off = 4;
22261 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22262 }
22263 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22264 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22265 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22266 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22267 if (src_align_bytes >= 0)
22268 {
22269 unsigned int src_align = 0;
22270 if ((src_align_bytes & 7) == (align_bytes & 7))
22271 src_align = 8;
22272 else if ((src_align_bytes & 3) == (align_bytes & 3))
22273 src_align = 4;
22274 else if ((src_align_bytes & 1) == (align_bytes & 1))
22275 src_align = 2;
22276 if (src_align > (unsigned int) desired_align)
22277 src_align = desired_align;
22278 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22279 set_mem_align (src, src_align * BITS_PER_UNIT);
22280 }
22281 if (MEM_SIZE_KNOWN_P (orig_dst))
22282 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22283 if (MEM_SIZE_KNOWN_P (orig_src))
22284 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22285 *srcp = src;
22286 return dst;
22287 }
22288
22289 /* Store enough bytes to DEST to align DEST, known to be aligned by ALIGN,
22290 to DESIRED_ALIGNMENT. */
22291 static void
22292 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22293 int align, int desired_alignment)
22294 {
22295 if (align <= 1 && desired_alignment > 1)
22296 {
22297 rtx label = ix86_expand_aligntest (destptr, 1, false);
22298 destmem = change_address (destmem, QImode, destptr);
22299 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22300 ix86_adjust_counter (count, 1);
22301 emit_label (label);
22302 LABEL_NUSES (label) = 1;
22303 }
22304 if (align <= 2 && desired_alignment > 2)
22305 {
22306 rtx label = ix86_expand_aligntest (destptr, 2, false);
22307 destmem = change_address (destmem, HImode, destptr);
22308 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22309 ix86_adjust_counter (count, 2);
22310 emit_label (label);
22311 LABEL_NUSES (label) = 1;
22312 }
22313 if (align <= 4 && desired_alignment > 4)
22314 {
22315 rtx label = ix86_expand_aligntest (destptr, 4, false);
22316 destmem = change_address (destmem, SImode, destptr);
22317 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22318 ix86_adjust_counter (count, 4);
22319 emit_label (label);
22320 LABEL_NUSES (label) = 1;
22321 }
22322 gcc_assert (desired_alignment <= 8);
22323 }
22324
22325 /* Store enough bytes to DST to align DST to DESIRED_ALIGN.
22326 ALIGN_BYTES is how many bytes need to be stored. */
22327 static rtx
22328 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22329 int desired_align, int align_bytes)
22330 {
22331 int off = 0;
22332 rtx orig_dst = dst;
22333 if (align_bytes & 1)
22334 {
22335 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22336 off = 1;
22337 emit_insn (gen_strset (destreg, dst,
22338 gen_lowpart (QImode, value)));
22339 }
22340 if (align_bytes & 2)
22341 {
22342 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22343 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22344 set_mem_align (dst, 2 * BITS_PER_UNIT);
22345 off = 2;
22346 emit_insn (gen_strset (destreg, dst,
22347 gen_lowpart (HImode, value)));
22348 }
22349 if (align_bytes & 4)
22350 {
22351 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22352 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22353 set_mem_align (dst, 4 * BITS_PER_UNIT);
22354 off = 4;
22355 emit_insn (gen_strset (destreg, dst,
22356 gen_lowpart (SImode, value)));
22357 }
22358 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22359 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22360 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22361 if (MEM_SIZE_KNOWN_P (orig_dst))
22362 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22363 return dst;
22364 }
22365
22366 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
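 /* Informal note (the concrete numbers below are hypothetical; the real
    thresholds come from the per-CPU cost tables): ALGS->size[] is an ascending
    table of {max, alg} pairs and the first entry whose MAX covers EXPECTED_SIZE
    wins, so a table such as {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}
    would select rep_prefix_4_byte for an expected 1 kB copy.  */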
22367 static enum stringop_alg
22368 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22369 int *dynamic_check)
22370 {
22371 const struct stringop_algs * algs;
22372 bool optimize_for_speed;
22373 /* Algorithms using the rep prefix want at least edi and ecx;
22374 additionally, memset wants eax and memcpy wants esi. Don't
22375 consider such algorithms if the user has appropriated those
22376 registers for their own purposes. */
22377 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22378 || (memset
22379 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22380
22381 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22382 || (alg != rep_prefix_1_byte \
22383 && alg != rep_prefix_4_byte \
22384 && alg != rep_prefix_8_byte))
22385 const struct processor_costs *cost;
22386
22387 /* Even if the string operation call is cold, we still might spend a lot
22388 of time processing large blocks. */
22389 if (optimize_function_for_size_p (cfun)
22390 || (optimize_insn_for_size_p ()
22391 && expected_size != -1 && expected_size < 256))
22392 optimize_for_speed = false;
22393 else
22394 optimize_for_speed = true;
22395
22396 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22397
22398 *dynamic_check = -1;
22399 if (memset)
22400 algs = &cost->memset[TARGET_64BIT != 0];
22401 else
22402 algs = &cost->memcpy[TARGET_64BIT != 0];
22403 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22404 return ix86_stringop_alg;
22405 /* rep; movq or rep; movl is the smallest variant. */
22406 else if (!optimize_for_speed)
22407 {
22408 if (!count || (count & 3))
22409 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22410 else
22411 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22412 }
22413 /* Very tiny blocks are best handled via the loop; REP is expensive to
22414 set up.  */
22415 else if (expected_size != -1 && expected_size < 4)
22416 return loop_1_byte;
22417 else if (expected_size != -1)
22418 {
22419 unsigned int i;
22420 enum stringop_alg alg = libcall;
22421 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22422 {
22423 /* We get here if the algorithms that were not libcall-based
22424 were rep-prefix based and we are unable to use rep prefixes
22425 based on global register usage. Break out of the loop and
22426 use the heuristic below. */
22427 if (algs->size[i].max == 0)
22428 break;
22429 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22430 {
22431 enum stringop_alg candidate = algs->size[i].alg;
22432
22433 if (candidate != libcall && ALG_USABLE_P (candidate))
22434 alg = candidate;
22435 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22436 last non-libcall inline algorithm. */
22437 if (TARGET_INLINE_ALL_STRINGOPS)
22438 {
22439 /* When the current size is best to be copied by a libcall,
22440 but we are still forced to inline, run the heuristic below
22441 that will pick code for medium sized blocks. */
22442 if (alg != libcall)
22443 return alg;
22444 break;
22445 }
22446 else if (ALG_USABLE_P (candidate))
22447 return candidate;
22448 }
22449 }
22450 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22451 }
22452 /* When asked to inline the call anyway, try to pick a meaningful choice.
22453 We look for the maximal size of block that is faster to copy by hand and
22454 take blocks of at most that size, guessing that the average size will
22455 be roughly half of the block.
22456
22457 If this turns out to be bad, we might simply specify the preferred
22458 choice in ix86_costs. */
22459 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22460 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22461 {
22462 int max = -1;
22463 enum stringop_alg alg;
22464 int i;
22465 bool any_alg_usable_p = true;
22466
22467 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22468 {
22469 enum stringop_alg candidate = algs->size[i].alg;
22470 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22471
22472 if (candidate != libcall && candidate
22473 && ALG_USABLE_P (candidate))
22474 max = algs->size[i].max;
22475 }
22476 /* If there aren't any usable algorithms, then recursing on
22477 smaller sizes isn't going to find anything. Just return the
22478 simple byte-at-a-time copy loop. */
22479 if (!any_alg_usable_p)
22480 {
22481 /* Pick something reasonable. */
22482 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22483 *dynamic_check = 128;
22484 return loop_1_byte;
22485 }
22486 if (max == -1)
22487 max = 4096;
22488 alg = decide_alg (count, max / 2, memset, dynamic_check);
22489 gcc_assert (*dynamic_check == -1);
22490 gcc_assert (alg != libcall);
22491 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22492 *dynamic_check = max;
22493 return alg;
22494 }
22495 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22496 #undef ALG_USABLE_P
22497 }
22498
22499 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22500 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22501 static int
22502 decide_alignment (int align,
22503 enum stringop_alg alg,
22504 int expected_size)
22505 {
22506 int desired_align = 0;
22507 switch (alg)
22508 {
22509 case no_stringop:
22510 gcc_unreachable ();
22511 case loop:
22512 case unrolled_loop:
22513 desired_align = GET_MODE_SIZE (Pmode);
22514 break;
22515 case rep_prefix_8_byte:
22516 desired_align = 8;
22517 break;
22518 case rep_prefix_4_byte:
22519 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22520 copying a whole cacheline at once. */
22521 if (TARGET_PENTIUMPRO)
22522 desired_align = 8;
22523 else
22524 desired_align = 4;
22525 break;
22526 case rep_prefix_1_byte:
22527 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22528 copying a whole cacheline at once. */
22529 if (TARGET_PENTIUMPRO)
22530 desired_align = 8;
22531 else
22532 desired_align = 1;
22533 break;
22534 case loop_1_byte:
22535 desired_align = 1;
22536 break;
22537 case libcall:
22538 return 0;
22539 }
22540
22541 if (optimize_size)
22542 desired_align = 1;
22543 if (desired_align < align)
22544 desired_align = align;
22545 if (expected_size != -1 && expected_size < 4)
22546 desired_align = align;
22547 return desired_align;
22548 }
22549
22550 /* Return the smallest power of 2 greater than VAL. */
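 /* For example: smallest_pow2_greater_than (0) == 1, (4) == 8, (5) == 8.  */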
22551 static int
22552 smallest_pow2_greater_than (int val)
22553 {
22554 int ret = 1;
22555 while (ret <= val)
22556 ret <<= 1;
22557 return ret;
22558 }
22559
22560 /* Expand string move (memcpy) operation. Use i386 string operations
22561 when profitable. expand_setmem contains similar code. The code
22562 depends upon architecture, block size and alignment, but always has
22563 the same overall structure:
22564
22565 1) Prologue guard: a conditional that jumps to the epilogue for small
22566 blocks that can be handled by the epilogue alone. This is faster,
22567 but also needed for correctness, since the prologue assumes the block
22568 is larger than the desired alignment.
22569
22570 An optional dynamic size check, with a libcall for large blocks, is
22571 emitted here too when -minline-stringops-dynamically is used.
22572
22573 2) Prologue: copy first few bytes in order to get destination
22574 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22575 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22576 copied. We emit either a jump tree on power of two sized
22577 blocks, or a byte loop.
22578
22579 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22580 with specified algorithm.
22581
22582 4) Epilogue: code copying tail of the block that is too small to be
22583 handled by main body (or up to size guarded by prologue guard). */
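 /* Illustrative example (an informal sketch, not a literal trace): a memcpy of
    unknown size compiled for x86-64 with the unrolled_loop strategy uses
    SIZE_NEEDED == 32 (four 8-byte words) and DESIRED_ALIGN == 8, so step 1
    branches to the epilogue when the count is below 32, step 2 copies up to 7
    bytes to 8-align the destination, step 3 moves 32-byte chunks, and step 4
    finishes the remaining count & 31 bytes.  */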
22584
22585 bool
22586 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22587 rtx expected_align_exp, rtx expected_size_exp)
22588 {
22589 rtx destreg;
22590 rtx srcreg;
22591 rtx label = NULL;
22592 rtx tmp;
22593 rtx jump_around_label = NULL;
22594 HOST_WIDE_INT align = 1;
22595 unsigned HOST_WIDE_INT count = 0;
22596 HOST_WIDE_INT expected_size = -1;
22597 int size_needed = 0, epilogue_size_needed;
22598 int desired_align = 0, align_bytes = 0;
22599 enum stringop_alg alg;
22600 int dynamic_check;
22601 bool need_zero_guard = false;
22602
22603 if (CONST_INT_P (align_exp))
22604 align = INTVAL (align_exp);
22605 /* i386 can do misaligned accesses at a reasonable extra cost. */
22606 if (CONST_INT_P (expected_align_exp)
22607 && INTVAL (expected_align_exp) > align)
22608 align = INTVAL (expected_align_exp);
22609 /* ALIGN is the minimum of destination and source alignment, but we care here
22610 just about destination alignment. */
22611 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22612 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22613
22614 if (CONST_INT_P (count_exp))
22615 count = expected_size = INTVAL (count_exp);
22616 if (CONST_INT_P (expected_size_exp) && count == 0)
22617 expected_size = INTVAL (expected_size_exp);
22618
22619 /* Make sure we don't need to care about overflow later on. */
22620 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22621 return false;
22622
22623 /* Step 0: Decide on preferred algorithm, desired alignment and
22624 size of chunks to be copied by main loop. */
22625
22626 alg = decide_alg (count, expected_size, false, &dynamic_check);
22627 desired_align = decide_alignment (align, alg, expected_size);
22628
22629 if (!TARGET_ALIGN_STRINGOPS)
22630 align = desired_align;
22631
22632 if (alg == libcall)
22633 return false;
22634 gcc_assert (alg != no_stringop);
22635 if (!count)
22636 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22637 destreg = copy_addr_to_reg (XEXP (dst, 0));
22638 srcreg = copy_addr_to_reg (XEXP (src, 0));
22639 switch (alg)
22640 {
22641 case libcall:
22642 case no_stringop:
22643 gcc_unreachable ();
22644 case loop:
22645 need_zero_guard = true;
22646 size_needed = GET_MODE_SIZE (word_mode);
22647 break;
22648 case unrolled_loop:
22649 need_zero_guard = true;
22650 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22651 break;
22652 case rep_prefix_8_byte:
22653 size_needed = 8;
22654 break;
22655 case rep_prefix_4_byte:
22656 size_needed = 4;
22657 break;
22658 case rep_prefix_1_byte:
22659 size_needed = 1;
22660 break;
22661 case loop_1_byte:
22662 need_zero_guard = true;
22663 size_needed = 1;
22664 break;
22665 }
22666
22667 epilogue_size_needed = size_needed;
22668
22669 /* Step 1: Prologue guard. */
22670
22671 /* Alignment code needs count to be in register. */
22672 if (CONST_INT_P (count_exp) && desired_align > align)
22673 {
22674 if (INTVAL (count_exp) > desired_align
22675 && INTVAL (count_exp) > size_needed)
22676 {
22677 align_bytes
22678 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22679 if (align_bytes <= 0)
22680 align_bytes = 0;
22681 else
22682 align_bytes = desired_align - align_bytes;
22683 }
22684 if (align_bytes == 0)
22685 count_exp = force_reg (counter_mode (count_exp), count_exp);
22686 }
22687 gcc_assert (desired_align >= 1 && align >= 1);
22688
22689 /* Ensure that alignment prologue won't copy past end of block. */
22690 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22691 {
22692 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22693 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22694 Make sure it is power of 2. */
22695 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22696
22697 if (count)
22698 {
22699 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22700 {
22701 /* If main algorithm works on QImode, no epilogue is needed.
22702 For small sizes just don't align anything. */
22703 if (size_needed == 1)
22704 desired_align = align;
22705 else
22706 goto epilogue;
22707 }
22708 }
22709 else
22710 {
22711 label = gen_label_rtx ();
22712 emit_cmp_and_jump_insns (count_exp,
22713 GEN_INT (epilogue_size_needed),
22714 LTU, 0, counter_mode (count_exp), 1, label);
22715 if (expected_size == -1 || expected_size < epilogue_size_needed)
22716 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22717 else
22718 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22719 }
22720 }
22721
22722 /* Emit code to decide on runtime whether library call or inline should be
22723 used. */
22724 if (dynamic_check != -1)
22725 {
22726 if (CONST_INT_P (count_exp))
22727 {
22728 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22729 {
22730 emit_block_move_via_libcall (dst, src, count_exp, false);
22731 count_exp = const0_rtx;
22732 goto epilogue;
22733 }
22734 }
22735 else
22736 {
22737 rtx hot_label = gen_label_rtx ();
22738 jump_around_label = gen_label_rtx ();
22739 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22740 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22741 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22742 emit_block_move_via_libcall (dst, src, count_exp, false);
22743 emit_jump (jump_around_label);
22744 emit_label (hot_label);
22745 }
22746 }
22747
22748 /* Step 2: Alignment prologue. */
22749
22750 if (desired_align > align)
22751 {
22752 if (align_bytes == 0)
22753 {
22754 /* Except for the first move in the epilogue, we no longer know
22755 the constant offset in the aliasing info. It doesn't seem worth
22756 the pain to maintain it for the first move, so throw away
22757 the info early. */
22758 src = change_address (src, BLKmode, srcreg);
22759 dst = change_address (dst, BLKmode, destreg);
22760 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22761 desired_align);
22762 }
22763 else
22764 {
22765 /* If we know how many bytes need to be stored before dst is
22766 sufficiently aligned, maintain aliasing info accurately. */
22767 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22768 desired_align, align_bytes);
22769 count_exp = plus_constant (counter_mode (count_exp),
22770 count_exp, -align_bytes);
22771 count -= align_bytes;
22772 }
22773 if (need_zero_guard
22774 && (count < (unsigned HOST_WIDE_INT) size_needed
22775 || (align_bytes == 0
22776 && count < ((unsigned HOST_WIDE_INT) size_needed
22777 + desired_align - align))))
22778 {
22779 /* It is possible that we copied enough so the main loop will not
22780 execute. */
22781 gcc_assert (size_needed > 1);
22782 if (label == NULL_RTX)
22783 label = gen_label_rtx ();
22784 emit_cmp_and_jump_insns (count_exp,
22785 GEN_INT (size_needed),
22786 LTU, 0, counter_mode (count_exp), 1, label);
22787 if (expected_size == -1
22788 || expected_size < (desired_align - align) / 2 + size_needed)
22789 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22790 else
22791 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22792 }
22793 }
22794 if (label && size_needed == 1)
22795 {
22796 emit_label (label);
22797 LABEL_NUSES (label) = 1;
22798 label = NULL;
22799 epilogue_size_needed = 1;
22800 }
22801 else if (label == NULL_RTX)
22802 epilogue_size_needed = size_needed;
22803
22804 /* Step 3: Main loop. */
22805
22806 switch (alg)
22807 {
22808 case libcall:
22809 case no_stringop:
22810 gcc_unreachable ();
22811 case loop_1_byte:
22812 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22813 count_exp, QImode, 1, expected_size);
22814 break;
22815 case loop:
22816 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22817 count_exp, word_mode, 1, expected_size);
22818 break;
22819 case unrolled_loop:
22820 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22821 registers for 4 temporaries anyway. */
22822 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22823 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22824 expected_size);
22825 break;
22826 case rep_prefix_8_byte:
22827 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22828 DImode);
22829 break;
22830 case rep_prefix_4_byte:
22831 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22832 SImode);
22833 break;
22834 case rep_prefix_1_byte:
22835 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22836 QImode);
22837 break;
22838 }
22839 /* Properly adjust the offset of the src and dest memory for aliasing. */
22840 if (CONST_INT_P (count_exp))
22841 {
22842 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22843 (count / size_needed) * size_needed);
22844 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22845 (count / size_needed) * size_needed);
22846 }
22847 else
22848 {
22849 src = change_address (src, BLKmode, srcreg);
22850 dst = change_address (dst, BLKmode, destreg);
22851 }
22852
22853 /* Step 4: Epilogue to copy the remaining bytes. */
22854 epilogue:
22855 if (label)
22856 {
22857 /* When the main loop is done, COUNT_EXP might hold original count,
22858 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
22859 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
22860 bytes. Compensate if needed. */
22861
22862 if (size_needed < epilogue_size_needed)
22863 {
22864 tmp =
22865 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22866 GEN_INT (size_needed - 1), count_exp, 1,
22867 OPTAB_DIRECT);
22868 if (tmp != count_exp)
22869 emit_move_insn (count_exp, tmp);
22870 }
22871 emit_label (label);
22872 LABEL_NUSES (label) = 1;
22873 }
22874
22875 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22876 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22877 epilogue_size_needed);
22878 if (jump_around_label)
22879 emit_label (jump_around_label);
22880 return true;
22881 }
22882
22883 /* Helper function for memset (ix86_expand_setmem). For the QImode value 0xXY
22884 produce 0xXYXYXYXY of the width specified by MODE. This is essentially
22885 a * 0x01010101, but we can do slightly better than
22886 synth_mult by unwinding the sequence by hand on CPUs with
22887 slow multiply. */
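 /* For example (illustrative only), promoting 0xAB to SImode yields 0xABABABAB,
    either as reg * 0x01010101 when multiplication is cheap, or as
    reg |= reg << 8; reg |= reg << 16 when it is not (with a final << 32 step
    for DImode).  */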
22888 static rtx
22889 promote_duplicated_reg (enum machine_mode mode, rtx val)
22890 {
22891 enum machine_mode valmode = GET_MODE (val);
22892 rtx tmp;
22893 int nops = mode == DImode ? 3 : 2;
22894
22895 gcc_assert (mode == SImode || mode == DImode);
22896 if (val == const0_rtx)
22897 return copy_to_mode_reg (mode, const0_rtx);
22898 if (CONST_INT_P (val))
22899 {
22900 HOST_WIDE_INT v = INTVAL (val) & 255;
22901
22902 v |= v << 8;
22903 v |= v << 16;
22904 if (mode == DImode)
22905 v |= (v << 16) << 16;
22906 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22907 }
22908
22909 if (valmode == VOIDmode)
22910 valmode = QImode;
22911 if (valmode != QImode)
22912 val = gen_lowpart (QImode, val);
22913 if (mode == QImode)
22914 return val;
22915 if (!TARGET_PARTIAL_REG_STALL)
22916 nops--;
22917 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22918 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22919 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22920 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22921 {
22922 rtx reg = convert_modes (mode, QImode, val, true);
22923 tmp = promote_duplicated_reg (mode, const1_rtx);
22924 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22925 OPTAB_DIRECT);
22926 }
22927 else
22928 {
22929 rtx reg = convert_modes (mode, QImode, val, true);
22930
22931 if (!TARGET_PARTIAL_REG_STALL)
22932 if (mode == SImode)
22933 emit_insn (gen_movsi_insv_1 (reg, reg));
22934 else
22935 emit_insn (gen_movdi_insv_1 (reg, reg));
22936 else
22937 {
22938 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22939 NULL, 1, OPTAB_DIRECT);
22940 reg =
22941 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22942 }
22943 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22944 NULL, 1, OPTAB_DIRECT);
22945 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22946 if (mode == SImode)
22947 return reg;
22948 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22949 NULL, 1, OPTAB_DIRECT);
22950 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22951 return reg;
22952 }
22953 }
22954
22955 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22956 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22957 alignment from ALIGN to DESIRED_ALIGN. */
22958 static rtx
22959 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22960 {
22961 rtx promoted_val;
22962
22963 if (TARGET_64BIT
22964 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22965 promoted_val = promote_duplicated_reg (DImode, val);
22966 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22967 promoted_val = promote_duplicated_reg (SImode, val);
22968 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22969 promoted_val = promote_duplicated_reg (HImode, val);
22970 else
22971 promoted_val = val;
22972
22973 return promoted_val;
22974 }
22975
22976 /* Expand string set (memset) operation. Use i386 string operations
22977 when profitable. See the expand_movmem comment for an explanation of the
22978 individual steps performed. */
22979 bool
22980 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22981 rtx expected_align_exp, rtx expected_size_exp)
22982 {
22983 rtx destreg;
22984 rtx label = NULL;
22985 rtx tmp;
22986 rtx jump_around_label = NULL;
22987 HOST_WIDE_INT align = 1;
22988 unsigned HOST_WIDE_INT count = 0;
22989 HOST_WIDE_INT expected_size = -1;
22990 int size_needed = 0, epilogue_size_needed;
22991 int desired_align = 0, align_bytes = 0;
22992 enum stringop_alg alg;
22993 rtx promoted_val = NULL;
22994 bool force_loopy_epilogue = false;
22995 int dynamic_check;
22996 bool need_zero_guard = false;
22997
22998 if (CONST_INT_P (align_exp))
22999 align = INTVAL (align_exp);
23000 /* i386 can do misaligned accesses at a reasonable extra cost. */
23001 if (CONST_INT_P (expected_align_exp)
23002 && INTVAL (expected_align_exp) > align)
23003 align = INTVAL (expected_align_exp);
23004 if (CONST_INT_P (count_exp))
23005 count = expected_size = INTVAL (count_exp);
23006 if (CONST_INT_P (expected_size_exp) && count == 0)
23007 expected_size = INTVAL (expected_size_exp);
23008
23009 /* Make sure we don't need to care about overflow later on. */
23010 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23011 return false;
23012
23013 /* Step 0: Decide on preferred algorithm, desired alignment and
23014 size of chunks to be copied by main loop. */
23015
23016 alg = decide_alg (count, expected_size, true, &dynamic_check);
23017 desired_align = decide_alignment (align, alg, expected_size);
23018
23019 if (!TARGET_ALIGN_STRINGOPS)
23020 align = desired_align;
23021
23022 if (alg == libcall)
23023 return false;
23024 gcc_assert (alg != no_stringop);
23025 if (!count)
23026 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23027 destreg = copy_addr_to_reg (XEXP (dst, 0));
23028 switch (alg)
23029 {
23030 case libcall:
23031 case no_stringop:
23032 gcc_unreachable ();
23033 case loop:
23034 need_zero_guard = true;
23035 size_needed = GET_MODE_SIZE (word_mode);
23036 break;
23037 case unrolled_loop:
23038 need_zero_guard = true;
23039 size_needed = GET_MODE_SIZE (word_mode) * 4;
23040 break;
23041 case rep_prefix_8_byte:
23042 size_needed = 8;
23043 break;
23044 case rep_prefix_4_byte:
23045 size_needed = 4;
23046 break;
23047 case rep_prefix_1_byte:
23048 size_needed = 1;
23049 break;
23050 case loop_1_byte:
23051 need_zero_guard = true;
23052 size_needed = 1;
23053 break;
23054 }
23055 epilogue_size_needed = size_needed;
23056
23057 /* Step 1: Prologue guard. */
23058
23059 /* Alignment code needs count to be in register. */
23060 if (CONST_INT_P (count_exp) && desired_align > align)
23061 {
23062 if (INTVAL (count_exp) > desired_align
23063 && INTVAL (count_exp) > size_needed)
23064 {
23065 align_bytes
23066 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23067 if (align_bytes <= 0)
23068 align_bytes = 0;
23069 else
23070 align_bytes = desired_align - align_bytes;
23071 }
23072 if (align_bytes == 0)
23073 {
23074 enum machine_mode mode = SImode;
23075 if (TARGET_64BIT && (count & ~0xffffffff))
23076 mode = DImode;
23077 count_exp = force_reg (mode, count_exp);
23078 }
23079 }
23080 /* Do the cheap promotion to allow better CSE across the
23081 main loop and epilogue (i.e. one load of the big constant in
23082 front of all the code). */
23083 if (CONST_INT_P (val_exp))
23084 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23085 desired_align, align);
23086 /* Ensure that alignment prologue won't copy past end of block. */
23087 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23088 {
23089 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23090 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23091 Make sure it is power of 2. */
23092 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23093
23094 /* To improve performance of small blocks, we jump around the VAL
23095 promoting code. This means that if the promoted VAL is not constant,
23096 we might not use it in the epilogue and have to use the byte
23097 loop variant. */
23098 if (epilogue_size_needed > 2 && !promoted_val)
23099 force_loopy_epilogue = true;
23100 if (count)
23101 {
23102 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23103 {
23104 /* If main algorithm works on QImode, no epilogue is needed.
23105 For small sizes just don't align anything. */
23106 if (size_needed == 1)
23107 desired_align = align;
23108 else
23109 goto epilogue;
23110 }
23111 }
23112 else
23113 {
23114 label = gen_label_rtx ();
23115 emit_cmp_and_jump_insns (count_exp,
23116 GEN_INT (epilogue_size_needed),
23117 LTU, 0, counter_mode (count_exp), 1, label);
23118 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23119 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23120 else
23121 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23122 }
23123 }
23124 if (dynamic_check != -1)
23125 {
23126 rtx hot_label = gen_label_rtx ();
23127 jump_around_label = gen_label_rtx ();
23128 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23129 LEU, 0, counter_mode (count_exp), 1, hot_label);
23130 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23131 set_storage_via_libcall (dst, count_exp, val_exp, false);
23132 emit_jump (jump_around_label);
23133 emit_label (hot_label);
23134 }
23135
23136 /* Step 2: Alignment prologue. */
23137
23138 /* Do the expensive promotion once we branched off the small blocks. */
23139 if (!promoted_val)
23140 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23141 desired_align, align);
23142 gcc_assert (desired_align >= 1 && align >= 1);
23143
23144 if (desired_align > align)
23145 {
23146 if (align_bytes == 0)
23147 {
23148 /* Except for the first move in the epilogue, we no longer know
23149 the constant offset in the aliasing info. It doesn't seem worth
23150 the pain to maintain it for the first move, so throw away
23151 the info early. */
23152 dst = change_address (dst, BLKmode, destreg);
23153 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23154 desired_align);
23155 }
23156 else
23157 {
23158 /* If we know how many bytes need to be stored before dst is
23159 sufficiently aligned, maintain aliasing info accurately. */
23160 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23161 desired_align, align_bytes);
23162 count_exp = plus_constant (counter_mode (count_exp),
23163 count_exp, -align_bytes);
23164 count -= align_bytes;
23165 }
23166 if (need_zero_guard
23167 && (count < (unsigned HOST_WIDE_INT) size_needed
23168 || (align_bytes == 0
23169 && count < ((unsigned HOST_WIDE_INT) size_needed
23170 + desired_align - align))))
23171 {
23172 /* It is possible that we copied enough so the main loop will not
23173 execute. */
23174 gcc_assert (size_needed > 1);
23175 if (label == NULL_RTX)
23176 label = gen_label_rtx ();
23177 emit_cmp_and_jump_insns (count_exp,
23178 GEN_INT (size_needed),
23179 LTU, 0, counter_mode (count_exp), 1, label);
23180 if (expected_size == -1
23181 || expected_size < (desired_align - align) / 2 + size_needed)
23182 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23183 else
23184 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23185 }
23186 }
23187 if (label && size_needed == 1)
23188 {
23189 emit_label (label);
23190 LABEL_NUSES (label) = 1;
23191 label = NULL;
23192 promoted_val = val_exp;
23193 epilogue_size_needed = 1;
23194 }
23195 else if (label == NULL_RTX)
23196 epilogue_size_needed = size_needed;
23197
23198 /* Step 3: Main loop. */
23199
23200 switch (alg)
23201 {
23202 case libcall:
23203 case no_stringop:
23204 gcc_unreachable ();
23205 case loop_1_byte:
23206 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23207 count_exp, QImode, 1, expected_size);
23208 break;
23209 case loop:
23210 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23211 count_exp, word_mode, 1, expected_size);
23212 break;
23213 case unrolled_loop:
23214 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23215 count_exp, word_mode, 4, expected_size);
23216 break;
23217 case rep_prefix_8_byte:
23218 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23219 DImode, val_exp);
23220 break;
23221 case rep_prefix_4_byte:
23222 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23223 SImode, val_exp);
23224 break;
23225 case rep_prefix_1_byte:
23226 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23227 QImode, val_exp);
23228 break;
23229 }
23230 /* Properly adjust the offset of the src and dest memory for aliasing. */
23231 if (CONST_INT_P (count_exp))
23232 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23233 (count / size_needed) * size_needed);
23234 else
23235 dst = change_address (dst, BLKmode, destreg);
23236
23237 /* Step 4: Epilogue to copy the remaining bytes. */
23238
23239 if (label)
23240 {
23241 /* When the main loop is done, COUNT_EXP might hold original count,
23242 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
23243 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
23244 bytes. Compensate if needed. */
23245
23246 if (size_needed < epilogue_size_needed)
23247 {
23248 tmp =
23249 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23250 GEN_INT (size_needed - 1), count_exp, 1,
23251 OPTAB_DIRECT);
23252 if (tmp != count_exp)
23253 emit_move_insn (count_exp, tmp);
23254 }
23255 emit_label (label);
23256 LABEL_NUSES (label) = 1;
23257 }
23258 epilogue:
23259 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23260 {
23261 if (force_loopy_epilogue)
23262 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23263 epilogue_size_needed);
23264 else
23265 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23266 epilogue_size_needed);
23267 }
23268 if (jump_around_label)
23269 emit_label (jump_around_label);
23270 return true;
23271 }
23272
23273 /* Expand the appropriate insns for doing strlen if not just doing
23274 repnz; scasb
23275
23276 out = result, initialized with the start address
23277 align_rtx = alignment of the address.
23278 scratch = scratch register, initialized with the start address when
23279 not aligned, otherwise undefined
23280
23281 This is just the body. It needs the initializations mentioned above and
23282 some address computing at the end. These things are done in i386.md. */
23283
23284 static void
23285 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23286 {
23287 int align;
23288 rtx tmp;
23289 rtx align_2_label = NULL_RTX;
23290 rtx align_3_label = NULL_RTX;
23291 rtx align_4_label = gen_label_rtx ();
23292 rtx end_0_label = gen_label_rtx ();
23293 rtx mem;
23294 rtx tmpreg = gen_reg_rtx (SImode);
23295 rtx scratch = gen_reg_rtx (SImode);
23296 rtx cmp;
23297
23298 align = 0;
23299 if (CONST_INT_P (align_rtx))
23300 align = INTVAL (align_rtx);
23301
23302 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23303
23304 /* Is there a known alignment and is it less than 4? */
23305 if (align < 4)
23306 {
23307 rtx scratch1 = gen_reg_rtx (Pmode);
23308 emit_move_insn (scratch1, out);
23309 /* Is there a known alignment and is it not 2? */
23310 if (align != 2)
23311 {
23312 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23313 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23314
23315 /* Leave just the 3 lower bits. */
23316 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23317 NULL_RTX, 0, OPTAB_WIDEN);
23318
23319 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23320 Pmode, 1, align_4_label);
23321 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23322 Pmode, 1, align_2_label);
23323 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23324 Pmode, 1, align_3_label);
23325 }
23326 else
23327 {
23328 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23329 check whether it is aligned to a 4-byte boundary. */
23330
23331 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23332 NULL_RTX, 0, OPTAB_WIDEN);
23333
23334 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23335 Pmode, 1, align_4_label);
23336 }
23337
23338 mem = change_address (src, QImode, out);
23339
23340 /* Now compare the bytes. */
23341
23342 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23343 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23344 QImode, 1, end_0_label);
23345
23346 /* Increment the address. */
23347 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23348
23349 /* Not needed with an alignment of 2 */
23350 if (align != 2)
23351 {
23352 emit_label (align_2_label);
23353
23354 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23355 end_0_label);
23356
23357 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23358
23359 emit_label (align_3_label);
23360 }
23361
23362 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23363 end_0_label);
23364
23365 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23366 }
23367
23368 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23369 align this loop; it only makes the program larger and does not
23370 speed it up. */
23371 emit_label (align_4_label);
23372
23373 mem = change_address (src, SImode, out);
23374 emit_move_insn (scratch, mem);
23375 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23376
23377 /* This formula yields a nonzero result iff one of the bytes is zero.
23378 This saves three branches inside the loop and many cycles. */
23379
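 /* Informal explanation of the trick used below: the insns compute
    (x - 0x01010101) & ~x & 0x80808080.  Subtracting 0x01010101 makes a zero
    byte wrap to 0xff and thus set its high bit, while the ~x term masks out
    bytes whose high bit was already set in x.  E.g. x = 0x12003456 gives
    0x00800000 (byte 2 is zero), whereas x = 0x61626364 gives 0.  */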
23380 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23381 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23382 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23383 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23384 gen_int_mode (0x80808080, SImode)));
23385 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23386 align_4_label);
23387
23388 if (TARGET_CMOVE)
23389 {
23390 rtx reg = gen_reg_rtx (SImode);
23391 rtx reg2 = gen_reg_rtx (Pmode);
23392 emit_move_insn (reg, tmpreg);
23393 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23394
23395 /* If zero is not in the first two bytes, move two bytes forward. */
23396 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23397 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23398 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23399 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23400 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23401 reg,
23402 tmpreg)));
23403 /* Emit lea manually to avoid clobbering of flags. */
23404 emit_insn (gen_rtx_SET (SImode, reg2,
23405 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23406
23407 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23408 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23409 emit_insn (gen_rtx_SET (VOIDmode, out,
23410 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23411 reg2,
23412 out)));
23413 }
23414 else
23415 {
23416 rtx end_2_label = gen_label_rtx ();
23417 /* Is zero in the first two bytes? */
23418
23419 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23420 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23421 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23422 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23423 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23424 pc_rtx);
23425 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23426 JUMP_LABEL (tmp) = end_2_label;
23427
23428 /* Not in the first two. Move two bytes forward. */
23429 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23430 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23431
23432 emit_label (end_2_label);
23433
23434 }
23435
23436 /* Avoid branch in fixing the byte. */
23437 tmpreg = gen_lowpart (QImode, tmpreg);
23438 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23439 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23440 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23441 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23442
23443 emit_label (end_0_label);
23444 }
23445
23446 /* Expand strlen. */
23447
23448 bool
23449 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23450 {
23451 rtx addr, scratch1, scratch2, scratch3, scratch4;
23452
23453 /* The generic case of the strlen expander is long. Avoid expanding it
23454 unless TARGET_INLINE_ALL_STRINGOPS. */
23455
23456 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23457 && !TARGET_INLINE_ALL_STRINGOPS
23458 && !optimize_insn_for_size_p ()
23459 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23460 return false;
23461
23462 addr = force_reg (Pmode, XEXP (src, 0));
23463 scratch1 = gen_reg_rtx (Pmode);
23464
23465 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23466 && !optimize_insn_for_size_p ())
23467 {
23468 /* Well, it seems that some optimizer does not combine a call like
23469 foo(strlen(bar), strlen(bar));
23470 when the move and the subtraction are done here. It does calculate
23471 the length just once when these instructions are done inside of
23472 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23473 often used and I use one fewer register for the lifetime of
23474 output_strlen_unroll() this is better. */
23475
23476 emit_move_insn (out, addr);
23477
23478 ix86_expand_strlensi_unroll_1 (out, src, align);
23479
23480 /* strlensi_unroll_1 returns the address of the zero at the end of
23481 the string, like memchr(), so compute the length by subtracting
23482 the start address. */
23483 emit_insn (ix86_gen_sub3 (out, out, addr));
23484 }
23485 else
23486 {
23487 rtx unspec;
23488
23489 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23490 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23491 return false;
23492
23493 scratch2 = gen_reg_rtx (Pmode);
23494 scratch3 = gen_reg_rtx (Pmode);
23495 scratch4 = force_reg (Pmode, constm1_rtx);
23496
23497 emit_move_insn (scratch3, addr);
23498 eoschar = force_reg (QImode, eoschar);
23499
23500 src = replace_equiv_address_nv (src, scratch3);
23501
23502 /* If .md starts supporting :P, this can be done in .md. */
23503 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23504 scratch4), UNSPEC_SCAS);
23505 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23506 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23507 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23508 }
23509 return true;
23510 }
23511
23512 /* For a given symbol (function), construct code to compute the address of its
23513 PLT entry in the large x86-64 PIC model. */
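/* The PLT offset of SYMBOL is materialized as an UNSPEC_PLTOFF constant and
   then added to the PIC register.  */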
23514 static rtx
23515 construct_plt_address (rtx symbol)
23516 {
23517 rtx tmp, unspec;
23518
23519 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23520 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23521 gcc_assert (Pmode == DImode);
23522
23523 tmp = gen_reg_rtx (Pmode);
23524 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23525
23526 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23527 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23528 return tmp;
23529 }
23530
23531 rtx
23532 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23533 rtx callarg2,
23534 rtx pop, bool sibcall)
23535 {
23536 /* We need to represent that the SI and DI registers (together with
23537 XMM6-XMM15) are clobbered by SYSV calls. */
23538 static int clobbered_registers[] = {
23539 XMM6_REG, XMM7_REG, XMM8_REG,
23540 XMM9_REG, XMM10_REG, XMM11_REG,
23541 XMM12_REG, XMM13_REG, XMM14_REG,
23542 XMM15_REG, SI_REG, DI_REG
23543 };
23544 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23545 rtx use = NULL, call;
23546 unsigned int vec_len;
23547
23548 if (pop == const0_rtx)
23549 pop = NULL;
23550 gcc_assert (!TARGET_64BIT || !pop);
23551
23552 if (TARGET_MACHO && !TARGET_64BIT)
23553 {
23554 #if TARGET_MACHO
23555 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23556 fnaddr = machopic_indirect_call_target (fnaddr);
23557 #endif
23558 }
23559 else
23560 {
23561 /* Static functions and indirect calls don't need the pic register. */
23562 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23563 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23564 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23565 use_reg (&use, pic_offset_table_rtx);
23566 }
23567
23568 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23569 {
23570 rtx al = gen_rtx_REG (QImode, AX_REG);
23571 emit_move_insn (al, callarg2);
23572 use_reg (&use, al);
23573 }
23574
23575 if (ix86_cmodel == CM_LARGE_PIC
23576 && MEM_P (fnaddr)
23577 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23578 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23579 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23580 else if (sibcall
23581 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23582 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23583 {
23584 fnaddr = XEXP (fnaddr, 0);
23585 if (GET_MODE (fnaddr) != word_mode)
23586 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23587 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23588 }
23589
23590 vec_len = 0;
23591 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23592 if (retval)
23593 call = gen_rtx_SET (VOIDmode, retval, call);
23594 vec[vec_len++] = call;
23595
23596 if (pop)
23597 {
23598 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23599 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23600 vec[vec_len++] = pop;
23601 }
23602
23603 if (TARGET_64BIT_MS_ABI
23604 && (!callarg2 || INTVAL (callarg2) != -2))
23605 {
23606 unsigned i;
23607
23608 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23609 UNSPEC_MS_TO_SYSV_CALL);
23610
23611 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23612 vec[vec_len++]
23613 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23614 ? TImode : DImode,
23615 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23616 ? TImode : DImode,
23617 clobbered_registers[i]));
23618 }
23619
23620 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
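  /* Before reload the marker is attached to the call PARALLEL and split
     off later by ix86_split_call_vzeroupper; after reload the vzeroupper
     is emitted directly in front of the call.  */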
23621 if (TARGET_VZEROUPPER)
23622 {
23623 int avx256;
23624 if (cfun->machine->callee_pass_avx256_p)
23625 {
23626 if (cfun->machine->callee_return_avx256_p)
23627 avx256 = callee_return_pass_avx256;
23628 else
23629 avx256 = callee_pass_avx256;
23630 }
23631 else if (cfun->machine->callee_return_avx256_p)
23632 avx256 = callee_return_avx256;
23633 else
23634 avx256 = call_no_avx256;
23635
23636 if (reload_completed)
23637 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23638 else
23639 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23640 gen_rtvec (1, GEN_INT (avx256)),
23641 UNSPEC_CALL_NEEDS_VZEROUPPER);
23642 }
23643
23644 if (vec_len > 1)
23645 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23646 call = emit_call_insn (call);
23647 if (use)
23648 CALL_INSN_FUNCTION_USAGE (call) = use;
23649
23650 return call;
23651 }
23652
23653 void
23654 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23655 {
23656 rtx pat = PATTERN (insn);
23657 rtvec vec = XVEC (pat, 0);
23658 int len = GET_NUM_ELEM (vec) - 1;
23659
23660 /* Strip off the last entry of the parallel. */
23661 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23662 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23663 if (len == 1)
23664 pat = RTVEC_ELT (vec, 0);
23665 else
23666 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23667
23668 emit_insn (gen_avx_vzeroupper (vzeroupper));
23669 emit_call_insn (pat);
23670 }
23671
23672 /* Output the assembly for a call instruction. */
23673
23674 const char *
23675 ix86_output_call_insn (rtx insn, rtx call_op)
23676 {
23677 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23678 bool seh_nop_p = false;
23679 const char *xasm;
23680
23681 if (SIBLING_CALL_P (insn))
23682 {
23683 if (direct_p)
23684 xasm = "jmp\t%P0";
23685 /* SEH epilogue detection requires the indirect branch case
23686 to include REX.W. */
23687 else if (TARGET_SEH)
23688 xasm = "rex.W jmp %A0";
23689 else
23690 xasm = "jmp\t%A0";
23691
23692 output_asm_insn (xasm, &call_op);
23693 return "";
23694 }
23695
23696 /* SEH unwinding can require an extra nop to be emitted in several
23697 circumstances. Determine if we have one of those. */
23698 if (TARGET_SEH)
23699 {
23700 rtx i;
23701
23702 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23703 {
23704 /* If we get to another real insn, we don't need the nop. */
23705 if (INSN_P (i))
23706 break;
23707
23708 /* If we get to the epilogue note, prevent a catch region from
23709 being adjacent to the standard epilogue sequence. If non-
23710 call-exceptions, we'll have done this during epilogue emission. */
23711 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23712 && !flag_non_call_exceptions
23713 && !can_throw_internal (insn))
23714 {
23715 seh_nop_p = true;
23716 break;
23717 }
23718 }
23719
23720 /* If we didn't find a real insn following the call, prevent the
23721 unwinder from looking into the next function. */
23722 if (i == NULL)
23723 seh_nop_p = true;
23724 }
23725
23726 if (direct_p)
23727 xasm = "call\t%P0";
23728 else
23729 xasm = "call\t%A0";
23730
23731 output_asm_insn (xasm, &call_op);
23732
23733 if (seh_nop_p)
23734 return "nop";
23735
23736 return "";
23737 }
23738 \f
23739 /* Clear stack slot assignments remembered from previous functions.
23740 This is called from INIT_EXPANDERS once before RTL is emitted for each
23741 function. */
23742
23743 static struct machine_function *
23744 ix86_init_machine_status (void)
23745 {
23746 struct machine_function *f;
23747
23748 f = ggc_alloc_cleared_machine_function ();
23749 f->use_fast_prologue_epilogue_nregs = -1;
23750 f->tls_descriptor_call_expanded_p = 0;
23751 f->call_abi = ix86_abi;
23752
23753 return f;
23754 }
23755
23756 /* Return a MEM corresponding to a stack slot with mode MODE.
23757 Allocate a new slot if necessary.
23758
23759 The RTL for a function can have several slots available: N is
23760 which slot to use. */
23761
23762 rtx
23763 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23764 {
23765 struct stack_local_entry *s;
23766
23767 gcc_assert (n < MAX_386_STACK_LOCALS);
23768
23769 /* Virtual slot is valid only before vregs are instantiated. */
23770 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23771
23772 for (s = ix86_stack_locals; s; s = s->next)
23773 if (s->mode == mode && s->n == n)
23774 return validize_mem (copy_rtx (s->rtl));
23775
23776 s = ggc_alloc_stack_local_entry ();
23777 s->n = n;
23778 s->mode = mode;
23779 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23780
23781 s->next = ix86_stack_locals;
23782 ix86_stack_locals = s;
23783 return validize_mem (s->rtl);
23784 }
23785 \f
23786 /* Calculate the length of the memory address in the instruction encoding.
23787 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23788 or other prefixes. We never generate addr32 prefix for LEA insn. */
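/* The returned length covers the segment-override and addr32 prefixes, any
   extra modrm/SIB byte, and the displacement bytes.  */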
23789
23790 int
23791 memory_address_length (rtx addr, bool lea)
23792 {
23793 struct ix86_address parts;
23794 rtx base, index, disp;
23795 int len;
23796 int ok;
23797
23798 if (GET_CODE (addr) == PRE_DEC
23799 || GET_CODE (addr) == POST_INC
23800 || GET_CODE (addr) == PRE_MODIFY
23801 || GET_CODE (addr) == POST_MODIFY)
23802 return 0;
23803
23804 ok = ix86_decompose_address (addr, &parts);
23805 gcc_assert (ok);
23806
23807 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23808
23809 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23810 if (TARGET_64BIT && !lea
23811 && (SImode_address_operand (addr, VOIDmode)
23812 || (parts.base && GET_MODE (parts.base) == SImode)
23813 || (parts.index && GET_MODE (parts.index) == SImode)))
23814 len++;
23815
23816 base = parts.base;
23817 index = parts.index;
23818 disp = parts.disp;
23819
23820 if (base && GET_CODE (base) == SUBREG)
23821 base = SUBREG_REG (base);
23822 if (index && GET_CODE (index) == SUBREG)
23823 index = SUBREG_REG (index);
23824
23825 gcc_assert (base == NULL_RTX || REG_P (base));
23826 gcc_assert (index == NULL_RTX || REG_P (index));
23827
23828 /* Rule of thumb:
23829 - esp as the base always wants an index,
23830 - ebp as the base always wants a displacement,
23831 - r12 as the base always wants an index,
23832 - r13 as the base always wants a displacement. */
23833
23834 /* Register Indirect. */
23835 if (base && !index && !disp)
23836 {
23837 /* esp (for its index) and ebp (for its displacement) need
23838 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23839 code. */
23840 if (base == arg_pointer_rtx
23841 || base == frame_pointer_rtx
23842 || REGNO (base) == SP_REG
23843 || REGNO (base) == BP_REG
23844 || REGNO (base) == R12_REG
23845 || REGNO (base) == R13_REG)
23846 len++;
23847 }
23848
23849 /* Direct Addressing. In 64-bit mode, mod 00 r/m 5
23850 is not disp32 but disp32(%rip), so for plain disp32
23851 a SIB byte is needed, unless print_operand_address
23852 optimizes it into disp32(%rip) or (%rip) is implied
23853 by an UNSPEC. */
23854 else if (disp && !base && !index)
23855 {
23856 len += 4;
23857 if (TARGET_64BIT)
23858 {
23859 rtx symbol = disp;
23860
23861 if (GET_CODE (disp) == CONST)
23862 symbol = XEXP (disp, 0);
23863 if (GET_CODE (symbol) == PLUS
23864 && CONST_INT_P (XEXP (symbol, 1)))
23865 symbol = XEXP (symbol, 0);
23866
23867 if (GET_CODE (symbol) != LABEL_REF
23868 && (GET_CODE (symbol) != SYMBOL_REF
23869 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23870 && (GET_CODE (symbol) != UNSPEC
23871 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23872 && XINT (symbol, 1) != UNSPEC_PCREL
23873 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23874 len++;
23875 }
23876 }
23877 else
23878 {
23879 /* Find the length of the displacement constant. */
23880 if (disp)
23881 {
23882 if (base && satisfies_constraint_K (disp))
23883 len += 1;
23884 else
23885 len += 4;
23886 }
23887 /* ebp always wants a displacement. Similarly r13. */
23888 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23889 len++;
23890
23891 /* An index requires the two-byte modrm form.... */
23892 if (index
23893 /* ...like esp (or r12), which always wants an index. */
23894 || base == arg_pointer_rtx
23895 || base == frame_pointer_rtx
23896 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23897 len++;
23898 }
23899
23900 return len;
23901 }
23902
23903 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
23904 is set, expect that the insn has an 8-bit immediate alternative. */
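/* The value returned is the size in bytes of the insn's single constant
   operand, or 0 if the insn has no constant operand.  */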
23905 int
23906 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23907 {
23908 int len = 0;
23909 int i;
23910 extract_insn_cached (insn);
23911 for (i = recog_data.n_operands - 1; i >= 0; --i)
23912 if (CONSTANT_P (recog_data.operand[i]))
23913 {
23914 enum attr_mode mode = get_attr_mode (insn);
23915
23916 gcc_assert (!len);
23917 if (shortform && CONST_INT_P (recog_data.operand[i]))
23918 {
23919 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23920 switch (mode)
23921 {
23922 case MODE_QI:
23923 len = 1;
23924 continue;
23925 case MODE_HI:
23926 ival = trunc_int_for_mode (ival, HImode);
23927 break;
23928 case MODE_SI:
23929 ival = trunc_int_for_mode (ival, SImode);
23930 break;
23931 default:
23932 break;
23933 }
23934 if (IN_RANGE (ival, -128, 127))
23935 {
23936 len = 1;
23937 continue;
23938 }
23939 }
23940 switch (mode)
23941 {
23942 case MODE_QI:
23943 len = 1;
23944 break;
23945 case MODE_HI:
23946 len = 2;
23947 break;
23948 case MODE_SI:
23949 len = 4;
23950 break;
23951 /* Immediates for DImode instructions are encoded
23952 as 32bit sign extended values. */
23953 case MODE_DI:
23954 len = 4;
23955 break;
23956 default:
23957 fatal_insn ("unknown insn mode", insn);
23958 }
23959 }
23960 return len;
23961 }
23962
23963 /* Compute default value for "length_address" attribute. */
23964 int
23965 ix86_attr_length_address_default (rtx insn)
23966 {
23967 int i;
23968
23969 if (get_attr_type (insn) == TYPE_LEA)
23970 {
23971 rtx set = PATTERN (insn), addr;
23972
23973 if (GET_CODE (set) == PARALLEL)
23974 set = XVECEXP (set, 0, 0);
23975
23976 gcc_assert (GET_CODE (set) == SET);
23977
23978 addr = SET_SRC (set);
23979
23980 return memory_address_length (addr, true);
23981 }
23982
23983 extract_insn_cached (insn);
23984 for (i = recog_data.n_operands - 1; i >= 0; --i)
23985 if (MEM_P (recog_data.operand[i]))
23986 {
23987 constrain_operands_cached (reload_completed);
23988 if (which_alternative != -1)
23989 {
23990 const char *constraints = recog_data.constraints[i];
23991 int alt = which_alternative;
23992
23993 while (*constraints == '=' || *constraints == '+')
23994 constraints++;
23995 while (alt-- > 0)
23996 while (*constraints++ != ',')
23997 ;
23998 /* Skip ignored operands. */
23999 if (*constraints == 'X')
24000 continue;
24001 }
24002 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24003 }
24004 return 0;
24005 }
24006
24007 /* Compute default value for "length_vex" attribute. It includes
24008 2 or 3 byte VEX prefix and 1 opcode byte. */
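/* The result is therefore either 3 (2-byte VEX prefix plus opcode byte) or
   4 (3-byte VEX prefix plus opcode byte).  */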
24009
24010 int
24011 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24012 {
24013 int i;
24014
24015 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
24016 requires the 3-byte VEX prefix. */
24017 if (!has_0f_opcode || has_vex_w)
24018 return 3 + 1;
24019
24020 /* We can always use 2 byte VEX prefix in 32bit. */
24021 if (!TARGET_64BIT)
24022 return 2 + 1;
24023
24024 extract_insn_cached (insn);
24025
24026 for (i = recog_data.n_operands - 1; i >= 0; --i)
24027 if (REG_P (recog_data.operand[i]))
24028 {
24029 /* REX.W bit uses 3 byte VEX prefix. */
24030 if (GET_MODE (recog_data.operand[i]) == DImode
24031 && GENERAL_REG_P (recog_data.operand[i]))
24032 return 3 + 1;
24033 }
24034 else
24035 {
24036 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24037 if (MEM_P (recog_data.operand[i])
24038 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24039 return 3 + 1;
24040 }
24041
24042 return 2 + 1;
24043 }
24044 \f
24045 /* Return the maximum number of instructions a cpu can issue. */
24046
24047 static int
24048 ix86_issue_rate (void)
24049 {
24050 switch (ix86_tune)
24051 {
24052 case PROCESSOR_PENTIUM:
24053 case PROCESSOR_ATOM:
24054 case PROCESSOR_K6:
24055 case PROCESSOR_BTVER2:
24056 return 2;
24057
24058 case PROCESSOR_PENTIUMPRO:
24059 case PROCESSOR_PENTIUM4:
24060 case PROCESSOR_CORE2_32:
24061 case PROCESSOR_CORE2_64:
24062 case PROCESSOR_COREI7_32:
24063 case PROCESSOR_COREI7_64:
24064 case PROCESSOR_ATHLON:
24065 case PROCESSOR_K8:
24066 case PROCESSOR_AMDFAM10:
24067 case PROCESSOR_NOCONA:
24068 case PROCESSOR_GENERIC32:
24069 case PROCESSOR_GENERIC64:
24070 case PROCESSOR_BDVER1:
24071 case PROCESSOR_BDVER2:
24072 case PROCESSOR_BTVER1:
24073 return 3;
24074
24075 default:
24076 return 1;
24077 }
24078 }
24079
24080 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24081 set by DEP_INSN and nothing else set by DEP_INSN. */
24082
24083 static bool
24084 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24085 {
24086 rtx set, set2;
24087
24088 /* Simplify the test for uninteresting insns. */
24089 if (insn_type != TYPE_SETCC
24090 && insn_type != TYPE_ICMOV
24091 && insn_type != TYPE_FCMOV
24092 && insn_type != TYPE_IBR)
24093 return false;
24094
24095 if ((set = single_set (dep_insn)) != 0)
24096 {
24097 set = SET_DEST (set);
24098 set2 = NULL_RTX;
24099 }
24100 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24101 && XVECLEN (PATTERN (dep_insn), 0) == 2
24102 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24103 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24104 {
24105 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24106 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24107 }
24108 else
24109 return false;
24110
24111 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24112 return false;
24113
24114 /* This test is true if the dependent insn reads the flags but
24115 not any other potentially set register. */
24116 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24117 return false;
24118
24119 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24120 return false;
24121
24122 return true;
24123 }
24124
24125 /* Return true iff USE_INSN has a memory address with operands set by
24126 SET_INSN. */
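/* Only one MEM operand (the first one found) is examined.  */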
24127
24128 bool
24129 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24130 {
24131 int i;
24132 extract_insn_cached (use_insn);
24133 for (i = recog_data.n_operands - 1; i >= 0; --i)
24134 if (MEM_P (recog_data.operand[i]))
24135 {
24136 rtx addr = XEXP (recog_data.operand[i], 0);
24137 return modified_in_p (addr, set_insn) != 0;
24138 }
24139 return false;
24140 }
24141
24142 static int
24143 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24144 {
24145 enum attr_type insn_type, dep_insn_type;
24146 enum attr_memory memory;
24147 rtx set, set2;
24148 int dep_insn_code_number;
24149
24150 /* Anti and output dependencies have zero cost on all CPUs. */
24151 if (REG_NOTE_KIND (link) != 0)
24152 return 0;
24153
24154 dep_insn_code_number = recog_memoized (dep_insn);
24155
24156 /* If we can't recognize the insns, we can't really do anything. */
24157 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24158 return cost;
24159
24160 insn_type = get_attr_type (insn);
24161 dep_insn_type = get_attr_type (dep_insn);
24162
24163 switch (ix86_tune)
24164 {
24165 case PROCESSOR_PENTIUM:
24166 /* Address Generation Interlock adds a cycle of latency. */
24167 if (insn_type == TYPE_LEA)
24168 {
24169 rtx addr = PATTERN (insn);
24170
24171 if (GET_CODE (addr) == PARALLEL)
24172 addr = XVECEXP (addr, 0, 0);
24173
24174 gcc_assert (GET_CODE (addr) == SET);
24175
24176 addr = SET_SRC (addr);
24177 if (modified_in_p (addr, dep_insn))
24178 cost += 1;
24179 }
24180 else if (ix86_agi_dependent (dep_insn, insn))
24181 cost += 1;
24182
24183 /* ??? Compares pair with jump/setcc. */
24184 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24185 cost = 0;
24186
24187 /* Floating point stores require value to be ready one cycle earlier. */
24188 if (insn_type == TYPE_FMOV
24189 && get_attr_memory (insn) == MEMORY_STORE
24190 && !ix86_agi_dependent (dep_insn, insn))
24191 cost += 1;
24192 break;
24193
24194 case PROCESSOR_PENTIUMPRO:
24195 memory = get_attr_memory (insn);
24196
24197 /* INT->FP conversion is expensive. */
24198 if (get_attr_fp_int_src (dep_insn))
24199 cost += 5;
24200
24201 /* There is one cycle extra latency between an FP op and a store. */
24202 if (insn_type == TYPE_FMOV
24203 && (set = single_set (dep_insn)) != NULL_RTX
24204 && (set2 = single_set (insn)) != NULL_RTX
24205 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24206 && MEM_P (SET_DEST (set2)))
24207 cost += 1;
24208
24209 /* Show the ability of the reorder buffer to hide the latency of a load
24210 by executing it in parallel with the previous instruction, when that
24211 instruction is not needed to compute the address. */
24212 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24213 && !ix86_agi_dependent (dep_insn, insn))
24214 {
24215 /* Claim that moves take one cycle, as the core can issue one load
24216 at a time and the next load can start a cycle later. */
24217 if (dep_insn_type == TYPE_IMOV
24218 || dep_insn_type == TYPE_FMOV)
24219 cost = 1;
24220 else if (cost > 1)
24221 cost--;
24222 }
24223 break;
24224
24225 case PROCESSOR_K6:
24226 memory = get_attr_memory (insn);
24227
24228 /* The esp dependency is resolved before the instruction is really
24229 finished. */
24230 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24231 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24232 return 1;
24233
24234 /* INT->FP conversion is expensive. */
24235 if (get_attr_fp_int_src (dep_insn))
24236 cost += 5;
24237
24238 /* Show the ability of the reorder buffer to hide the latency of a load
24239 by executing it in parallel with the previous instruction, when that
24240 instruction is not needed to compute the address. */
24241 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24242 && !ix86_agi_dependent (dep_insn, insn))
24243 {
24244 /* Claim that moves take one cycle, as the core can issue one load
24245 at a time and the next load can start a cycle later. */
24246 if (dep_insn_type == TYPE_IMOV
24247 || dep_insn_type == TYPE_FMOV)
24248 cost = 1;
24249 else if (cost > 2)
24250 cost -= 2;
24251 else
24252 cost = 1;
24253 }
24254 break;
24255
24256 case PROCESSOR_ATHLON:
24257 case PROCESSOR_K8:
24258 case PROCESSOR_AMDFAM10:
24259 case PROCESSOR_BDVER1:
24260 case PROCESSOR_BDVER2:
24261 case PROCESSOR_BTVER1:
24262 case PROCESSOR_BTVER2:
24263 case PROCESSOR_ATOM:
24264 case PROCESSOR_GENERIC32:
24265 case PROCESSOR_GENERIC64:
24266 memory = get_attr_memory (insn);
24267
24268 /* Show the ability of the reorder buffer to hide the latency of a load
24269 by executing it in parallel with the previous instruction, when that
24270 instruction is not needed to compute the address. */
24271 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24272 && !ix86_agi_dependent (dep_insn, insn))
24273 {
24274 enum attr_unit unit = get_attr_unit (insn);
24275 int loadcost = 3;
24276
24277 /* Because of the difference between the length of integer and
24278 floating unit pipeline preparation stages, the memory operands
24279 for floating point are cheaper.
24280
24281 ??? For Athlon the difference is most probably 2. */
24282 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24283 loadcost = 3;
24284 else
24285 loadcost = TARGET_ATHLON ? 2 : 0;
24286
24287 if (cost >= loadcost)
24288 cost -= loadcost;
24289 else
24290 cost = 0;
24291 }
24292
24293 default:
24294 break;
24295 }
24296
24297 return cost;
24298 }
24299
24300 /* How many alternative schedules to try. This should be as wide as the
24301 scheduling freedom in the DFA, but no wider. Making this value too
24302 large results in extra work for the scheduler. */
24303
24304 static int
24305 ia32_multipass_dfa_lookahead (void)
24306 {
24307 switch (ix86_tune)
24308 {
24309 case PROCESSOR_PENTIUM:
24310 return 2;
24311
24312 case PROCESSOR_PENTIUMPRO:
24313 case PROCESSOR_K6:
24314 return 1;
24315
24316 case PROCESSOR_CORE2_32:
24317 case PROCESSOR_CORE2_64:
24318 case PROCESSOR_COREI7_32:
24319 case PROCESSOR_COREI7_64:
24320 case PROCESSOR_ATOM:
24321 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24322 as the number of instructions that can be executed in one cycle,
24323 i.e., issue_rate. I wonder why the tuning for many CPUs does not do this. */
24324 if (reload_completed)
24325 return ix86_issue_rate ();
24326 /* Don't use lookahead for pre-reload schedule to save compile time. */
24327 return 0;
24328
24329 default:
24330 return 0;
24331 }
24332 }
24333
24334 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24335 execution. It is applied if
24336 (1) an IMUL instruction is at the top of the list;
24337 (2) there is exactly one producer of an independent IMUL instruction
24338 in the ready list;
24339 and, if so, (3) that producer is moved to the top of the ready list.
24340 Returns the issue rate. */
24341
24342 static int
24343 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24344 int clock_var ATTRIBUTE_UNUSED)
24345 {
24346 static int issue_rate = -1;
24347 int n_ready = *pn_ready;
24348 rtx insn, insn1, insn2;
24349 int i;
24350 sd_iterator_def sd_it;
24351 dep_t dep;
24352 int index = -1;
24353
24354 /* Set up issue rate. */
24355 issue_rate = ix86_issue_rate ();
24356
24357 /* Do reordering for Atom only. */
24358 if (ix86_tune != PROCESSOR_ATOM)
24359 return issue_rate;
24360 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24361 if (!reload_completed)
24362 return issue_rate;
24363 /* Nothing to do if ready list contains only 1 instruction. */
24364 if (n_ready <= 1)
24365 return issue_rate;
24366
24367 /* Check that IMUL instruction is on the top of ready list. */
24368 insn = ready[n_ready - 1];
24369 if (!NONDEBUG_INSN_P (insn))
24370 return issue_rate;
24371 insn = PATTERN (insn);
24372 if (GET_CODE (insn) == PARALLEL)
24373 insn = XVECEXP (insn, 0, 0);
24374 if (GET_CODE (insn) != SET)
24375 return issue_rate;
24376 if (!(GET_CODE (SET_SRC (insn)) == MULT
24377 && GET_MODE (SET_SRC (insn)) == SImode))
24378 return issue_rate;
24379
24380 /* Search for producer of independent IMUL instruction. */
24381 for (i = n_ready - 2; i >= 0; i--)
24382 {
24383 insn = ready[i];
24384 if (!NONDEBUG_INSN_P (insn))
24385 continue;
24386 /* Skip IMUL instruction. */
24387 insn2 = PATTERN (insn);
24388 if (GET_CODE (insn2) == PARALLEL)
24389 insn2 = XVECEXP (insn2, 0, 0);
24390 if (GET_CODE (insn2) == SET
24391 && GET_CODE (SET_SRC (insn2)) == MULT
24392 && GET_MODE (SET_SRC (insn2)) == SImode)
24393 continue;
24394
24395 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24396 {
24397 rtx con;
24398 con = DEP_CON (dep);
24399 if (!NONDEBUG_INSN_P (con))
24400 continue;
24401 insn1 = PATTERN (con);
24402 if (GET_CODE (insn1) == PARALLEL)
24403 insn1 = XVECEXP (insn1, 0, 0);
24404
24405 if (GET_CODE (insn1) == SET
24406 && GET_CODE (SET_SRC (insn1)) == MULT
24407 && GET_MODE (SET_SRC (insn1)) == SImode)
24408 {
24409 sd_iterator_def sd_it1;
24410 dep_t dep1;
24411 /* Check if there is no other dependee for IMUL. */
24412 index = i;
24413 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24414 {
24415 rtx pro;
24416 pro = DEP_PRO (dep1);
24417 if (!NONDEBUG_INSN_P (pro))
24418 continue;
24419 if (pro != insn)
24420 index = -1;
24421 }
24422 if (index >= 0)
24423 break;
24424 }
24425 }
24426 if (index >= 0)
24427 break;
24428 }
24429 if (index < 0)
24430 return issue_rate; /* Didn't find IMUL producer. */
24431
24432 if (sched_verbose > 1)
24433 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24434 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24435
24436 /* Put IMUL producer (ready[index]) at the top of ready list. */
24437 insn1 = ready[index];
24438 for (i = index; i < n_ready - 1; i++)
24439 ready[i] = ready[i + 1];
24440 ready[n_ready - 1] = insn1;
24441
24442 return issue_rate;
24443 }
24444
24445 static bool
24446 ix86_class_likely_spilled_p (reg_class_t);
24447
24448 /* Return true if the LHS of INSN is a hard function-argument register; set
24449 *is_spilled to true if that register is likely to be spilled. */
24450 static bool
24451 insn_is_function_arg (rtx insn, bool* is_spilled)
24452 {
24453 rtx dst;
24454
24455 if (!NONDEBUG_INSN_P (insn))
24456 return false;
24457 /* Call instructions are not movable; ignore them. */
24458 if (CALL_P (insn))
24459 return false;
24460 insn = PATTERN (insn);
24461 if (GET_CODE (insn) == PARALLEL)
24462 insn = XVECEXP (insn, 0, 0);
24463 if (GET_CODE (insn) != SET)
24464 return false;
24465 dst = SET_DEST (insn);
24466 if (REG_P (dst) && HARD_REGISTER_P (dst)
24467 && ix86_function_arg_regno_p (REGNO (dst)))
24468 {
24469 /* Is it likely spilled HW register? */
24470 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24471 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24472 *is_spilled = true;
24473 return true;
24474 }
24475 return false;
24476 }
24477
24478 /* Add output dependencies for a chain of adjacent function-argument moves,
24479 but only if there is a move to a likely-spilled HW register. Return the first
24480 argument if at least one dependence was added, or NULL otherwise. */
24481 static rtx
24482 add_parameter_dependencies (rtx call, rtx head)
24483 {
24484 rtx insn;
24485 rtx last = call;
24486 rtx first_arg = NULL;
24487 bool is_spilled = false;
24488
24489 /* Find the argument-passing instruction nearest to the call. */
24490 while (true)
24491 {
24492 last = PREV_INSN (last);
24493 if (last == head)
24494 return NULL;
24495 if (!NONDEBUG_INSN_P (last))
24496 continue;
24497 if (insn_is_function_arg (last, &is_spilled))
24498 break;
24499 return NULL;
24500 }
24501
24502 first_arg = last;
24503 while (true)
24504 {
24505 insn = PREV_INSN (last);
24506 if (!INSN_P (insn))
24507 break;
24508 if (insn == head)
24509 break;
24510 if (!NONDEBUG_INSN_P (insn))
24511 {
24512 last = insn;
24513 continue;
24514 }
24515 if (insn_is_function_arg (insn, &is_spilled))
24516 {
24517 /* Add an output dependence between two function arguments if the chain
24518 of argument moves contains likely-spilled HW registers. */
24519 if (is_spilled)
24520 add_dependence (last, insn, REG_DEP_OUTPUT);
24521 first_arg = last = insn;
24522 }
24523 else
24524 break;
24525 }
24526 if (!is_spilled)
24527 return NULL;
24528 return first_arg;
24529 }
24530
24531 /* Add output or anti dependency from insn to first_arg to restrict its code
24532 motion. */
24533 static void
24534 avoid_func_arg_motion (rtx first_arg, rtx insn)
24535 {
24536 rtx set;
24537 rtx tmp;
24538
24539 set = single_set (insn);
24540 if (!set)
24541 return;
24542 tmp = SET_DEST (set);
24543 if (REG_P (tmp))
24544 {
24545 /* Add output dependency to the first function argument. */
24546 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24547 return;
24548 }
24549 /* Add anti dependency. */
24550 add_dependence (first_arg, insn, REG_DEP_ANTI);
24551 }
24552
24553 /* Avoid cross-block motion of a function argument by adding a dependency
24554 from the first non-jump instruction in BB. */
24555 static void
24556 add_dependee_for_func_arg (rtx arg, basic_block bb)
24557 {
24558 rtx insn = BB_END (bb);
24559
24560 while (insn)
24561 {
24562 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24563 {
24564 rtx set = single_set (insn);
24565 if (set)
24566 {
24567 avoid_func_arg_motion (arg, insn);
24568 return;
24569 }
24570 }
24571 if (insn == BB_HEAD (bb))
24572 return;
24573 insn = PREV_INSN (insn);
24574 }
24575 }
24576
24577 /* Hook for pre-reload schedule - avoid motion of function arguments
24578 passed in likely spilled HW registers. */
24579 static void
24580 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24581 {
24582 rtx insn;
24583 rtx first_arg = NULL;
24584 if (reload_completed)
24585 return;
24586 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24587 if (INSN_P (insn) && CALL_P (insn))
24588 {
24589 first_arg = add_parameter_dependencies (insn, head);
24590 if (first_arg)
24591 {
24592 /* Add a dependee for the first argument to predecessors, but only
24593 if the region contains more than one block. */
24594 basic_block bb = BLOCK_FOR_INSN (insn);
24595 int rgn = CONTAINING_RGN (bb->index);
24596 int nr_blks = RGN_NR_BLOCKS (rgn);
24597 /* Skip trivial regions and region head blocks that can have
24598 predecessors outside of region. */
24599 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24600 {
24601 edge e;
24602 edge_iterator ei;
24603 /* Assume that region is SCC, i.e. all immediate predecessors
24604 of non-head block are in the same region. */
24605 FOR_EACH_EDGE (e, ei, bb->preds)
24606 {
24607 /* Avoid creating loop-carried dependencies by using the
24608 topological ordering of the region. */
24609 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24610 add_dependee_for_func_arg (first_arg, e->src);
24611 }
24612 }
24613 insn = first_arg;
24614 }
24615 }
24616 else if (first_arg)
24617 avoid_func_arg_motion (first_arg, insn);
24618 }
24619
24620 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24621 HW registers to maximum, to schedule them as soon as possible. These are
24622 moves from function argument registers at the top of the function entry
24623 and moves from function return value registers after call. */
24624 static int
24625 ix86_adjust_priority (rtx insn, int priority)
24626 {
24627 rtx set;
24628
24629 if (reload_completed)
24630 return priority;
24631
24632 if (!NONDEBUG_INSN_P (insn))
24633 return priority;
24634
24635 set = single_set (insn);
24636 if (set)
24637 {
24638 rtx tmp = SET_SRC (set);
24639 if (REG_P (tmp)
24640 && HARD_REGISTER_P (tmp)
24641 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24642 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24643 return current_sched_info->sched_max_insns_priority;
24644 }
24645
24646 return priority;
24647 }
24648
24649 /* Model the decoder of Core 2/i7.
24650 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24651 track the instruction fetch block boundaries and make sure that long
24652 (9+ byte) instructions are assigned to D0. */
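/* The parameters below are set up by ix86_sched_init_global when the
   multipass hooks are installed for the Core 2/i7 decoder model.  */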
24653
24654 /* Maximum length of an insn that can be handled by
24655 a secondary decoder unit. '8' for Core 2/i7. */
24656 static int core2i7_secondary_decoder_max_insn_size;
24657
24658 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24659 '16' for Core 2/i7. */
24660 static int core2i7_ifetch_block_size;
24661
24662 /* Maximum number of instructions decoder can handle per cycle.
24663 '6' for Core 2/i7. */
24664 static int core2i7_ifetch_block_max_insns;
24665
24666 typedef struct ix86_first_cycle_multipass_data_ *
24667 ix86_first_cycle_multipass_data_t;
24668 typedef const struct ix86_first_cycle_multipass_data_ *
24669 const_ix86_first_cycle_multipass_data_t;
24670
24671 /* A variable to store target state across calls to max_issue within
24672 one cycle. */
24673 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24674 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24675
24676 /* Initialize DATA. */
24677 static void
24678 core2i7_first_cycle_multipass_init (void *_data)
24679 {
24680 ix86_first_cycle_multipass_data_t data
24681 = (ix86_first_cycle_multipass_data_t) _data;
24682
24683 data->ifetch_block_len = 0;
24684 data->ifetch_block_n_insns = 0;
24685 data->ready_try_change = NULL;
24686 data->ready_try_change_size = 0;
24687 }
24688
24689 /* Advancing the cycle; reset ifetch block counts. */
24690 static void
24691 core2i7_dfa_post_advance_cycle (void)
24692 {
24693 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24694
24695 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24696
24697 data->ifetch_block_len = 0;
24698 data->ifetch_block_n_insns = 0;
24699 }
24700
24701 static int min_insn_size (rtx);
24702
24703 /* Filter out insns from ready_try that the core will not be able to issue
24704 on current cycle due to decoder. */
24705 static void
24706 core2i7_first_cycle_multipass_filter_ready_try
24707 (const_ix86_first_cycle_multipass_data_t data,
24708 char *ready_try, int n_ready, bool first_cycle_insn_p)
24709 {
24710 while (n_ready--)
24711 {
24712 rtx insn;
24713 int insn_size;
24714
24715 if (ready_try[n_ready])
24716 continue;
24717
24718 insn = get_ready_element (n_ready);
24719 insn_size = min_insn_size (insn);
24720
24721 if (/* If this insn is too long for a secondary decoder ... */
24722 (!first_cycle_insn_p
24723 && insn_size > core2i7_secondary_decoder_max_insn_size)
24724 /* ... or it would not fit into the ifetch block ... */
24725 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24726 /* ... or the decoder is full already ... */
24727 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24728 /* ... mask the insn out. */
24729 {
24730 ready_try[n_ready] = 1;
24731
24732 if (data->ready_try_change)
24733 SET_BIT (data->ready_try_change, n_ready);
24734 }
24735 }
24736 }
24737
24738 /* Prepare for a new round of multipass lookahead scheduling. */
24739 static void
24740 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24741 bool first_cycle_insn_p)
24742 {
24743 ix86_first_cycle_multipass_data_t data
24744 = (ix86_first_cycle_multipass_data_t) _data;
24745 const_ix86_first_cycle_multipass_data_t prev_data
24746 = ix86_first_cycle_multipass_data;
24747
24748 /* Restore the state from the end of the previous round. */
24749 data->ifetch_block_len = prev_data->ifetch_block_len;
24750 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24751
24752 /* Filter instructions that cannot be issued on current cycle due to
24753 decoder restrictions. */
24754 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24755 first_cycle_insn_p);
24756 }
24757
24758 /* INSN is being issued in current solution. Account for its impact on
24759 the decoder model. */
24760 static void
24761 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24762 rtx insn, const void *_prev_data)
24763 {
24764 ix86_first_cycle_multipass_data_t data
24765 = (ix86_first_cycle_multipass_data_t) _data;
24766 const_ix86_first_cycle_multipass_data_t prev_data
24767 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24768
24769 int insn_size = min_insn_size (insn);
24770
24771 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24772 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24773 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24774 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24775
24776 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24777 if (!data->ready_try_change)
24778 {
24779 data->ready_try_change = sbitmap_alloc (n_ready);
24780 data->ready_try_change_size = n_ready;
24781 }
24782 else if (data->ready_try_change_size < n_ready)
24783 {
24784 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24785 n_ready, 0);
24786 data->ready_try_change_size = n_ready;
24787 }
24788 sbitmap_zero (data->ready_try_change);
24789
24790 /* Filter out insns from ready_try that the core will not be able to issue
24791 on current cycle due to decoder. */
24792 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24793 false);
24794 }
24795
24796 /* Revert the effect on ready_try. */
24797 static void
24798 core2i7_first_cycle_multipass_backtrack (const void *_data,
24799 char *ready_try,
24800 int n_ready ATTRIBUTE_UNUSED)
24801 {
24802 const_ix86_first_cycle_multipass_data_t data
24803 = (const_ix86_first_cycle_multipass_data_t) _data;
24804 unsigned int i = 0;
24805 sbitmap_iterator sbi;
24806
24807 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24808 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24809 {
24810 ready_try[i] = 0;
24811 }
24812 }
24813
24814 /* Save the result of multipass lookahead scheduling for the next round. */
24815 static void
24816 core2i7_first_cycle_multipass_end (const void *_data)
24817 {
24818 const_ix86_first_cycle_multipass_data_t data
24819 = (const_ix86_first_cycle_multipass_data_t) _data;
24820 ix86_first_cycle_multipass_data_t next_data
24821 = ix86_first_cycle_multipass_data;
24822
24823 if (data != NULL)
24824 {
24825 next_data->ifetch_block_len = data->ifetch_block_len;
24826 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24827 }
24828 }
24829
24830 /* Deallocate target data. */
24831 static void
24832 core2i7_first_cycle_multipass_fini (void *_data)
24833 {
24834 ix86_first_cycle_multipass_data_t data
24835 = (ix86_first_cycle_multipass_data_t) _data;
24836
24837 if (data->ready_try_change)
24838 {
24839 sbitmap_free (data->ready_try_change);
24840 data->ready_try_change = NULL;
24841 data->ready_try_change_size = 0;
24842 }
24843 }
24844
24845 /* Prepare for scheduling pass. */
24846 static void
24847 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24848 int verbose ATTRIBUTE_UNUSED,
24849 int max_uid ATTRIBUTE_UNUSED)
24850 {
24851 /* Install scheduling hooks for current CPU. Some of these hooks are used
24852 in time-critical parts of the scheduler, so we only set them up when
24853 they are actually used. */
24854 switch (ix86_tune)
24855 {
24856 case PROCESSOR_CORE2_32:
24857 case PROCESSOR_CORE2_64:
24858 case PROCESSOR_COREI7_32:
24859 case PROCESSOR_COREI7_64:
24860 /* Do not perform multipass scheduling for pre-reload schedule
24861 to save compile time. */
24862 if (reload_completed)
24863 {
24864 targetm.sched.dfa_post_advance_cycle
24865 = core2i7_dfa_post_advance_cycle;
24866 targetm.sched.first_cycle_multipass_init
24867 = core2i7_first_cycle_multipass_init;
24868 targetm.sched.first_cycle_multipass_begin
24869 = core2i7_first_cycle_multipass_begin;
24870 targetm.sched.first_cycle_multipass_issue
24871 = core2i7_first_cycle_multipass_issue;
24872 targetm.sched.first_cycle_multipass_backtrack
24873 = core2i7_first_cycle_multipass_backtrack;
24874 targetm.sched.first_cycle_multipass_end
24875 = core2i7_first_cycle_multipass_end;
24876 targetm.sched.first_cycle_multipass_fini
24877 = core2i7_first_cycle_multipass_fini;
24878
24879 /* Set decoder parameters. */
24880 core2i7_secondary_decoder_max_insn_size = 8;
24881 core2i7_ifetch_block_size = 16;
24882 core2i7_ifetch_block_max_insns = 6;
24883 break;
24884 }
24885 /* ... Fall through ... */
24886 default:
24887 targetm.sched.dfa_post_advance_cycle = NULL;
24888 targetm.sched.first_cycle_multipass_init = NULL;
24889 targetm.sched.first_cycle_multipass_begin = NULL;
24890 targetm.sched.first_cycle_multipass_issue = NULL;
24891 targetm.sched.first_cycle_multipass_backtrack = NULL;
24892 targetm.sched.first_cycle_multipass_end = NULL;
24893 targetm.sched.first_cycle_multipass_fini = NULL;
24894 break;
24895 }
24896 }
24897
24898 \f
24899 /* Compute the alignment given to a constant that is being placed in memory.
24900 EXP is the constant and ALIGN is the alignment that the object would
24901 ordinarily have.
24902 The value of this function is used instead of that alignment to align
24903 the object. */
24904
24905 int
24906 ix86_constant_alignment (tree exp, int align)
24907 {
24908 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24909 || TREE_CODE (exp) == INTEGER_CST)
24910 {
24911 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24912 return 64;
24913 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24914 return 128;
24915 }
24916 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24917 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24918 return BITS_PER_WORD;
24919
24920 return align;
24921 }
24922
24923 /* Compute the alignment for a static variable.
24924 TYPE is the data type, and ALIGN is the alignment that
24925 the object would ordinarily have. The value of this function is used
24926 instead of that alignment to align the object. */
24927
24928 int
24929 ix86_data_alignment (tree type, int align)
24930 {
24931 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24932
24933 if (AGGREGATE_TYPE_P (type)
24934 && TYPE_SIZE (type)
24935 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24936 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24937 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24938 && align < max_align)
24939 align = max_align;
24940
24941 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
24942 to 16byte boundary. */
24943 if (TARGET_64BIT)
24944 {
24945 if (AGGREGATE_TYPE_P (type)
24946 && TYPE_SIZE (type)
24947 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24948 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24949 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24950 return 128;
24951 }
24952
24953 if (TREE_CODE (type) == ARRAY_TYPE)
24954 {
24955 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24956 return 64;
24957 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24958 return 128;
24959 }
24960 else if (TREE_CODE (type) == COMPLEX_TYPE)
24961 {
24962
24963 if (TYPE_MODE (type) == DCmode && align < 64)
24964 return 64;
24965 if ((TYPE_MODE (type) == XCmode
24966 || TYPE_MODE (type) == TCmode) && align < 128)
24967 return 128;
24968 }
24969 else if ((TREE_CODE (type) == RECORD_TYPE
24970 || TREE_CODE (type) == UNION_TYPE
24971 || TREE_CODE (type) == QUAL_UNION_TYPE)
24972 && TYPE_FIELDS (type))
24973 {
24974 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24975 return 64;
24976 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24977 return 128;
24978 }
24979 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24980 || TREE_CODE (type) == INTEGER_TYPE)
24981 {
24982 if (TYPE_MODE (type) == DFmode && align < 64)
24983 return 64;
24984 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24985 return 128;
24986 }
24987
24988 return align;
24989 }
24990
24991 /* Compute the alignment for a local variable or a stack slot. EXP is
24992 the data type or decl itself, MODE is the widest mode available and
24993 ALIGN is the alignment that the object would ordinarily have. The
24994 value of this macro is used instead of that alignment to align the
24995 object. */
24996
24997 unsigned int
24998 ix86_local_alignment (tree exp, enum machine_mode mode,
24999 unsigned int align)
25000 {
25001 tree type, decl;
25002
25003 if (exp && DECL_P (exp))
25004 {
25005 type = TREE_TYPE (exp);
25006 decl = exp;
25007 }
25008 else
25009 {
25010 type = exp;
25011 decl = NULL;
25012 }
25013
25014 /* Don't do dynamic stack realignment for long long objects with
25015 -mpreferred-stack-boundary=2. */
25016 if (!TARGET_64BIT
25017 && align == 64
25018 && ix86_preferred_stack_boundary < 64
25019 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25020 && (!type || !TYPE_USER_ALIGN (type))
25021 && (!decl || !DECL_USER_ALIGN (decl)))
25022 align = 32;
25023
25024 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25025 register in MODE. We will return the largest alignment of XF
25026 and DF. */
25027 if (!type)
25028 {
25029 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25030 align = GET_MODE_ALIGNMENT (DFmode);
25031 return align;
25032 }
25033
25034 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25035 to 16byte boundary. Exact wording is:
25036
25037 An array uses the same alignment as its elements, except that a local or
25038 global array variable of length at least 16 bytes or
25039 a C99 variable-length array variable always has alignment of at least 16 bytes.
25040
25041 This was added to allow use of aligned SSE instructions on arrays. The
25042 rule is meant for static storage (where the compiler cannot do the analysis
25043 by itself). We follow it for automatic variables only when convenient:
25044 we fully control everything in the function being compiled, and functions
25045 from other units cannot rely on the alignment.
25046
25047 Exclude the va_list type. It is the common case of a local array where
25048 we cannot benefit from the alignment. */
25049 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25050 && TARGET_SSE)
25051 {
25052 if (AGGREGATE_TYPE_P (type)
25053 && (va_list_type_node == NULL_TREE
25054 || (TYPE_MAIN_VARIANT (type)
25055 != TYPE_MAIN_VARIANT (va_list_type_node)))
25056 && TYPE_SIZE (type)
25057 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25058 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25059 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25060 return 128;
25061 }
25062 if (TREE_CODE (type) == ARRAY_TYPE)
25063 {
25064 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25065 return 64;
25066 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25067 return 128;
25068 }
25069 else if (TREE_CODE (type) == COMPLEX_TYPE)
25070 {
25071 if (TYPE_MODE (type) == DCmode && align < 64)
25072 return 64;
25073 if ((TYPE_MODE (type) == XCmode
25074 || TYPE_MODE (type) == TCmode) && align < 128)
25075 return 128;
25076 }
25077 else if ((TREE_CODE (type) == RECORD_TYPE
25078 || TREE_CODE (type) == UNION_TYPE
25079 || TREE_CODE (type) == QUAL_UNION_TYPE)
25080 && TYPE_FIELDS (type))
25081 {
25082 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25083 return 64;
25084 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25085 return 128;
25086 }
25087 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25088 || TREE_CODE (type) == INTEGER_TYPE)
25089 {
25090
25091 if (TYPE_MODE (type) == DFmode && align < 64)
25092 return 64;
25093 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25094 return 128;
25095 }
25096 return align;
25097 }
25098
25099 /* Compute the minimum required alignment for dynamic stack realignment
25100 purposes for a local variable, parameter or a stack slot. EXP is
25101 the data type or decl itself, MODE is its mode and ALIGN is the
25102 alignment that the object would ordinarily have. */
25103
25104 unsigned int
25105 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25106 unsigned int align)
25107 {
25108 tree type, decl;
25109
25110 if (exp && DECL_P (exp))
25111 {
25112 type = TREE_TYPE (exp);
25113 decl = exp;
25114 }
25115 else
25116 {
25117 type = exp;
25118 decl = NULL;
25119 }
25120
25121 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25122 return align;
25123
25124 /* Don't do dynamic stack realignment for long long objects with
25125 -mpreferred-stack-boundary=2. */
25126 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25127 && (!type || !TYPE_USER_ALIGN (type))
25128 && (!decl || !DECL_USER_ALIGN (decl)))
25129 return 32;
25130
25131 return align;
25132 }
25133 \f
25134 /* Find a location for the static chain incoming to a nested function.
25135 This is a register, unless all free registers are used by arguments. */
25136
25137 static rtx
25138 ix86_static_chain (const_tree fndecl, bool incoming_p)
25139 {
25140 unsigned regno;
25141
25142 if (!DECL_STATIC_CHAIN (fndecl))
25143 return NULL;
25144
25145 if (TARGET_64BIT)
25146 {
25147 /* We always use R10 in 64-bit mode. */
25148 regno = R10_REG;
25149 }
25150 else
25151 {
25152 tree fntype;
25153 unsigned int ccvt;
25154
25155 /* By default in 32-bit mode we use ECX to pass the static chain. */
25156 regno = CX_REG;
25157
25158 fntype = TREE_TYPE (fndecl);
25159 ccvt = ix86_get_callcvt (fntype);
25160 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
25161 {
25162 /* Fastcall functions use ecx/edx for arguments, which leaves
25163 us with EAX for the static chain.
25164 Thiscall functions use ecx for arguments, which also
25165 leaves us with EAX for the static chain. */
25166 regno = AX_REG;
25167 }
25168 else if (ix86_function_regparm (fntype, fndecl) == 3)
25169 {
25170 /* For regparm 3, we have no free call-clobbered registers in
25171 which to store the static chain. In order to implement this,
25172 we have the trampoline push the static chain to the stack.
25173 However, we can't push a value below the return address when
25174 we call the nested function directly, so we have to use an
25175 alternate entry point. For this we use ESI, and have the
25176 alternate entry point push ESI, so that things appear the
25177 same once we're executing the nested function. */
25178 if (incoming_p)
25179 {
25180 if (fndecl == current_function_decl)
25181 ix86_static_chain_on_stack = true;
25182 return gen_frame_mem (SImode,
25183 plus_constant (Pmode,
25184 arg_pointer_rtx, -8));
25185 }
25186 regno = SI_REG;
25187 }
25188 }
25189
25190 return gen_rtx_REG (Pmode, regno);
25191 }
25192
25193 /* Emit RTL insns to initialize the variable parts of a trampoline.
25194 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25195 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25196 to be passed to the target function. */
25197
25198 static void
25199 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25200 {
25201 rtx mem, fnaddr;
25202 int opcode;
25203 int offset = 0;
25204
25205 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25206
25207 if (TARGET_64BIT)
25208 {
25209 int size;
25210
25211 /* Load the function address to r11. Try to load address using
25212 the shorter movl instead of movabs. We may want to support
25213 movq for kernel mode, but kernel does not use trampolines at
25214 the moment. FNADDR is a 32bit address and may not be in
25215 DImode when ptr_mode == SImode. Always use movl in this
25216 case. */
25217 if (ptr_mode == SImode
25218 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25219 {
25220 fnaddr = copy_addr_to_reg (fnaddr);
25221
25222 mem = adjust_address (m_tramp, HImode, offset);
25223 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25224
25225 mem = adjust_address (m_tramp, SImode, offset + 2);
25226 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25227 offset += 6;
25228 }
25229 else
25230 {
25231 mem = adjust_address (m_tramp, HImode, offset);
25232 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25233
25234 mem = adjust_address (m_tramp, DImode, offset + 2);
25235 emit_move_insn (mem, fnaddr);
25236 offset += 10;
25237 }
25238
25239 /* Load static chain using movabs to r10. Use the shorter movl
25240 instead of movabs when ptr_mode == SImode. */
25241 if (ptr_mode == SImode)
25242 {
25243 opcode = 0xba41;
25244 size = 6;
25245 }
25246 else
25247 {
25248 opcode = 0xba49;
25249 size = 10;
25250 }
25251
25252 mem = adjust_address (m_tramp, HImode, offset);
25253 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25254
25255 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25256 emit_move_insn (mem, chain_value);
25257 offset += size;
25258
25259 /* Jump to r11; the last (unused) byte is a nop, only there to
25260 pad the write out to a single 32-bit store. */
25261 mem = adjust_address (m_tramp, SImode, offset);
25262 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25263 offset += 4;
25264 }
25265 else
25266 {
25267 rtx disp, chain;
25268
25269 /* Depending on the static chain location, either load a register
25270 with a constant, or push the constant to the stack. All of the
25271 instructions are the same size. */
25272 chain = ix86_static_chain (fndecl, true);
25273 if (REG_P (chain))
25274 {
25275 switch (REGNO (chain))
25276 {
25277 case AX_REG:
25278 opcode = 0xb8; break;
25279 case CX_REG:
25280 opcode = 0xb9; break;
25281 default:
25282 gcc_unreachable ();
25283 }
25284 }
25285 else
25286 opcode = 0x68;
25287
25288 mem = adjust_address (m_tramp, QImode, offset);
25289 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25290
25291 mem = adjust_address (m_tramp, SImode, offset + 1);
25292 emit_move_insn (mem, chain_value);
25293 offset += 5;
25294
25295 mem = adjust_address (m_tramp, QImode, offset);
25296 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25297
25298 mem = adjust_address (m_tramp, SImode, offset + 1);
25299
25300 /* Compute offset from the end of the jmp to the target function.
25301 In the case in which the trampoline stores the static chain on
25302 the stack, we need to skip the first insn which pushes the
25303 (call-saved) register static chain; this push is 1 byte. */
25304 offset += 5;
25305 disp = expand_binop (SImode, sub_optab, fnaddr,
25306 plus_constant (Pmode, XEXP (m_tramp, 0),
25307 offset - (MEM_P (chain) ? 1 : 0)),
25308 NULL_RTX, 1, OPTAB_DIRECT);
25309 emit_move_insn (mem, disp);
25310 }
25311
25312 gcc_assert (offset <= TRAMPOLINE_SIZE);
25313
25314 #ifdef HAVE_ENABLE_EXECUTE_STACK
25315 #ifdef CHECK_EXECUTE_STACK_ENABLED
25316 if (CHECK_EXECUTE_STACK_ENABLED)
25317 #endif
25318 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25319 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25320 #endif
25321 }
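/* Illustrative sketch, not part of the original sources: assuming the
   64-bit movabs path above (ptr_mode == DImode and FNADDR not a
   zero-extended 32-bit immediate), the bytes written into M_TRAMP decode as

       49 bb <fnaddr, 8 bytes>        movabs $fnaddr, %r11
       49 ba <chain_value, 8 bytes>   movabs $chain_value, %r10
       49 ff e3                       jmp    *%r11
       90                             nop    (pads the final 32-bit store)

   for a total of 24 bytes, while the 32-bit path with the static chain in
   %ecx decodes as

       b9 <chain_value, 4 bytes>      movl   $chain_value, %ecx
       e9 <disp, 4 bytes>             jmp    fnaddr   (PC-relative)

   for a total of 10 bytes; both satisfy the TRAMPOLINE_SIZE assertion.  */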
25322 \f
25323 /* The following file contains several enumerations and data structures
25324 built from the definitions in i386-builtin-types.def. */
25325
25326 #include "i386-builtin-types.inc"
25327
25328 /* Table for the ix86 builtin non-function types. */
25329 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25330
25331 /* Retrieve an element from the above table, building some of
25332 the types lazily. */
25333
25334 static tree
25335 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25336 {
25337 unsigned int index;
25338 tree type, itype;
25339
25340 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25341
25342 type = ix86_builtin_type_tab[(int) tcode];
25343 if (type != NULL)
25344 return type;
25345
25346 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25347 if (tcode <= IX86_BT_LAST_VECT)
25348 {
25349 enum machine_mode mode;
25350
25351 index = tcode - IX86_BT_LAST_PRIM - 1;
25352 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25353 mode = ix86_builtin_type_vect_mode[index];
25354
25355 type = build_vector_type_for_mode (itype, mode);
25356 }
25357 else
25358 {
25359 int quals;
25360
25361 index = tcode - IX86_BT_LAST_VECT - 1;
25362 if (tcode <= IX86_BT_LAST_PTR)
25363 quals = TYPE_UNQUALIFIED;
25364 else
25365 quals = TYPE_QUAL_CONST;
25366
25367 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25368 if (quals != TYPE_UNQUALIFIED)
25369 itype = build_qualified_type (itype, quals);
25370
25371 type = build_pointer_type (itype);
25372 }
25373
25374 ix86_builtin_type_tab[(int) tcode] = type;
25375 return type;
25376 }
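/* Illustrative sketch; the concrete codes below are assumptions about the
   generated i386-builtin-types.inc rather than quotes from it.  A vector
   code such as IX86_BT_V4SF would be resolved by indexing
   ix86_builtin_type_vect_base[] and ix86_builtin_type_vect_mode[] at
   IX86_BT_V4SF - IX86_BT_LAST_PRIM - 1, recursing to obtain the float
   element type, and caching build_vector_type_for_mode (float_type_node,
   V4SFmode).  A const-pointer code beyond IX86_BT_LAST_PTR instead
   const-qualifies its base type before calling build_pointer_type.  */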
25377
25378 /* Table for the ix86 builtin function types. */
25379 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25380
25381 /* Retrieve an element from the above table, building some of
25382 the types lazily. */
25383
25384 static tree
25385 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25386 {
25387 tree type;
25388
25389 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25390
25391 type = ix86_builtin_func_type_tab[(int) tcode];
25392 if (type != NULL)
25393 return type;
25394
25395 if (tcode <= IX86_BT_LAST_FUNC)
25396 {
25397 unsigned start = ix86_builtin_func_start[(int) tcode];
25398 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25399 tree rtype, atype, args = void_list_node;
25400 unsigned i;
25401
25402 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25403 for (i = after - 1; i > start; --i)
25404 {
25405 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25406 args = tree_cons (NULL, atype, args);
25407 }
25408
25409 type = build_function_type (rtype, args);
25410 }
25411 else
25412 {
25413 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25414 enum ix86_builtin_func_type icode;
25415
25416 icode = ix86_builtin_func_alias_base[index];
25417 type = ix86_get_builtin_func_type (icode);
25418 }
25419
25420 ix86_builtin_func_type_tab[(int) tcode] = type;
25421 return type;
25422 }
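/* Illustrative sketch; the array contents here are assumed, not quoted from
   the generated tables.  For a two-operand signature such as
   V4SF_FTYPE_V4SF_V4SF, ix86_builtin_func_args would hold three vector type
   codes between start and after: the first entry becomes the return type,
   the loop then walks the remaining entries backwards so the argument list
   ends up in source order and is terminated by void_list_node, and
   build_function_type yields the V4SF (V4SF, V4SF) type used by builtins
   such as __builtin_ia32_addps.  */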
25423
25424
25425 /* Codes for all the SSE/MMX builtins. */
25426 enum ix86_builtins
25427 {
25428 IX86_BUILTIN_ADDPS,
25429 IX86_BUILTIN_ADDSS,
25430 IX86_BUILTIN_DIVPS,
25431 IX86_BUILTIN_DIVSS,
25432 IX86_BUILTIN_MULPS,
25433 IX86_BUILTIN_MULSS,
25434 IX86_BUILTIN_SUBPS,
25435 IX86_BUILTIN_SUBSS,
25436
25437 IX86_BUILTIN_CMPEQPS,
25438 IX86_BUILTIN_CMPLTPS,
25439 IX86_BUILTIN_CMPLEPS,
25440 IX86_BUILTIN_CMPGTPS,
25441 IX86_BUILTIN_CMPGEPS,
25442 IX86_BUILTIN_CMPNEQPS,
25443 IX86_BUILTIN_CMPNLTPS,
25444 IX86_BUILTIN_CMPNLEPS,
25445 IX86_BUILTIN_CMPNGTPS,
25446 IX86_BUILTIN_CMPNGEPS,
25447 IX86_BUILTIN_CMPORDPS,
25448 IX86_BUILTIN_CMPUNORDPS,
25449 IX86_BUILTIN_CMPEQSS,
25450 IX86_BUILTIN_CMPLTSS,
25451 IX86_BUILTIN_CMPLESS,
25452 IX86_BUILTIN_CMPNEQSS,
25453 IX86_BUILTIN_CMPNLTSS,
25454 IX86_BUILTIN_CMPNLESS,
25455 IX86_BUILTIN_CMPNGTSS,
25456 IX86_BUILTIN_CMPNGESS,
25457 IX86_BUILTIN_CMPORDSS,
25458 IX86_BUILTIN_CMPUNORDSS,
25459
25460 IX86_BUILTIN_COMIEQSS,
25461 IX86_BUILTIN_COMILTSS,
25462 IX86_BUILTIN_COMILESS,
25463 IX86_BUILTIN_COMIGTSS,
25464 IX86_BUILTIN_COMIGESS,
25465 IX86_BUILTIN_COMINEQSS,
25466 IX86_BUILTIN_UCOMIEQSS,
25467 IX86_BUILTIN_UCOMILTSS,
25468 IX86_BUILTIN_UCOMILESS,
25469 IX86_BUILTIN_UCOMIGTSS,
25470 IX86_BUILTIN_UCOMIGESS,
25471 IX86_BUILTIN_UCOMINEQSS,
25472
25473 IX86_BUILTIN_CVTPI2PS,
25474 IX86_BUILTIN_CVTPS2PI,
25475 IX86_BUILTIN_CVTSI2SS,
25476 IX86_BUILTIN_CVTSI642SS,
25477 IX86_BUILTIN_CVTSS2SI,
25478 IX86_BUILTIN_CVTSS2SI64,
25479 IX86_BUILTIN_CVTTPS2PI,
25480 IX86_BUILTIN_CVTTSS2SI,
25481 IX86_BUILTIN_CVTTSS2SI64,
25482
25483 IX86_BUILTIN_MAXPS,
25484 IX86_BUILTIN_MAXSS,
25485 IX86_BUILTIN_MINPS,
25486 IX86_BUILTIN_MINSS,
25487
25488 IX86_BUILTIN_LOADUPS,
25489 IX86_BUILTIN_STOREUPS,
25490 IX86_BUILTIN_MOVSS,
25491
25492 IX86_BUILTIN_MOVHLPS,
25493 IX86_BUILTIN_MOVLHPS,
25494 IX86_BUILTIN_LOADHPS,
25495 IX86_BUILTIN_LOADLPS,
25496 IX86_BUILTIN_STOREHPS,
25497 IX86_BUILTIN_STORELPS,
25498
25499 IX86_BUILTIN_MASKMOVQ,
25500 IX86_BUILTIN_MOVMSKPS,
25501 IX86_BUILTIN_PMOVMSKB,
25502
25503 IX86_BUILTIN_MOVNTPS,
25504 IX86_BUILTIN_MOVNTQ,
25505
25506 IX86_BUILTIN_LOADDQU,
25507 IX86_BUILTIN_STOREDQU,
25508
25509 IX86_BUILTIN_PACKSSWB,
25510 IX86_BUILTIN_PACKSSDW,
25511 IX86_BUILTIN_PACKUSWB,
25512
25513 IX86_BUILTIN_PADDB,
25514 IX86_BUILTIN_PADDW,
25515 IX86_BUILTIN_PADDD,
25516 IX86_BUILTIN_PADDQ,
25517 IX86_BUILTIN_PADDSB,
25518 IX86_BUILTIN_PADDSW,
25519 IX86_BUILTIN_PADDUSB,
25520 IX86_BUILTIN_PADDUSW,
25521 IX86_BUILTIN_PSUBB,
25522 IX86_BUILTIN_PSUBW,
25523 IX86_BUILTIN_PSUBD,
25524 IX86_BUILTIN_PSUBQ,
25525 IX86_BUILTIN_PSUBSB,
25526 IX86_BUILTIN_PSUBSW,
25527 IX86_BUILTIN_PSUBUSB,
25528 IX86_BUILTIN_PSUBUSW,
25529
25530 IX86_BUILTIN_PAND,
25531 IX86_BUILTIN_PANDN,
25532 IX86_BUILTIN_POR,
25533 IX86_BUILTIN_PXOR,
25534
25535 IX86_BUILTIN_PAVGB,
25536 IX86_BUILTIN_PAVGW,
25537
25538 IX86_BUILTIN_PCMPEQB,
25539 IX86_BUILTIN_PCMPEQW,
25540 IX86_BUILTIN_PCMPEQD,
25541 IX86_BUILTIN_PCMPGTB,
25542 IX86_BUILTIN_PCMPGTW,
25543 IX86_BUILTIN_PCMPGTD,
25544
25545 IX86_BUILTIN_PMADDWD,
25546
25547 IX86_BUILTIN_PMAXSW,
25548 IX86_BUILTIN_PMAXUB,
25549 IX86_BUILTIN_PMINSW,
25550 IX86_BUILTIN_PMINUB,
25551
25552 IX86_BUILTIN_PMULHUW,
25553 IX86_BUILTIN_PMULHW,
25554 IX86_BUILTIN_PMULLW,
25555
25556 IX86_BUILTIN_PSADBW,
25557 IX86_BUILTIN_PSHUFW,
25558
25559 IX86_BUILTIN_PSLLW,
25560 IX86_BUILTIN_PSLLD,
25561 IX86_BUILTIN_PSLLQ,
25562 IX86_BUILTIN_PSRAW,
25563 IX86_BUILTIN_PSRAD,
25564 IX86_BUILTIN_PSRLW,
25565 IX86_BUILTIN_PSRLD,
25566 IX86_BUILTIN_PSRLQ,
25567 IX86_BUILTIN_PSLLWI,
25568 IX86_BUILTIN_PSLLDI,
25569 IX86_BUILTIN_PSLLQI,
25570 IX86_BUILTIN_PSRAWI,
25571 IX86_BUILTIN_PSRADI,
25572 IX86_BUILTIN_PSRLWI,
25573 IX86_BUILTIN_PSRLDI,
25574 IX86_BUILTIN_PSRLQI,
25575
25576 IX86_BUILTIN_PUNPCKHBW,
25577 IX86_BUILTIN_PUNPCKHWD,
25578 IX86_BUILTIN_PUNPCKHDQ,
25579 IX86_BUILTIN_PUNPCKLBW,
25580 IX86_BUILTIN_PUNPCKLWD,
25581 IX86_BUILTIN_PUNPCKLDQ,
25582
25583 IX86_BUILTIN_SHUFPS,
25584
25585 IX86_BUILTIN_RCPPS,
25586 IX86_BUILTIN_RCPSS,
25587 IX86_BUILTIN_RSQRTPS,
25588 IX86_BUILTIN_RSQRTPS_NR,
25589 IX86_BUILTIN_RSQRTSS,
25590 IX86_BUILTIN_RSQRTF,
25591 IX86_BUILTIN_SQRTPS,
25592 IX86_BUILTIN_SQRTPS_NR,
25593 IX86_BUILTIN_SQRTSS,
25594
25595 IX86_BUILTIN_UNPCKHPS,
25596 IX86_BUILTIN_UNPCKLPS,
25597
25598 IX86_BUILTIN_ANDPS,
25599 IX86_BUILTIN_ANDNPS,
25600 IX86_BUILTIN_ORPS,
25601 IX86_BUILTIN_XORPS,
25602
25603 IX86_BUILTIN_EMMS,
25604 IX86_BUILTIN_LDMXCSR,
25605 IX86_BUILTIN_STMXCSR,
25606 IX86_BUILTIN_SFENCE,
25607
25608 IX86_BUILTIN_FXSAVE,
25609 IX86_BUILTIN_FXRSTOR,
25610 IX86_BUILTIN_FXSAVE64,
25611 IX86_BUILTIN_FXRSTOR64,
25612
25613 IX86_BUILTIN_XSAVE,
25614 IX86_BUILTIN_XRSTOR,
25615 IX86_BUILTIN_XSAVE64,
25616 IX86_BUILTIN_XRSTOR64,
25617
25618 IX86_BUILTIN_XSAVEOPT,
25619 IX86_BUILTIN_XSAVEOPT64,
25620
25621 /* 3DNow! Original */
25622 IX86_BUILTIN_FEMMS,
25623 IX86_BUILTIN_PAVGUSB,
25624 IX86_BUILTIN_PF2ID,
25625 IX86_BUILTIN_PFACC,
25626 IX86_BUILTIN_PFADD,
25627 IX86_BUILTIN_PFCMPEQ,
25628 IX86_BUILTIN_PFCMPGE,
25629 IX86_BUILTIN_PFCMPGT,
25630 IX86_BUILTIN_PFMAX,
25631 IX86_BUILTIN_PFMIN,
25632 IX86_BUILTIN_PFMUL,
25633 IX86_BUILTIN_PFRCP,
25634 IX86_BUILTIN_PFRCPIT1,
25635 IX86_BUILTIN_PFRCPIT2,
25636 IX86_BUILTIN_PFRSQIT1,
25637 IX86_BUILTIN_PFRSQRT,
25638 IX86_BUILTIN_PFSUB,
25639 IX86_BUILTIN_PFSUBR,
25640 IX86_BUILTIN_PI2FD,
25641 IX86_BUILTIN_PMULHRW,
25642
25643 /* 3DNow! Athlon Extensions */
25644 IX86_BUILTIN_PF2IW,
25645 IX86_BUILTIN_PFNACC,
25646 IX86_BUILTIN_PFPNACC,
25647 IX86_BUILTIN_PI2FW,
25648 IX86_BUILTIN_PSWAPDSI,
25649 IX86_BUILTIN_PSWAPDSF,
25650
25651 /* SSE2 */
25652 IX86_BUILTIN_ADDPD,
25653 IX86_BUILTIN_ADDSD,
25654 IX86_BUILTIN_DIVPD,
25655 IX86_BUILTIN_DIVSD,
25656 IX86_BUILTIN_MULPD,
25657 IX86_BUILTIN_MULSD,
25658 IX86_BUILTIN_SUBPD,
25659 IX86_BUILTIN_SUBSD,
25660
25661 IX86_BUILTIN_CMPEQPD,
25662 IX86_BUILTIN_CMPLTPD,
25663 IX86_BUILTIN_CMPLEPD,
25664 IX86_BUILTIN_CMPGTPD,
25665 IX86_BUILTIN_CMPGEPD,
25666 IX86_BUILTIN_CMPNEQPD,
25667 IX86_BUILTIN_CMPNLTPD,
25668 IX86_BUILTIN_CMPNLEPD,
25669 IX86_BUILTIN_CMPNGTPD,
25670 IX86_BUILTIN_CMPNGEPD,
25671 IX86_BUILTIN_CMPORDPD,
25672 IX86_BUILTIN_CMPUNORDPD,
25673 IX86_BUILTIN_CMPEQSD,
25674 IX86_BUILTIN_CMPLTSD,
25675 IX86_BUILTIN_CMPLESD,
25676 IX86_BUILTIN_CMPNEQSD,
25677 IX86_BUILTIN_CMPNLTSD,
25678 IX86_BUILTIN_CMPNLESD,
25679 IX86_BUILTIN_CMPORDSD,
25680 IX86_BUILTIN_CMPUNORDSD,
25681
25682 IX86_BUILTIN_COMIEQSD,
25683 IX86_BUILTIN_COMILTSD,
25684 IX86_BUILTIN_COMILESD,
25685 IX86_BUILTIN_COMIGTSD,
25686 IX86_BUILTIN_COMIGESD,
25687 IX86_BUILTIN_COMINEQSD,
25688 IX86_BUILTIN_UCOMIEQSD,
25689 IX86_BUILTIN_UCOMILTSD,
25690 IX86_BUILTIN_UCOMILESD,
25691 IX86_BUILTIN_UCOMIGTSD,
25692 IX86_BUILTIN_UCOMIGESD,
25693 IX86_BUILTIN_UCOMINEQSD,
25694
25695 IX86_BUILTIN_MAXPD,
25696 IX86_BUILTIN_MAXSD,
25697 IX86_BUILTIN_MINPD,
25698 IX86_BUILTIN_MINSD,
25699
25700 IX86_BUILTIN_ANDPD,
25701 IX86_BUILTIN_ANDNPD,
25702 IX86_BUILTIN_ORPD,
25703 IX86_BUILTIN_XORPD,
25704
25705 IX86_BUILTIN_SQRTPD,
25706 IX86_BUILTIN_SQRTSD,
25707
25708 IX86_BUILTIN_UNPCKHPD,
25709 IX86_BUILTIN_UNPCKLPD,
25710
25711 IX86_BUILTIN_SHUFPD,
25712
25713 IX86_BUILTIN_LOADUPD,
25714 IX86_BUILTIN_STOREUPD,
25715 IX86_BUILTIN_MOVSD,
25716
25717 IX86_BUILTIN_LOADHPD,
25718 IX86_BUILTIN_LOADLPD,
25719
25720 IX86_BUILTIN_CVTDQ2PD,
25721 IX86_BUILTIN_CVTDQ2PS,
25722
25723 IX86_BUILTIN_CVTPD2DQ,
25724 IX86_BUILTIN_CVTPD2PI,
25725 IX86_BUILTIN_CVTPD2PS,
25726 IX86_BUILTIN_CVTTPD2DQ,
25727 IX86_BUILTIN_CVTTPD2PI,
25728
25729 IX86_BUILTIN_CVTPI2PD,
25730 IX86_BUILTIN_CVTSI2SD,
25731 IX86_BUILTIN_CVTSI642SD,
25732
25733 IX86_BUILTIN_CVTSD2SI,
25734 IX86_BUILTIN_CVTSD2SI64,
25735 IX86_BUILTIN_CVTSD2SS,
25736 IX86_BUILTIN_CVTSS2SD,
25737 IX86_BUILTIN_CVTTSD2SI,
25738 IX86_BUILTIN_CVTTSD2SI64,
25739
25740 IX86_BUILTIN_CVTPS2DQ,
25741 IX86_BUILTIN_CVTPS2PD,
25742 IX86_BUILTIN_CVTTPS2DQ,
25743
25744 IX86_BUILTIN_MOVNTI,
25745 IX86_BUILTIN_MOVNTI64,
25746 IX86_BUILTIN_MOVNTPD,
25747 IX86_BUILTIN_MOVNTDQ,
25748
25749 IX86_BUILTIN_MOVQ128,
25750
25751 /* SSE2 MMX */
25752 IX86_BUILTIN_MASKMOVDQU,
25753 IX86_BUILTIN_MOVMSKPD,
25754 IX86_BUILTIN_PMOVMSKB128,
25755
25756 IX86_BUILTIN_PACKSSWB128,
25757 IX86_BUILTIN_PACKSSDW128,
25758 IX86_BUILTIN_PACKUSWB128,
25759
25760 IX86_BUILTIN_PADDB128,
25761 IX86_BUILTIN_PADDW128,
25762 IX86_BUILTIN_PADDD128,
25763 IX86_BUILTIN_PADDQ128,
25764 IX86_BUILTIN_PADDSB128,
25765 IX86_BUILTIN_PADDSW128,
25766 IX86_BUILTIN_PADDUSB128,
25767 IX86_BUILTIN_PADDUSW128,
25768 IX86_BUILTIN_PSUBB128,
25769 IX86_BUILTIN_PSUBW128,
25770 IX86_BUILTIN_PSUBD128,
25771 IX86_BUILTIN_PSUBQ128,
25772 IX86_BUILTIN_PSUBSB128,
25773 IX86_BUILTIN_PSUBSW128,
25774 IX86_BUILTIN_PSUBUSB128,
25775 IX86_BUILTIN_PSUBUSW128,
25776
25777 IX86_BUILTIN_PAND128,
25778 IX86_BUILTIN_PANDN128,
25779 IX86_BUILTIN_POR128,
25780 IX86_BUILTIN_PXOR128,
25781
25782 IX86_BUILTIN_PAVGB128,
25783 IX86_BUILTIN_PAVGW128,
25784
25785 IX86_BUILTIN_PCMPEQB128,
25786 IX86_BUILTIN_PCMPEQW128,
25787 IX86_BUILTIN_PCMPEQD128,
25788 IX86_BUILTIN_PCMPGTB128,
25789 IX86_BUILTIN_PCMPGTW128,
25790 IX86_BUILTIN_PCMPGTD128,
25791
25792 IX86_BUILTIN_PMADDWD128,
25793
25794 IX86_BUILTIN_PMAXSW128,
25795 IX86_BUILTIN_PMAXUB128,
25796 IX86_BUILTIN_PMINSW128,
25797 IX86_BUILTIN_PMINUB128,
25798
25799 IX86_BUILTIN_PMULUDQ,
25800 IX86_BUILTIN_PMULUDQ128,
25801 IX86_BUILTIN_PMULHUW128,
25802 IX86_BUILTIN_PMULHW128,
25803 IX86_BUILTIN_PMULLW128,
25804
25805 IX86_BUILTIN_PSADBW128,
25806 IX86_BUILTIN_PSHUFHW,
25807 IX86_BUILTIN_PSHUFLW,
25808 IX86_BUILTIN_PSHUFD,
25809
25810 IX86_BUILTIN_PSLLDQI128,
25811 IX86_BUILTIN_PSLLWI128,
25812 IX86_BUILTIN_PSLLDI128,
25813 IX86_BUILTIN_PSLLQI128,
25814 IX86_BUILTIN_PSRAWI128,
25815 IX86_BUILTIN_PSRADI128,
25816 IX86_BUILTIN_PSRLDQI128,
25817 IX86_BUILTIN_PSRLWI128,
25818 IX86_BUILTIN_PSRLDI128,
25819 IX86_BUILTIN_PSRLQI128,
25820
25821 IX86_BUILTIN_PSLLDQ128,
25822 IX86_BUILTIN_PSLLW128,
25823 IX86_BUILTIN_PSLLD128,
25824 IX86_BUILTIN_PSLLQ128,
25825 IX86_BUILTIN_PSRAW128,
25826 IX86_BUILTIN_PSRAD128,
25827 IX86_BUILTIN_PSRLW128,
25828 IX86_BUILTIN_PSRLD128,
25829 IX86_BUILTIN_PSRLQ128,
25830
25831 IX86_BUILTIN_PUNPCKHBW128,
25832 IX86_BUILTIN_PUNPCKHWD128,
25833 IX86_BUILTIN_PUNPCKHDQ128,
25834 IX86_BUILTIN_PUNPCKHQDQ128,
25835 IX86_BUILTIN_PUNPCKLBW128,
25836 IX86_BUILTIN_PUNPCKLWD128,
25837 IX86_BUILTIN_PUNPCKLDQ128,
25838 IX86_BUILTIN_PUNPCKLQDQ128,
25839
25840 IX86_BUILTIN_CLFLUSH,
25841 IX86_BUILTIN_MFENCE,
25842 IX86_BUILTIN_LFENCE,
25843 IX86_BUILTIN_PAUSE,
25844
25845 IX86_BUILTIN_BSRSI,
25846 IX86_BUILTIN_BSRDI,
25847 IX86_BUILTIN_RDPMC,
25848 IX86_BUILTIN_RDTSC,
25849 IX86_BUILTIN_RDTSCP,
25850 IX86_BUILTIN_ROLQI,
25851 IX86_BUILTIN_ROLHI,
25852 IX86_BUILTIN_RORQI,
25853 IX86_BUILTIN_RORHI,
25854
25855 /* SSE3. */
25856 IX86_BUILTIN_ADDSUBPS,
25857 IX86_BUILTIN_HADDPS,
25858 IX86_BUILTIN_HSUBPS,
25859 IX86_BUILTIN_MOVSHDUP,
25860 IX86_BUILTIN_MOVSLDUP,
25861 IX86_BUILTIN_ADDSUBPD,
25862 IX86_BUILTIN_HADDPD,
25863 IX86_BUILTIN_HSUBPD,
25864 IX86_BUILTIN_LDDQU,
25865
25866 IX86_BUILTIN_MONITOR,
25867 IX86_BUILTIN_MWAIT,
25868
25869 /* SSSE3. */
25870 IX86_BUILTIN_PHADDW,
25871 IX86_BUILTIN_PHADDD,
25872 IX86_BUILTIN_PHADDSW,
25873 IX86_BUILTIN_PHSUBW,
25874 IX86_BUILTIN_PHSUBD,
25875 IX86_BUILTIN_PHSUBSW,
25876 IX86_BUILTIN_PMADDUBSW,
25877 IX86_BUILTIN_PMULHRSW,
25878 IX86_BUILTIN_PSHUFB,
25879 IX86_BUILTIN_PSIGNB,
25880 IX86_BUILTIN_PSIGNW,
25881 IX86_BUILTIN_PSIGND,
25882 IX86_BUILTIN_PALIGNR,
25883 IX86_BUILTIN_PABSB,
25884 IX86_BUILTIN_PABSW,
25885 IX86_BUILTIN_PABSD,
25886
25887 IX86_BUILTIN_PHADDW128,
25888 IX86_BUILTIN_PHADDD128,
25889 IX86_BUILTIN_PHADDSW128,
25890 IX86_BUILTIN_PHSUBW128,
25891 IX86_BUILTIN_PHSUBD128,
25892 IX86_BUILTIN_PHSUBSW128,
25893 IX86_BUILTIN_PMADDUBSW128,
25894 IX86_BUILTIN_PMULHRSW128,
25895 IX86_BUILTIN_PSHUFB128,
25896 IX86_BUILTIN_PSIGNB128,
25897 IX86_BUILTIN_PSIGNW128,
25898 IX86_BUILTIN_PSIGND128,
25899 IX86_BUILTIN_PALIGNR128,
25900 IX86_BUILTIN_PABSB128,
25901 IX86_BUILTIN_PABSW128,
25902 IX86_BUILTIN_PABSD128,
25903
25904 /* AMDFAM10 - SSE4A New Instructions. */
25905 IX86_BUILTIN_MOVNTSD,
25906 IX86_BUILTIN_MOVNTSS,
25907 IX86_BUILTIN_EXTRQI,
25908 IX86_BUILTIN_EXTRQ,
25909 IX86_BUILTIN_INSERTQI,
25910 IX86_BUILTIN_INSERTQ,
25911
25912 /* SSE4.1. */
25913 IX86_BUILTIN_BLENDPD,
25914 IX86_BUILTIN_BLENDPS,
25915 IX86_BUILTIN_BLENDVPD,
25916 IX86_BUILTIN_BLENDVPS,
25917 IX86_BUILTIN_PBLENDVB128,
25918 IX86_BUILTIN_PBLENDW128,
25919
25920 IX86_BUILTIN_DPPD,
25921 IX86_BUILTIN_DPPS,
25922
25923 IX86_BUILTIN_INSERTPS128,
25924
25925 IX86_BUILTIN_MOVNTDQA,
25926 IX86_BUILTIN_MPSADBW128,
25927 IX86_BUILTIN_PACKUSDW128,
25928 IX86_BUILTIN_PCMPEQQ,
25929 IX86_BUILTIN_PHMINPOSUW128,
25930
25931 IX86_BUILTIN_PMAXSB128,
25932 IX86_BUILTIN_PMAXSD128,
25933 IX86_BUILTIN_PMAXUD128,
25934 IX86_BUILTIN_PMAXUW128,
25935
25936 IX86_BUILTIN_PMINSB128,
25937 IX86_BUILTIN_PMINSD128,
25938 IX86_BUILTIN_PMINUD128,
25939 IX86_BUILTIN_PMINUW128,
25940
25941 IX86_BUILTIN_PMOVSXBW128,
25942 IX86_BUILTIN_PMOVSXBD128,
25943 IX86_BUILTIN_PMOVSXBQ128,
25944 IX86_BUILTIN_PMOVSXWD128,
25945 IX86_BUILTIN_PMOVSXWQ128,
25946 IX86_BUILTIN_PMOVSXDQ128,
25947
25948 IX86_BUILTIN_PMOVZXBW128,
25949 IX86_BUILTIN_PMOVZXBD128,
25950 IX86_BUILTIN_PMOVZXBQ128,
25951 IX86_BUILTIN_PMOVZXWD128,
25952 IX86_BUILTIN_PMOVZXWQ128,
25953 IX86_BUILTIN_PMOVZXDQ128,
25954
25955 IX86_BUILTIN_PMULDQ128,
25956 IX86_BUILTIN_PMULLD128,
25957
25958 IX86_BUILTIN_ROUNDSD,
25959 IX86_BUILTIN_ROUNDSS,
25960
25961 IX86_BUILTIN_ROUNDPD,
25962 IX86_BUILTIN_ROUNDPS,
25963
25964 IX86_BUILTIN_FLOORPD,
25965 IX86_BUILTIN_CEILPD,
25966 IX86_BUILTIN_TRUNCPD,
25967 IX86_BUILTIN_RINTPD,
25968 IX86_BUILTIN_ROUNDPD_AZ,
25969
25970 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25971 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25972 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25973
25974 IX86_BUILTIN_FLOORPS,
25975 IX86_BUILTIN_CEILPS,
25976 IX86_BUILTIN_TRUNCPS,
25977 IX86_BUILTIN_RINTPS,
25978 IX86_BUILTIN_ROUNDPS_AZ,
25979
25980 IX86_BUILTIN_FLOORPS_SFIX,
25981 IX86_BUILTIN_CEILPS_SFIX,
25982 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25983
25984 IX86_BUILTIN_PTESTZ,
25985 IX86_BUILTIN_PTESTC,
25986 IX86_BUILTIN_PTESTNZC,
25987
25988 IX86_BUILTIN_VEC_INIT_V2SI,
25989 IX86_BUILTIN_VEC_INIT_V4HI,
25990 IX86_BUILTIN_VEC_INIT_V8QI,
25991 IX86_BUILTIN_VEC_EXT_V2DF,
25992 IX86_BUILTIN_VEC_EXT_V2DI,
25993 IX86_BUILTIN_VEC_EXT_V4SF,
25994 IX86_BUILTIN_VEC_EXT_V4SI,
25995 IX86_BUILTIN_VEC_EXT_V8HI,
25996 IX86_BUILTIN_VEC_EXT_V2SI,
25997 IX86_BUILTIN_VEC_EXT_V4HI,
25998 IX86_BUILTIN_VEC_EXT_V16QI,
25999 IX86_BUILTIN_VEC_SET_V2DI,
26000 IX86_BUILTIN_VEC_SET_V4SF,
26001 IX86_BUILTIN_VEC_SET_V4SI,
26002 IX86_BUILTIN_VEC_SET_V8HI,
26003 IX86_BUILTIN_VEC_SET_V4HI,
26004 IX86_BUILTIN_VEC_SET_V16QI,
26005
26006 IX86_BUILTIN_VEC_PACK_SFIX,
26007 IX86_BUILTIN_VEC_PACK_SFIX256,
26008
26009 /* SSE4.2. */
26010 IX86_BUILTIN_CRC32QI,
26011 IX86_BUILTIN_CRC32HI,
26012 IX86_BUILTIN_CRC32SI,
26013 IX86_BUILTIN_CRC32DI,
26014
26015 IX86_BUILTIN_PCMPESTRI128,
26016 IX86_BUILTIN_PCMPESTRM128,
26017 IX86_BUILTIN_PCMPESTRA128,
26018 IX86_BUILTIN_PCMPESTRC128,
26019 IX86_BUILTIN_PCMPESTRO128,
26020 IX86_BUILTIN_PCMPESTRS128,
26021 IX86_BUILTIN_PCMPESTRZ128,
26022 IX86_BUILTIN_PCMPISTRI128,
26023 IX86_BUILTIN_PCMPISTRM128,
26024 IX86_BUILTIN_PCMPISTRA128,
26025 IX86_BUILTIN_PCMPISTRC128,
26026 IX86_BUILTIN_PCMPISTRO128,
26027 IX86_BUILTIN_PCMPISTRS128,
26028 IX86_BUILTIN_PCMPISTRZ128,
26029
26030 IX86_BUILTIN_PCMPGTQ,
26031
26032 /* AES instructions */
26033 IX86_BUILTIN_AESENC128,
26034 IX86_BUILTIN_AESENCLAST128,
26035 IX86_BUILTIN_AESDEC128,
26036 IX86_BUILTIN_AESDECLAST128,
26037 IX86_BUILTIN_AESIMC128,
26038 IX86_BUILTIN_AESKEYGENASSIST128,
26039
26040 /* PCLMUL instruction */
26041 IX86_BUILTIN_PCLMULQDQ128,
26042
26043 /* AVX */
26044 IX86_BUILTIN_ADDPD256,
26045 IX86_BUILTIN_ADDPS256,
26046 IX86_BUILTIN_ADDSUBPD256,
26047 IX86_BUILTIN_ADDSUBPS256,
26048 IX86_BUILTIN_ANDPD256,
26049 IX86_BUILTIN_ANDPS256,
26050 IX86_BUILTIN_ANDNPD256,
26051 IX86_BUILTIN_ANDNPS256,
26052 IX86_BUILTIN_BLENDPD256,
26053 IX86_BUILTIN_BLENDPS256,
26054 IX86_BUILTIN_BLENDVPD256,
26055 IX86_BUILTIN_BLENDVPS256,
26056 IX86_BUILTIN_DIVPD256,
26057 IX86_BUILTIN_DIVPS256,
26058 IX86_BUILTIN_DPPS256,
26059 IX86_BUILTIN_HADDPD256,
26060 IX86_BUILTIN_HADDPS256,
26061 IX86_BUILTIN_HSUBPD256,
26062 IX86_BUILTIN_HSUBPS256,
26063 IX86_BUILTIN_MAXPD256,
26064 IX86_BUILTIN_MAXPS256,
26065 IX86_BUILTIN_MINPD256,
26066 IX86_BUILTIN_MINPS256,
26067 IX86_BUILTIN_MULPD256,
26068 IX86_BUILTIN_MULPS256,
26069 IX86_BUILTIN_ORPD256,
26070 IX86_BUILTIN_ORPS256,
26071 IX86_BUILTIN_SHUFPD256,
26072 IX86_BUILTIN_SHUFPS256,
26073 IX86_BUILTIN_SUBPD256,
26074 IX86_BUILTIN_SUBPS256,
26075 IX86_BUILTIN_XORPD256,
26076 IX86_BUILTIN_XORPS256,
26077 IX86_BUILTIN_CMPSD,
26078 IX86_BUILTIN_CMPSS,
26079 IX86_BUILTIN_CMPPD,
26080 IX86_BUILTIN_CMPPS,
26081 IX86_BUILTIN_CMPPD256,
26082 IX86_BUILTIN_CMPPS256,
26083 IX86_BUILTIN_CVTDQ2PD256,
26084 IX86_BUILTIN_CVTDQ2PS256,
26085 IX86_BUILTIN_CVTPD2PS256,
26086 IX86_BUILTIN_CVTPS2DQ256,
26087 IX86_BUILTIN_CVTPS2PD256,
26088 IX86_BUILTIN_CVTTPD2DQ256,
26089 IX86_BUILTIN_CVTPD2DQ256,
26090 IX86_BUILTIN_CVTTPS2DQ256,
26091 IX86_BUILTIN_EXTRACTF128PD256,
26092 IX86_BUILTIN_EXTRACTF128PS256,
26093 IX86_BUILTIN_EXTRACTF128SI256,
26094 IX86_BUILTIN_VZEROALL,
26095 IX86_BUILTIN_VZEROUPPER,
26096 IX86_BUILTIN_VPERMILVARPD,
26097 IX86_BUILTIN_VPERMILVARPS,
26098 IX86_BUILTIN_VPERMILVARPD256,
26099 IX86_BUILTIN_VPERMILVARPS256,
26100 IX86_BUILTIN_VPERMILPD,
26101 IX86_BUILTIN_VPERMILPS,
26102 IX86_BUILTIN_VPERMILPD256,
26103 IX86_BUILTIN_VPERMILPS256,
26104 IX86_BUILTIN_VPERMIL2PD,
26105 IX86_BUILTIN_VPERMIL2PS,
26106 IX86_BUILTIN_VPERMIL2PD256,
26107 IX86_BUILTIN_VPERMIL2PS256,
26108 IX86_BUILTIN_VPERM2F128PD256,
26109 IX86_BUILTIN_VPERM2F128PS256,
26110 IX86_BUILTIN_VPERM2F128SI256,
26111 IX86_BUILTIN_VBROADCASTSS,
26112 IX86_BUILTIN_VBROADCASTSD256,
26113 IX86_BUILTIN_VBROADCASTSS256,
26114 IX86_BUILTIN_VBROADCASTPD256,
26115 IX86_BUILTIN_VBROADCASTPS256,
26116 IX86_BUILTIN_VINSERTF128PD256,
26117 IX86_BUILTIN_VINSERTF128PS256,
26118 IX86_BUILTIN_VINSERTF128SI256,
26119 IX86_BUILTIN_LOADUPD256,
26120 IX86_BUILTIN_LOADUPS256,
26121 IX86_BUILTIN_STOREUPD256,
26122 IX86_BUILTIN_STOREUPS256,
26123 IX86_BUILTIN_LDDQU256,
26124 IX86_BUILTIN_MOVNTDQ256,
26125 IX86_BUILTIN_MOVNTPD256,
26126 IX86_BUILTIN_MOVNTPS256,
26127 IX86_BUILTIN_LOADDQU256,
26128 IX86_BUILTIN_STOREDQU256,
26129 IX86_BUILTIN_MASKLOADPD,
26130 IX86_BUILTIN_MASKLOADPS,
26131 IX86_BUILTIN_MASKSTOREPD,
26132 IX86_BUILTIN_MASKSTOREPS,
26133 IX86_BUILTIN_MASKLOADPD256,
26134 IX86_BUILTIN_MASKLOADPS256,
26135 IX86_BUILTIN_MASKSTOREPD256,
26136 IX86_BUILTIN_MASKSTOREPS256,
26137 IX86_BUILTIN_MOVSHDUP256,
26138 IX86_BUILTIN_MOVSLDUP256,
26139 IX86_BUILTIN_MOVDDUP256,
26140
26141 IX86_BUILTIN_SQRTPD256,
26142 IX86_BUILTIN_SQRTPS256,
26143 IX86_BUILTIN_SQRTPS_NR256,
26144 IX86_BUILTIN_RSQRTPS256,
26145 IX86_BUILTIN_RSQRTPS_NR256,
26146
26147 IX86_BUILTIN_RCPPS256,
26148
26149 IX86_BUILTIN_ROUNDPD256,
26150 IX86_BUILTIN_ROUNDPS256,
26151
26152 IX86_BUILTIN_FLOORPD256,
26153 IX86_BUILTIN_CEILPD256,
26154 IX86_BUILTIN_TRUNCPD256,
26155 IX86_BUILTIN_RINTPD256,
26156 IX86_BUILTIN_ROUNDPD_AZ256,
26157
26158 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26159 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26160 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26161
26162 IX86_BUILTIN_FLOORPS256,
26163 IX86_BUILTIN_CEILPS256,
26164 IX86_BUILTIN_TRUNCPS256,
26165 IX86_BUILTIN_RINTPS256,
26166 IX86_BUILTIN_ROUNDPS_AZ256,
26167
26168 IX86_BUILTIN_FLOORPS_SFIX256,
26169 IX86_BUILTIN_CEILPS_SFIX256,
26170 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26171
26172 IX86_BUILTIN_UNPCKHPD256,
26173 IX86_BUILTIN_UNPCKLPD256,
26174 IX86_BUILTIN_UNPCKHPS256,
26175 IX86_BUILTIN_UNPCKLPS256,
26176
26177 IX86_BUILTIN_SI256_SI,
26178 IX86_BUILTIN_PS256_PS,
26179 IX86_BUILTIN_PD256_PD,
26180 IX86_BUILTIN_SI_SI256,
26181 IX86_BUILTIN_PS_PS256,
26182 IX86_BUILTIN_PD_PD256,
26183
26184 IX86_BUILTIN_VTESTZPD,
26185 IX86_BUILTIN_VTESTCPD,
26186 IX86_BUILTIN_VTESTNZCPD,
26187 IX86_BUILTIN_VTESTZPS,
26188 IX86_BUILTIN_VTESTCPS,
26189 IX86_BUILTIN_VTESTNZCPS,
26190 IX86_BUILTIN_VTESTZPD256,
26191 IX86_BUILTIN_VTESTCPD256,
26192 IX86_BUILTIN_VTESTNZCPD256,
26193 IX86_BUILTIN_VTESTZPS256,
26194 IX86_BUILTIN_VTESTCPS256,
26195 IX86_BUILTIN_VTESTNZCPS256,
26196 IX86_BUILTIN_PTESTZ256,
26197 IX86_BUILTIN_PTESTC256,
26198 IX86_BUILTIN_PTESTNZC256,
26199
26200 IX86_BUILTIN_MOVMSKPD256,
26201 IX86_BUILTIN_MOVMSKPS256,
26202
26203 /* AVX2 */
26204 IX86_BUILTIN_MPSADBW256,
26205 IX86_BUILTIN_PABSB256,
26206 IX86_BUILTIN_PABSW256,
26207 IX86_BUILTIN_PABSD256,
26208 IX86_BUILTIN_PACKSSDW256,
26209 IX86_BUILTIN_PACKSSWB256,
26210 IX86_BUILTIN_PACKUSDW256,
26211 IX86_BUILTIN_PACKUSWB256,
26212 IX86_BUILTIN_PADDB256,
26213 IX86_BUILTIN_PADDW256,
26214 IX86_BUILTIN_PADDD256,
26215 IX86_BUILTIN_PADDQ256,
26216 IX86_BUILTIN_PADDSB256,
26217 IX86_BUILTIN_PADDSW256,
26218 IX86_BUILTIN_PADDUSB256,
26219 IX86_BUILTIN_PADDUSW256,
26220 IX86_BUILTIN_PALIGNR256,
26221 IX86_BUILTIN_AND256I,
26222 IX86_BUILTIN_ANDNOT256I,
26223 IX86_BUILTIN_PAVGB256,
26224 IX86_BUILTIN_PAVGW256,
26225 IX86_BUILTIN_PBLENDVB256,
26226 IX86_BUILTIN_PBLENDVW256,
26227 IX86_BUILTIN_PCMPEQB256,
26228 IX86_BUILTIN_PCMPEQW256,
26229 IX86_BUILTIN_PCMPEQD256,
26230 IX86_BUILTIN_PCMPEQQ256,
26231 IX86_BUILTIN_PCMPGTB256,
26232 IX86_BUILTIN_PCMPGTW256,
26233 IX86_BUILTIN_PCMPGTD256,
26234 IX86_BUILTIN_PCMPGTQ256,
26235 IX86_BUILTIN_PHADDW256,
26236 IX86_BUILTIN_PHADDD256,
26237 IX86_BUILTIN_PHADDSW256,
26238 IX86_BUILTIN_PHSUBW256,
26239 IX86_BUILTIN_PHSUBD256,
26240 IX86_BUILTIN_PHSUBSW256,
26241 IX86_BUILTIN_PMADDUBSW256,
26242 IX86_BUILTIN_PMADDWD256,
26243 IX86_BUILTIN_PMAXSB256,
26244 IX86_BUILTIN_PMAXSW256,
26245 IX86_BUILTIN_PMAXSD256,
26246 IX86_BUILTIN_PMAXUB256,
26247 IX86_BUILTIN_PMAXUW256,
26248 IX86_BUILTIN_PMAXUD256,
26249 IX86_BUILTIN_PMINSB256,
26250 IX86_BUILTIN_PMINSW256,
26251 IX86_BUILTIN_PMINSD256,
26252 IX86_BUILTIN_PMINUB256,
26253 IX86_BUILTIN_PMINUW256,
26254 IX86_BUILTIN_PMINUD256,
26255 IX86_BUILTIN_PMOVMSKB256,
26256 IX86_BUILTIN_PMOVSXBW256,
26257 IX86_BUILTIN_PMOVSXBD256,
26258 IX86_BUILTIN_PMOVSXBQ256,
26259 IX86_BUILTIN_PMOVSXWD256,
26260 IX86_BUILTIN_PMOVSXWQ256,
26261 IX86_BUILTIN_PMOVSXDQ256,
26262 IX86_BUILTIN_PMOVZXBW256,
26263 IX86_BUILTIN_PMOVZXBD256,
26264 IX86_BUILTIN_PMOVZXBQ256,
26265 IX86_BUILTIN_PMOVZXWD256,
26266 IX86_BUILTIN_PMOVZXWQ256,
26267 IX86_BUILTIN_PMOVZXDQ256,
26268 IX86_BUILTIN_PMULDQ256,
26269 IX86_BUILTIN_PMULHRSW256,
26270 IX86_BUILTIN_PMULHUW256,
26271 IX86_BUILTIN_PMULHW256,
26272 IX86_BUILTIN_PMULLW256,
26273 IX86_BUILTIN_PMULLD256,
26274 IX86_BUILTIN_PMULUDQ256,
26275 IX86_BUILTIN_POR256,
26276 IX86_BUILTIN_PSADBW256,
26277 IX86_BUILTIN_PSHUFB256,
26278 IX86_BUILTIN_PSHUFD256,
26279 IX86_BUILTIN_PSHUFHW256,
26280 IX86_BUILTIN_PSHUFLW256,
26281 IX86_BUILTIN_PSIGNB256,
26282 IX86_BUILTIN_PSIGNW256,
26283 IX86_BUILTIN_PSIGND256,
26284 IX86_BUILTIN_PSLLDQI256,
26285 IX86_BUILTIN_PSLLWI256,
26286 IX86_BUILTIN_PSLLW256,
26287 IX86_BUILTIN_PSLLDI256,
26288 IX86_BUILTIN_PSLLD256,
26289 IX86_BUILTIN_PSLLQI256,
26290 IX86_BUILTIN_PSLLQ256,
26291 IX86_BUILTIN_PSRAWI256,
26292 IX86_BUILTIN_PSRAW256,
26293 IX86_BUILTIN_PSRADI256,
26294 IX86_BUILTIN_PSRAD256,
26295 IX86_BUILTIN_PSRLDQI256,
26296 IX86_BUILTIN_PSRLWI256,
26297 IX86_BUILTIN_PSRLW256,
26298 IX86_BUILTIN_PSRLDI256,
26299 IX86_BUILTIN_PSRLD256,
26300 IX86_BUILTIN_PSRLQI256,
26301 IX86_BUILTIN_PSRLQ256,
26302 IX86_BUILTIN_PSUBB256,
26303 IX86_BUILTIN_PSUBW256,
26304 IX86_BUILTIN_PSUBD256,
26305 IX86_BUILTIN_PSUBQ256,
26306 IX86_BUILTIN_PSUBSB256,
26307 IX86_BUILTIN_PSUBSW256,
26308 IX86_BUILTIN_PSUBUSB256,
26309 IX86_BUILTIN_PSUBUSW256,
26310 IX86_BUILTIN_PUNPCKHBW256,
26311 IX86_BUILTIN_PUNPCKHWD256,
26312 IX86_BUILTIN_PUNPCKHDQ256,
26313 IX86_BUILTIN_PUNPCKHQDQ256,
26314 IX86_BUILTIN_PUNPCKLBW256,
26315 IX86_BUILTIN_PUNPCKLWD256,
26316 IX86_BUILTIN_PUNPCKLDQ256,
26317 IX86_BUILTIN_PUNPCKLQDQ256,
26318 IX86_BUILTIN_PXOR256,
26319 IX86_BUILTIN_MOVNTDQA256,
26320 IX86_BUILTIN_VBROADCASTSS_PS,
26321 IX86_BUILTIN_VBROADCASTSS_PS256,
26322 IX86_BUILTIN_VBROADCASTSD_PD256,
26323 IX86_BUILTIN_VBROADCASTSI256,
26324 IX86_BUILTIN_PBLENDD256,
26325 IX86_BUILTIN_PBLENDD128,
26326 IX86_BUILTIN_PBROADCASTB256,
26327 IX86_BUILTIN_PBROADCASTW256,
26328 IX86_BUILTIN_PBROADCASTD256,
26329 IX86_BUILTIN_PBROADCASTQ256,
26330 IX86_BUILTIN_PBROADCASTB128,
26331 IX86_BUILTIN_PBROADCASTW128,
26332 IX86_BUILTIN_PBROADCASTD128,
26333 IX86_BUILTIN_PBROADCASTQ128,
26334 IX86_BUILTIN_VPERMVARSI256,
26335 IX86_BUILTIN_VPERMDF256,
26336 IX86_BUILTIN_VPERMVARSF256,
26337 IX86_BUILTIN_VPERMDI256,
26338 IX86_BUILTIN_VPERMTI256,
26339 IX86_BUILTIN_VEXTRACT128I256,
26340 IX86_BUILTIN_VINSERT128I256,
26341 IX86_BUILTIN_MASKLOADD,
26342 IX86_BUILTIN_MASKLOADQ,
26343 IX86_BUILTIN_MASKLOADD256,
26344 IX86_BUILTIN_MASKLOADQ256,
26345 IX86_BUILTIN_MASKSTORED,
26346 IX86_BUILTIN_MASKSTOREQ,
26347 IX86_BUILTIN_MASKSTORED256,
26348 IX86_BUILTIN_MASKSTOREQ256,
26349 IX86_BUILTIN_PSLLVV4DI,
26350 IX86_BUILTIN_PSLLVV2DI,
26351 IX86_BUILTIN_PSLLVV8SI,
26352 IX86_BUILTIN_PSLLVV4SI,
26353 IX86_BUILTIN_PSRAVV8SI,
26354 IX86_BUILTIN_PSRAVV4SI,
26355 IX86_BUILTIN_PSRLVV4DI,
26356 IX86_BUILTIN_PSRLVV2DI,
26357 IX86_BUILTIN_PSRLVV8SI,
26358 IX86_BUILTIN_PSRLVV4SI,
26359
26360 IX86_BUILTIN_GATHERSIV2DF,
26361 IX86_BUILTIN_GATHERSIV4DF,
26362 IX86_BUILTIN_GATHERDIV2DF,
26363 IX86_BUILTIN_GATHERDIV4DF,
26364 IX86_BUILTIN_GATHERSIV4SF,
26365 IX86_BUILTIN_GATHERSIV8SF,
26366 IX86_BUILTIN_GATHERDIV4SF,
26367 IX86_BUILTIN_GATHERDIV8SF,
26368 IX86_BUILTIN_GATHERSIV2DI,
26369 IX86_BUILTIN_GATHERSIV4DI,
26370 IX86_BUILTIN_GATHERDIV2DI,
26371 IX86_BUILTIN_GATHERDIV4DI,
26372 IX86_BUILTIN_GATHERSIV4SI,
26373 IX86_BUILTIN_GATHERSIV8SI,
26374 IX86_BUILTIN_GATHERDIV4SI,
26375 IX86_BUILTIN_GATHERDIV8SI,
26376
26377 /* Alternate 4 element gather for the vectorizer where
26378 all operands are 32-byte wide. */
26379 IX86_BUILTIN_GATHERALTSIV4DF,
26380 IX86_BUILTIN_GATHERALTDIV8SF,
26381 IX86_BUILTIN_GATHERALTSIV4DI,
26382 IX86_BUILTIN_GATHERALTDIV8SI,
26383
26384 /* TFmode support builtins. */
26385 IX86_BUILTIN_INFQ,
26386 IX86_BUILTIN_HUGE_VALQ,
26387 IX86_BUILTIN_FABSQ,
26388 IX86_BUILTIN_COPYSIGNQ,
26389
26390 /* Vectorizer support builtins. */
26391 IX86_BUILTIN_CPYSGNPS,
26392 IX86_BUILTIN_CPYSGNPD,
26393 IX86_BUILTIN_CPYSGNPS256,
26394 IX86_BUILTIN_CPYSGNPD256,
26395
26396 /* FMA4 instructions. */
26397 IX86_BUILTIN_VFMADDSS,
26398 IX86_BUILTIN_VFMADDSD,
26399 IX86_BUILTIN_VFMADDPS,
26400 IX86_BUILTIN_VFMADDPD,
26401 IX86_BUILTIN_VFMADDPS256,
26402 IX86_BUILTIN_VFMADDPD256,
26403 IX86_BUILTIN_VFMADDSUBPS,
26404 IX86_BUILTIN_VFMADDSUBPD,
26405 IX86_BUILTIN_VFMADDSUBPS256,
26406 IX86_BUILTIN_VFMADDSUBPD256,
26407
26408 /* FMA3 instructions. */
26409 IX86_BUILTIN_VFMADDSS3,
26410 IX86_BUILTIN_VFMADDSD3,
26411
26412 /* XOP instructions. */
26413 IX86_BUILTIN_VPCMOV,
26414 IX86_BUILTIN_VPCMOV_V2DI,
26415 IX86_BUILTIN_VPCMOV_V4SI,
26416 IX86_BUILTIN_VPCMOV_V8HI,
26417 IX86_BUILTIN_VPCMOV_V16QI,
26418 IX86_BUILTIN_VPCMOV_V4SF,
26419 IX86_BUILTIN_VPCMOV_V2DF,
26420 IX86_BUILTIN_VPCMOV256,
26421 IX86_BUILTIN_VPCMOV_V4DI256,
26422 IX86_BUILTIN_VPCMOV_V8SI256,
26423 IX86_BUILTIN_VPCMOV_V16HI256,
26424 IX86_BUILTIN_VPCMOV_V32QI256,
26425 IX86_BUILTIN_VPCMOV_V8SF256,
26426 IX86_BUILTIN_VPCMOV_V4DF256,
26427
26428 IX86_BUILTIN_VPPERM,
26429
26430 IX86_BUILTIN_VPMACSSWW,
26431 IX86_BUILTIN_VPMACSWW,
26432 IX86_BUILTIN_VPMACSSWD,
26433 IX86_BUILTIN_VPMACSWD,
26434 IX86_BUILTIN_VPMACSSDD,
26435 IX86_BUILTIN_VPMACSDD,
26436 IX86_BUILTIN_VPMACSSDQL,
26437 IX86_BUILTIN_VPMACSSDQH,
26438 IX86_BUILTIN_VPMACSDQL,
26439 IX86_BUILTIN_VPMACSDQH,
26440 IX86_BUILTIN_VPMADCSSWD,
26441 IX86_BUILTIN_VPMADCSWD,
26442
26443 IX86_BUILTIN_VPHADDBW,
26444 IX86_BUILTIN_VPHADDBD,
26445 IX86_BUILTIN_VPHADDBQ,
26446 IX86_BUILTIN_VPHADDWD,
26447 IX86_BUILTIN_VPHADDWQ,
26448 IX86_BUILTIN_VPHADDDQ,
26449 IX86_BUILTIN_VPHADDUBW,
26450 IX86_BUILTIN_VPHADDUBD,
26451 IX86_BUILTIN_VPHADDUBQ,
26452 IX86_BUILTIN_VPHADDUWD,
26453 IX86_BUILTIN_VPHADDUWQ,
26454 IX86_BUILTIN_VPHADDUDQ,
26455 IX86_BUILTIN_VPHSUBBW,
26456 IX86_BUILTIN_VPHSUBWD,
26457 IX86_BUILTIN_VPHSUBDQ,
26458
26459 IX86_BUILTIN_VPROTB,
26460 IX86_BUILTIN_VPROTW,
26461 IX86_BUILTIN_VPROTD,
26462 IX86_BUILTIN_VPROTQ,
26463 IX86_BUILTIN_VPROTB_IMM,
26464 IX86_BUILTIN_VPROTW_IMM,
26465 IX86_BUILTIN_VPROTD_IMM,
26466 IX86_BUILTIN_VPROTQ_IMM,
26467
26468 IX86_BUILTIN_VPSHLB,
26469 IX86_BUILTIN_VPSHLW,
26470 IX86_BUILTIN_VPSHLD,
26471 IX86_BUILTIN_VPSHLQ,
26472 IX86_BUILTIN_VPSHAB,
26473 IX86_BUILTIN_VPSHAW,
26474 IX86_BUILTIN_VPSHAD,
26475 IX86_BUILTIN_VPSHAQ,
26476
26477 IX86_BUILTIN_VFRCZSS,
26478 IX86_BUILTIN_VFRCZSD,
26479 IX86_BUILTIN_VFRCZPS,
26480 IX86_BUILTIN_VFRCZPD,
26481 IX86_BUILTIN_VFRCZPS256,
26482 IX86_BUILTIN_VFRCZPD256,
26483
26484 IX86_BUILTIN_VPCOMEQUB,
26485 IX86_BUILTIN_VPCOMNEUB,
26486 IX86_BUILTIN_VPCOMLTUB,
26487 IX86_BUILTIN_VPCOMLEUB,
26488 IX86_BUILTIN_VPCOMGTUB,
26489 IX86_BUILTIN_VPCOMGEUB,
26490 IX86_BUILTIN_VPCOMFALSEUB,
26491 IX86_BUILTIN_VPCOMTRUEUB,
26492
26493 IX86_BUILTIN_VPCOMEQUW,
26494 IX86_BUILTIN_VPCOMNEUW,
26495 IX86_BUILTIN_VPCOMLTUW,
26496 IX86_BUILTIN_VPCOMLEUW,
26497 IX86_BUILTIN_VPCOMGTUW,
26498 IX86_BUILTIN_VPCOMGEUW,
26499 IX86_BUILTIN_VPCOMFALSEUW,
26500 IX86_BUILTIN_VPCOMTRUEUW,
26501
26502 IX86_BUILTIN_VPCOMEQUD,
26503 IX86_BUILTIN_VPCOMNEUD,
26504 IX86_BUILTIN_VPCOMLTUD,
26505 IX86_BUILTIN_VPCOMLEUD,
26506 IX86_BUILTIN_VPCOMGTUD,
26507 IX86_BUILTIN_VPCOMGEUD,
26508 IX86_BUILTIN_VPCOMFALSEUD,
26509 IX86_BUILTIN_VPCOMTRUEUD,
26510
26511 IX86_BUILTIN_VPCOMEQUQ,
26512 IX86_BUILTIN_VPCOMNEUQ,
26513 IX86_BUILTIN_VPCOMLTUQ,
26514 IX86_BUILTIN_VPCOMLEUQ,
26515 IX86_BUILTIN_VPCOMGTUQ,
26516 IX86_BUILTIN_VPCOMGEUQ,
26517 IX86_BUILTIN_VPCOMFALSEUQ,
26518 IX86_BUILTIN_VPCOMTRUEUQ,
26519
26520 IX86_BUILTIN_VPCOMEQB,
26521 IX86_BUILTIN_VPCOMNEB,
26522 IX86_BUILTIN_VPCOMLTB,
26523 IX86_BUILTIN_VPCOMLEB,
26524 IX86_BUILTIN_VPCOMGTB,
26525 IX86_BUILTIN_VPCOMGEB,
26526 IX86_BUILTIN_VPCOMFALSEB,
26527 IX86_BUILTIN_VPCOMTRUEB,
26528
26529 IX86_BUILTIN_VPCOMEQW,
26530 IX86_BUILTIN_VPCOMNEW,
26531 IX86_BUILTIN_VPCOMLTW,
26532 IX86_BUILTIN_VPCOMLEW,
26533 IX86_BUILTIN_VPCOMGTW,
26534 IX86_BUILTIN_VPCOMGEW,
26535 IX86_BUILTIN_VPCOMFALSEW,
26536 IX86_BUILTIN_VPCOMTRUEW,
26537
26538 IX86_BUILTIN_VPCOMEQD,
26539 IX86_BUILTIN_VPCOMNED,
26540 IX86_BUILTIN_VPCOMLTD,
26541 IX86_BUILTIN_VPCOMLED,
26542 IX86_BUILTIN_VPCOMGTD,
26543 IX86_BUILTIN_VPCOMGED,
26544 IX86_BUILTIN_VPCOMFALSED,
26545 IX86_BUILTIN_VPCOMTRUED,
26546
26547 IX86_BUILTIN_VPCOMEQQ,
26548 IX86_BUILTIN_VPCOMNEQ,
26549 IX86_BUILTIN_VPCOMLTQ,
26550 IX86_BUILTIN_VPCOMLEQ,
26551 IX86_BUILTIN_VPCOMGTQ,
26552 IX86_BUILTIN_VPCOMGEQ,
26553 IX86_BUILTIN_VPCOMFALSEQ,
26554 IX86_BUILTIN_VPCOMTRUEQ,
26555
26556 /* LWP instructions. */
26557 IX86_BUILTIN_LLWPCB,
26558 IX86_BUILTIN_SLWPCB,
26559 IX86_BUILTIN_LWPVAL32,
26560 IX86_BUILTIN_LWPVAL64,
26561 IX86_BUILTIN_LWPINS32,
26562 IX86_BUILTIN_LWPINS64,
26563
26564 IX86_BUILTIN_CLZS,
26565
26566 /* RTM */
26567 IX86_BUILTIN_XBEGIN,
26568 IX86_BUILTIN_XEND,
26569 IX86_BUILTIN_XABORT,
26570 IX86_BUILTIN_XTEST,
26571
26572 /* BMI instructions. */
26573 IX86_BUILTIN_BEXTR32,
26574 IX86_BUILTIN_BEXTR64,
26575 IX86_BUILTIN_CTZS,
26576
26577 /* TBM instructions. */
26578 IX86_BUILTIN_BEXTRI32,
26579 IX86_BUILTIN_BEXTRI64,
26580
26581 /* BMI2 instructions. */
26582 IX86_BUILTIN_BZHI32,
26583 IX86_BUILTIN_BZHI64,
26584 IX86_BUILTIN_PDEP32,
26585 IX86_BUILTIN_PDEP64,
26586 IX86_BUILTIN_PEXT32,
26587 IX86_BUILTIN_PEXT64,
26588
26589 /* ADX instructions. */
26590 IX86_BUILTIN_ADDCARRYX32,
26591 IX86_BUILTIN_ADDCARRYX64,
26592
26593 /* FSGSBASE instructions. */
26594 IX86_BUILTIN_RDFSBASE32,
26595 IX86_BUILTIN_RDFSBASE64,
26596 IX86_BUILTIN_RDGSBASE32,
26597 IX86_BUILTIN_RDGSBASE64,
26598 IX86_BUILTIN_WRFSBASE32,
26599 IX86_BUILTIN_WRFSBASE64,
26600 IX86_BUILTIN_WRGSBASE32,
26601 IX86_BUILTIN_WRGSBASE64,
26602
26603 /* RDRND instructions. */
26604 IX86_BUILTIN_RDRAND16_STEP,
26605 IX86_BUILTIN_RDRAND32_STEP,
26606 IX86_BUILTIN_RDRAND64_STEP,
26607
26608 /* RDSEED instructions. */
26609 IX86_BUILTIN_RDSEED16_STEP,
26610 IX86_BUILTIN_RDSEED32_STEP,
26611 IX86_BUILTIN_RDSEED64_STEP,
26612
26613 /* F16C instructions. */
26614 IX86_BUILTIN_CVTPH2PS,
26615 IX86_BUILTIN_CVTPH2PS256,
26616 IX86_BUILTIN_CVTPS2PH,
26617 IX86_BUILTIN_CVTPS2PH256,
26618
26619 /* CFString built-in for darwin */
26620 IX86_BUILTIN_CFSTRING,
26621
26622 /* Builtins to get CPU type and supported features. */
26623 IX86_BUILTIN_CPU_INIT,
26624 IX86_BUILTIN_CPU_IS,
26625 IX86_BUILTIN_CPU_SUPPORTS,
26626
26627 IX86_BUILTIN_MAX
26628 };
26629
26630 /* Table for the ix86 builtin decls. */
26631 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26632
26633 /* Table of all of the builtin functions that are possible with different ISAs,
26634 but which are waiting to be built until a function is declared to use that
26635 ISA. */
26636 struct builtin_isa {
26637 const char *name; /* function name */
26638 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26639 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26640 bool const_p; /* true if the declaration is constant */
26641 bool set_and_not_built_p; /* true if the decl is deferred and not yet built */
26642 };
26643
26644 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26645
26646
26647 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26648 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26649 function decl in the ix86_builtins array. Returns the function decl,
26650 or NULL_TREE if the builtin was not added.
26651
26652 If the front end has a special hook for builtin functions, delay adding
26653 builtin functions that aren't in the current ISA until the ISA is changed
26654 with function-specific optimization. Doing so can save about 300K for the
26655 default compiler. When the builtin is expanded, check at that time whether
26656 it is valid.
26657 
26658 If the front end doesn't have a special hook, record all builtins, even
26659 those that aren't in the current ISA, in case the user uses function-specific
26660 options for a different ISA, so that we don't get scope errors if a builtin
26661 is added in the middle of a function scope. */
26662
26663 static inline tree
26664 def_builtin (HOST_WIDE_INT mask, const char *name,
26665 enum ix86_builtin_func_type tcode,
26666 enum ix86_builtins code)
26667 {
26668 tree decl = NULL_TREE;
26669
26670 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26671 {
26672 ix86_builtins_isa[(int) code].isa = mask;
26673
26674 mask &= ~OPTION_MASK_ISA_64BIT;
26675 if (mask == 0
26676 || (mask & ix86_isa_flags) != 0
26677 || (lang_hooks.builtin_function
26678 == lang_hooks.builtin_function_ext_scope))
26679
26680 {
26681 tree type = ix86_get_builtin_func_type (tcode);
26682 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26683 NULL, NULL_TREE);
26684 ix86_builtins[(int) code] = decl;
26685 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26686 }
26687 else
26688 {
26689 ix86_builtins[(int) code] = NULL_TREE;
26690 ix86_builtins_isa[(int) code].tcode = tcode;
26691 ix86_builtins_isa[(int) code].name = name;
26692 ix86_builtins_isa[(int) code].const_p = false;
26693 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26694 }
26695 }
26696
26697 return decl;
26698 }
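/* A minimal usage sketch (the specific name and type code are illustrative
   assumptions, not quotes from the tables below): a registration such as

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_sqrtpd",
                  V2DF_FTYPE_V2DF, IX86_BUILTIN_SQRTPD);

   builds the decl immediately when SSE2 is already enabled in ix86_isa_flags
   (or when the front end registers builtins at file scope), and otherwise
   only records the name and type code in ix86_builtins_isa so that
   ix86_add_new_builtins can materialize the decl later, once the ISA becomes
   available through function-specific options.  */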
26699
26700 /* Like def_builtin, but also marks the function decl "const". */
26701
26702 static inline tree
26703 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26704 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26705 {
26706 tree decl = def_builtin (mask, name, tcode, code);
26707 if (decl)
26708 TREE_READONLY (decl) = 1;
26709 else
26710 ix86_builtins_isa[(int) code].const_p = true;
26711
26712 return decl;
26713 }
26714
26715 /* Add any new builtin functions for a given ISA that may not have been
26716 declared. This saves a bit of space compared to adding all of the
26717 declarations to the tree up front, even if they are never used. */
26718
26719 static void
26720 ix86_add_new_builtins (HOST_WIDE_INT isa)
26721 {
26722 int i;
26723
26724 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26725 {
26726 if ((ix86_builtins_isa[i].isa & isa) != 0
26727 && ix86_builtins_isa[i].set_and_not_built_p)
26728 {
26729 tree decl, type;
26730
26731 /* Don't define the builtin again. */
26732 ix86_builtins_isa[i].set_and_not_built_p = false;
26733
26734 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26735 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26736 type, i, BUILT_IN_MD, NULL,
26737 NULL_TREE);
26738
26739 ix86_builtins[i] = decl;
26740 if (ix86_builtins_isa[i].const_p)
26741 TREE_READONLY (decl) = 1;
26742 }
26743 }
26744 }
26745
26746 /* Bits for builtin_description.flag. */
26747
26748 /* Set when we don't support the comparison natively, and should
26749 swap the comparison operands in order to support it. */
26750 #define BUILTIN_DESC_SWAP_OPERANDS 1
26751
26752 struct builtin_description
26753 {
26754 const HOST_WIDE_INT mask;
26755 const enum insn_code icode;
26756 const char *const name;
26757 const enum ix86_builtins code;
26758 const enum rtx_code comparison;
26759 const int flag;
26760 };
26761
26762 static const struct builtin_description bdesc_comi[] =
26763 {
26764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26767 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26768 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26769 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26770 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26773 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26774 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26775 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26778 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26788 };
26789
26790 static const struct builtin_description bdesc_pcmpestr[] =
26791 {
26792 /* SSE4.2 */
26793 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26794 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26795 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26796 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26798 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26799 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26800 };
26801
26802 static const struct builtin_description bdesc_pcmpistr[] =
26803 {
26804 /* SSE4.2 */
26805 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26806 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26807 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26808 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26810 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26811 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26812 };
26813
26814 /* Special builtins with variable number of arguments. */
26815 static const struct builtin_description bdesc_special_args[] =
26816 {
26817 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26818 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26819 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26820
26821 /* MMX */
26822 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26823
26824 /* 3DNow! */
26825 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26826
26827 /* FXSR, XSAVE and XSAVEOPT */
26828 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26829 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26830 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26831 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26832 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26833
26834 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26835 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26836 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26837 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26838 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26839
26840 /* SSE */
26841 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26842 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26844
26845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26847 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26848 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26849
26850 /* SSE or 3DNow!A */
26851 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26852 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26853
26854 /* SSE2 */
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26860 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26862 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26865
26866 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26868
26869 /* SSE3 */
26870 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26871
26872 /* SSE4.1 */
26873 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26874
26875 /* SSE4A */
26876 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26877 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26878
26879 /* AVX */
26880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26882
26883 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26887 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26888
26889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26896
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26898 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26900
26901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26907 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26908 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26909
26910 /* AVX2 */
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26920
26921 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26922 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26923 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26924 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26925 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26926 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26927
26928 /* FSGSBASE */
26929 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26930 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26931 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26932 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26933 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26934 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26935 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26936 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26937
26938 /* RTM */
26939 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26940 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26941 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26942 };
26943
26944 /* Builtins with variable number of arguments. */
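/* Each row gives the ISA option mask the builtin requires, the insn code
   used to expand it, the builtin's C-level name, its IX86_BUILTIN_* code,
   an rtx comparison code (or UNKNOWN), and its prototype as an
   ix86_builtin_func_type cast to int.  The rows are registered by
   ix86_init_mmx_sse_builtins and expanded by ix86_expand_args_builtin;
   function types suffixed with _COUNT, _SWAP, _CONVERT, _ROUND or
   _VEC_MERGE get special operand handling there (shift counts, swapped
   operands, operand-mode conversion, an extra rounding-mode immediate
   and vec_merge expansion); see that function for the details.  */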
26945 static const struct builtin_description bdesc_args[] =
26946 {
26947 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26948 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26949 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26950 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26951 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26952 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26953 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26954
26955 /* MMX */
26956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26959 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26960 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26962
26963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26966 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26969 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26971
26972 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26974
26975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26979
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26984 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26986
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26993
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26997
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26999
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27006
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27013
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27018
27019 /* 3DNow! */
27020 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27021 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27022 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27023 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27024
27025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27028 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27031 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27032 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27035 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27036 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27037 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27038 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27039 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27040
27041 /* 3DNow!A */
27042 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27043 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27044 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27045 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27046 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27047 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27048
27049 /* SSE */
27050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27052 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27053 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27054 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27056 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27058 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27059 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27060 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27061 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27062
27063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27064
27065 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27066 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27067 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27071 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27073
27074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27077 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27096
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27106
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27108
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27114
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27117 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27118
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27120
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27124
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27127
27128 /* SSE MMX or 3DNow!A */
27129 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27130 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27131 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27132
27133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27134 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27135 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27136 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27137
27138 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27139 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27140
27141 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27142
27143 /* SSE2 */
27144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27145
27146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27149 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27150 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27151
27152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27155 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27156 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27157
27158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27159
27160 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27161 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27162 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27163 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27164
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27166 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27167 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27168
27169 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27170 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27172 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27175 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27177
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27198
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27203
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27208
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27210
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27212 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27214
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27216
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27225
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27234
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27237
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27242
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27245
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27252
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27257
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27266
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27270
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27273
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27276
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27278
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27280 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27283
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27291
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27299
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27304
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27308
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27310
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27312
27313 /* SSE2 MMX */
27314 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27316
27317 /* SSE3 */
27318 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27319 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27320
27321 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27322 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27323 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27324 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27325 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27326 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27327
27328 /* SSSE3 */
27329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27330 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27333 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27335
27336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27339 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27355 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27356 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27358 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27359 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27360
27361 /* SSSE3. */
27362 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27364
27365 /* SSE4.1 */
27366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27374 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27376
27377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27388 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27390
27391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27394 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27403
27404 /* SSE4.1 */
27405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27406 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27407 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27408 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27409
27410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27412 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27413 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27414
27415 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27416 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27417
27418 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27419 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27420
27421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27422 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27423 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27424 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27425
27426 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27427 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27428
27429 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27430 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27431
27432 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27433 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27434 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27435
27436 /* SSE4.2 */
27437 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27438 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27439 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27440 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27441 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27442
27443 /* SSE4A */
27444 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27445 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27446 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27447 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27448
27449 /* AES */
27450 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27452
27453 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27454 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27455 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27456 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27457
27458 /* PCLMUL */
27459 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27460
27461 /* AVX */
27462 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27463 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27466 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27467 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27470 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27474 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27479 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27482 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27485 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27486 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27487 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27488
27489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27490 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27491 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27493
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27528
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27532
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27538
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27540
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27543
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27548
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27551
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27554
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27559
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27562
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27565
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27570
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27577
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27593
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27596
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27599
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27601
27602 /* AVX2 */
27603 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27604 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27605 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27606 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27609 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27610 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27611 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27612 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27613 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27614 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27615 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3, "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3, "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3, "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3, "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2, "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2, "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2, "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2, "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2, "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2, "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2, "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2, "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2, "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2, "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3, "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256", IX86_BUILTIN_PMULHUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256", IX86_BUILTIN_PMULHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256", IX86_BUILTIN_PMULLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256", IX86_BUILTIN_PMULLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27749
27750 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27751
27752 /* BMI */
27753 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27754 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27755 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27756
27757 /* TBM */
27758 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27759 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27760
27761 /* F16C */
27762 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27763 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27764 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27765 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27766
27767 /* BMI2 */
27768 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27769 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27770 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27771 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27772 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27773 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27774 };
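
/* The table above describes the "args" builtins (bdesc_args): each entry
   records the ISA mask that must be enabled for the builtin to exist, the
   insn code used to expand it, the user-visible builtin name, the
   IX86_BUILTIN_* code, an optional comparison/rounding sub-code and the
   function prototype index.  The entries are registered from
   ix86_init_mmx_sse_builtins and expanded via ix86_expand_args_builtin.

   Illustrative sketch only, not part of this file: user code normally
   reaches these builtins through the wrappers in the *intrin.h headers.
   For instance, _mm256_add_pd forwards to __builtin_ia32_addpd256, so
   something like

     #include <immintrin.h>

     __m256d
     add_pd256 (__m256d a, __m256d b)
     {
       return _mm256_add_pd (a, b);
     }

   (add_pd256 being just an example name) ends up going through the
   IX86_BUILTIN_ADDPD256 entry above.  */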
27775
27776 /* FMA4 and XOP. */
27777 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27778 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27779 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27780 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27781 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27782 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27783 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27784 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27785 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27786 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27787 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27788 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27789 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27790 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27791 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27792 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27793 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27794 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27795 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27796 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27797 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27798 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27799 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27800 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27801 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27802 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27803 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27804 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27805 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27806 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27807 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27808 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27809 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27810 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27811 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27812 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27813 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27814 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27815 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27816 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27817 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27818 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27819 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27820 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27821 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27822 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27823 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27824 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27825 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27826 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27827 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27828 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
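
/* The MULTI_ARG_* macros above are merely short aliases for the
   corresponding ix86_builtin_func_type enumerators; they carry no extra
   semantics and only keep the FMA4/XOP table below within a readable
   line length.  */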
27829
27830 static const struct builtin_description bdesc_multi_arg[] =
27831 {
27832 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27833 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27834 UNKNOWN, (int)MULTI_ARG_3_SF },
27835 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27836 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27837 UNKNOWN, (int)MULTI_ARG_3_DF },
27838
27839 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27840 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27841 UNKNOWN, (int)MULTI_ARG_3_SF },
27842 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27843 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27844 UNKNOWN, (int)MULTI_ARG_3_DF },
27845
27846 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27847 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27848 UNKNOWN, (int)MULTI_ARG_3_SF },
27849 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27850 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27851 UNKNOWN, (int)MULTI_ARG_3_DF },
27852 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27853 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27854 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27855 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27856 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27857 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27858
27859 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27860 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27861 UNKNOWN, (int)MULTI_ARG_3_SF },
27862 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27863 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27864 UNKNOWN, (int)MULTI_ARG_3_DF },
27865 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27866 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27867 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27868 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27869 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27870 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27871
27872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27879
27880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27887
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27889
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27902
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27919
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27926
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27942
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27950
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27958
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27966
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27974
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27982
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27990
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27998
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28006
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28015
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28024
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28029
28030 };
28031 \f
28032 /* TM vector builtins. */
28033
28034 /* Reuse the existing x86-specific `struct builtin_description' because
28035 we're lazy. Add casts to make them fit. */
28036 static const struct builtin_description bdesc_tm[] =
28037 {
28038 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28039 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28040 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28041 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28042 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28043 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28044 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28045
28046 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28047 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28048 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28049 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28050 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28051 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28052 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28053
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28059 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28060 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28061
28062 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28063 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28064 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28065 };
28066
28067 /* TM callbacks. */
28068
28069 /* Return the builtin decl needed to load a vector of TYPE. */
28070
28071 static tree
28072 ix86_builtin_tm_load (tree type)
28073 {
28074 if (TREE_CODE (type) == VECTOR_TYPE)
28075 {
28076 switch (tree_low_cst (TYPE_SIZE (type), 1))
28077 {
28078 case 64:
28079 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28080 case 128:
28081 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28082 case 256:
28083 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28084 }
28085 }
28086 return NULL_TREE;
28087 }
28088
28089 /* Return the builtin decl needed to store a vector of TYPE. */
28090
28091 static tree
28092 ix86_builtin_tm_store (tree type)
28093 {
28094 if (TREE_CODE (type) == VECTOR_TYPE)
28095 {
28096 switch (tree_low_cst (TYPE_SIZE (type), 1))
28097 {
28098 case 64:
28099 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28100 case 128:
28101 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28102 case 256:
28103 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28104 }
28105 }
28106 return NULL_TREE;
28107 }
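/* Minimal usage sketch for the two hooks above (hypothetical caller, not
   part of this file): a 128-bit vector type resolves to the M128 variants
   registered in bdesc_tm, e.g.

     tree v4sf = build_vector_type (float_type_node, 4);
     tree load = ix86_builtin_tm_load (v4sf);
     tree store = ix86_builtin_tm_store (v4sf);

   LOAD and STORE are then the decls installed for BUILT_IN_TM_LOAD_M128
   and BUILT_IN_TM_STORE_M128, i.e. "__builtin__ITM_RM128" and
   "__builtin__ITM_WM128".  */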
28108 \f
28109 /* Initialize the transactional memory vector load/store builtins. */
28110
28111 static void
28112 ix86_init_tm_builtins (void)
28113 {
28114 enum ix86_builtin_func_type ftype;
28115 const struct builtin_description *d;
28116 size_t i;
28117 tree decl;
28118 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28119 tree attrs_log, attrs_type_log;
28120
28121 if (!flag_tm)
28122 return;
28123
28124 /* If there are no builtins defined, we must be compiling in a
28125 language without trans-mem support. */
28126 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28127 return;
28128
28129 /* Use whatever attributes a normal TM load has. */
28130 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28131 attrs_load = DECL_ATTRIBUTES (decl);
28132 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28133 /* Use whatever attributes a normal TM store has. */
28134 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28135 attrs_store = DECL_ATTRIBUTES (decl);
28136 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28137 /* Use whatever attributes a normal TM log has. */
28138 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28139 attrs_log = DECL_ATTRIBUTES (decl);
28140 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28141
28142 for (i = 0, d = bdesc_tm;
28143 i < ARRAY_SIZE (bdesc_tm);
28144 i++, d++)
28145 {
28146 if ((d->mask & ix86_isa_flags) != 0
28147 || (lang_hooks.builtin_function
28148 == lang_hooks.builtin_function_ext_scope))
28149 {
28150 tree type, attrs, attrs_type;
28151 enum built_in_function code = (enum built_in_function) d->code;
28152
28153 ftype = (enum ix86_builtin_func_type) d->flag;
28154 type = ix86_get_builtin_func_type (ftype);
28155
28156 if (BUILTIN_TM_LOAD_P (code))
28157 {
28158 attrs = attrs_load;
28159 attrs_type = attrs_type_load;
28160 }
28161 else if (BUILTIN_TM_STORE_P (code))
28162 {
28163 attrs = attrs_store;
28164 attrs_type = attrs_type_store;
28165 }
28166 else
28167 {
28168 attrs = attrs_log;
28169 attrs_type = attrs_type_log;
28170 }
28171 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28172 /* The builtin without the prefix for
28173 calling it directly. */
28174 d->name + strlen ("__builtin_"),
28175 attrs);
28176 /* add_builtin_function() will set the DECL_ATTRIBUTES; now
28177 set the TYPE_ATTRIBUTES. */
28178 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28179
28180 set_builtin_decl (code, decl, false);
28181 }
28182 }
28183 }
28184
28185 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
28186 in the current target ISA, to allow the user to compile particular modules
28187 with target-specific options that differ from the command-line
28188 options. */
28189 static void
28190 ix86_init_mmx_sse_builtins (void)
28191 {
28192 const struct builtin_description * d;
28193 enum ix86_builtin_func_type ftype;
28194 size_t i;
28195
28196 /* Add all special builtins with variable number of operands. */
28197 for (i = 0, d = bdesc_special_args;
28198 i < ARRAY_SIZE (bdesc_special_args);
28199 i++, d++)
28200 {
28201 if (d->name == 0)
28202 continue;
28203
28204 ftype = (enum ix86_builtin_func_type) d->flag;
28205 def_builtin (d->mask, d->name, ftype, d->code);
28206 }
28207
28208 /* Add all builtins with variable number of operands. */
28209 for (i = 0, d = bdesc_args;
28210 i < ARRAY_SIZE (bdesc_args);
28211 i++, d++)
28212 {
28213 if (d->name == 0)
28214 continue;
28215
28216 ftype = (enum ix86_builtin_func_type) d->flag;
28217 def_builtin_const (d->mask, d->name, ftype, d->code);
28218 }
28219
28220 /* pcmpestr[im] insns. */
28221 for (i = 0, d = bdesc_pcmpestr;
28222 i < ARRAY_SIZE (bdesc_pcmpestr);
28223 i++, d++)
28224 {
28225 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28226 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28227 else
28228 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28229 def_builtin_const (d->mask, d->name, ftype, d->code);
28230 }
28231
28232 /* pcmpistr[im] insns. */
28233 for (i = 0, d = bdesc_pcmpistr;
28234 i < ARRAY_SIZE (bdesc_pcmpistr);
28235 i++, d++)
28236 {
28237 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28238 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28239 else
28240 ftype = INT_FTYPE_V16QI_V16QI_INT;
28241 def_builtin_const (d->mask, d->name, ftype, d->code);
28242 }
28243
28244 /* comi/ucomi insns. */
28245 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28246 {
28247 if (d->mask == OPTION_MASK_ISA_SSE2)
28248 ftype = INT_FTYPE_V2DF_V2DF;
28249 else
28250 ftype = INT_FTYPE_V4SF_V4SF;
28251 def_builtin_const (d->mask, d->name, ftype, d->code);
28252 }
28253
28254 /* SSE */
28255 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28256 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28257 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28258 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28259
28260 /* SSE or 3DNow!A */
28261 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28262 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28263 IX86_BUILTIN_MASKMOVQ);
28264
28265 /* SSE2 */
28266 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28267 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28268
28269 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28270 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28271 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28272 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28273
28274 /* SSE3. */
28275 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28276 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28277 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28278 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28279
28280 /* AES */
28281 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28282 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28283 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28284 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28285 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28286 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28287 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28288 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28289 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28290 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28291 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28292 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28293
28294 /* PCLMUL */
28295 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28296 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28297
28298 /* RDRND */
28299 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28300 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28301 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28302 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28303 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28304 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28305 IX86_BUILTIN_RDRAND64_STEP);
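  /* Usage sketch (hypothetical user code; the _rdrand*_step intrinsics in
     immintrin.h wrap these): each *_step builtin stores a random value
     through its pointer argument and returns nonzero on success, matching
     the INT_FTYPE_P* signatures above, e.g.

       unsigned int r;
       if (__builtin_ia32_rdrand32_step (&r))
         ... use r ...  */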
28306
28307 /* AVX2 */
28308 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28309 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28310 IX86_BUILTIN_GATHERSIV2DF);
28311
28312 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28313 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28314 IX86_BUILTIN_GATHERSIV4DF);
28315
28316 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28317 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28318 IX86_BUILTIN_GATHERDIV2DF);
28319
28320 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28321 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28322 IX86_BUILTIN_GATHERDIV4DF);
28323
28324 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28325 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28326 IX86_BUILTIN_GATHERSIV4SF);
28327
28328 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28329 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28330 IX86_BUILTIN_GATHERSIV8SF);
28331
28332 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28333 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28334 IX86_BUILTIN_GATHERDIV4SF);
28335
28336 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28337 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28338 IX86_BUILTIN_GATHERDIV8SF);
28339
28340 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28341 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28342 IX86_BUILTIN_GATHERSIV2DI);
28343
28344 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28345 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28346 IX86_BUILTIN_GATHERSIV4DI);
28347
28348 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28349 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28350 IX86_BUILTIN_GATHERDIV2DI);
28351
28352 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28353 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28354 IX86_BUILTIN_GATHERDIV4DI);
28355
28356 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28357 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28358 IX86_BUILTIN_GATHERSIV4SI);
28359
28360 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28361 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28362 IX86_BUILTIN_GATHERSIV8SI);
28363
28364 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28365 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28366 IX86_BUILTIN_GATHERDIV4SI);
28367
28368 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28369 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28370 IX86_BUILTIN_GATHERDIV8SI);
28371
28372 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28373 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28374 IX86_BUILTIN_GATHERALTSIV4DF);
28375
28376 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28377 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28378 IX86_BUILTIN_GATHERALTDIV8SF);
28379
28380 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28381 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28382 IX86_BUILTIN_GATHERALTSIV4DI);
28383
28384 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28385 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28386 IX86_BUILTIN_GATHERALTDIV8SI);
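  /* Usage sketch (hypothetical user code; normally reached through the
     _mm*_i32gather_* / _mm*_i64gather_* intrinsics in avx2intrin.h):

       __v4sf src = ..., mask = ...;
       __v4si idx = ...;
       const float *base = ...;
       __v4sf r = __builtin_ia32_gathersiv4sf (src, base, idx, mask, 4);

     matching the V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT prototype above.  */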
28387
28388 /* RTM. */
28389 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28390 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28391
28392 /* MMX access to the vec_init patterns. */
28393 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28394 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28395
28396 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28397 V4HI_FTYPE_HI_HI_HI_HI,
28398 IX86_BUILTIN_VEC_INIT_V4HI);
28399
28400 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28401 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28402 IX86_BUILTIN_VEC_INIT_V8QI);
28403
28404 /* Access to the vec_extract patterns. */
28405 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28406 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28407 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28408 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28409 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28410 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28411 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28412 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28413 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28414 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28415
28416 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28417 "__builtin_ia32_vec_ext_v4hi",
28418 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28419
28420 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28421 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28422
28423 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28424 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28425
28426 /* Access to the vec_set patterns. */
28427 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28428 "__builtin_ia32_vec_set_v2di",
28429 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28430
28431 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28432 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28433
28434 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28435 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28436
28437 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28438 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28439
28440 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28441 "__builtin_ia32_vec_set_v4hi",
28442 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28443
28444 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28445 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28446
28447 /* RDSEED */
28448 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28449 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28450 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28451 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28452 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28453 "__builtin_ia32_rdseed_di_step",
28454 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28455
28456 /* ADCX */
28457 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28458 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28459 def_builtin (OPTION_MASK_ISA_64BIT,
28460 "__builtin_ia32_addcarryx_u64",
28461 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28462 IX86_BUILTIN_ADDCARRYX64);
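  /* Usage sketch (hypothetical user code; _addcarryx_u32 in adxintrin.h
     wraps the 32-bit variant): carry-in and the two addends go in, the
     sum comes back through the pointer, and the carry-out is returned,
     per UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED above, e.g.

       unsigned int sum;
       unsigned char c = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);  */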
28463
28464 /* Add FMA4 multi-arg instructions. */
28465 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28466 {
28467 if (d->name == 0)
28468 continue;
28469
28470 ftype = (enum ix86_builtin_func_type) d->flag;
28471 def_builtin_const (d->mask, d->name, ftype, d->code);
28472 }
28473 }
28474
28475 /* This builds the processor_model struct type defined in
28476 libgcc/config/i386/cpuinfo.c */
28477
28478 static tree
28479 build_processor_model_struct (void)
28480 {
28481 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
28482 "__cpu_features"};
28483 tree field = NULL_TREE, field_chain = NULL_TREE;
28484 int i;
28485 tree type = make_node (RECORD_TYPE);
28486
28487 /* The first 3 fields are unsigned int. */
28488 for (i = 0; i < 3; ++i)
28489 {
28490 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28491 get_identifier (field_name[i]), unsigned_type_node);
28492 if (field_chain != NULL_TREE)
28493 DECL_CHAIN (field) = field_chain;
28494 field_chain = field;
28495 }
28496
28497 /* The last field is an array of unsigned integers of size one. */
28498 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28499 get_identifier (field_name[3]),
28500 build_array_type (unsigned_type_node,
28501 build_index_type (size_one_node)));
28502 if (field_chain != NULL_TREE)
28503 DECL_CHAIN (field) = field_chain;
28504 field_chain = field;
28505
28506 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
28507 return type;
28508 }
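/* As a sketch, the record built above mirrors the layout declared in
   libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */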
28509
28510 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
28511
28512 static tree
28513 make_var_decl (tree type, const char *name)
28514 {
28515 tree new_decl;
28516
28517 new_decl = build_decl (UNKNOWN_LOCATION,
28518 VAR_DECL,
28519 get_identifier(name),
28520 type);
28521
28522 DECL_EXTERNAL (new_decl) = 1;
28523 TREE_STATIC (new_decl) = 1;
28524 TREE_PUBLIC (new_decl) = 1;
28525 DECL_INITIAL (new_decl) = 0;
28526 DECL_ARTIFICIAL (new_decl) = 0;
28527 DECL_PRESERVE_P (new_decl) = 1;
28528
28529 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
28530 assemble_variable (new_decl, 0, 0, 0);
28531
28532 return new_decl;
28533 }
28534
28535 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
28536 into an integer defined in libgcc/config/i386/cpuinfo.c */
28537
28538 static tree
28539 fold_builtin_cpu (tree fndecl, tree *args)
28540 {
28541 unsigned int i;
28542 enum ix86_builtins fn_code = (enum ix86_builtins)
28543 DECL_FUNCTION_CODE (fndecl);
28544 tree param_string_cst = NULL;
28545
28546 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
28547 enum processor_features
28548 {
28549 F_CMOV = 0,
28550 F_MMX,
28551 F_POPCNT,
28552 F_SSE,
28553 F_SSE2,
28554 F_SSE3,
28555 F_SSSE3,
28556 F_SSE4_1,
28557 F_SSE4_2,
28558 F_AVX,
28559 F_AVX2,
28560 F_MAX
28561 };
28562
28563 /* These are the values for vendor types and CPU types and subtypes
28564 in cpuinfo.c. CPU types and subtypes must have the corresponding
28565 start value subtracted from them. */
28566 enum processor_model
28567 {
28568 M_INTEL = 1,
28569 M_AMD,
28570 M_CPU_TYPE_START,
28571 M_INTEL_ATOM,
28572 M_INTEL_CORE2,
28573 M_INTEL_COREI7,
28574 M_AMDFAM10H,
28575 M_AMDFAM15H,
28576 M_CPU_SUBTYPE_START,
28577 M_INTEL_COREI7_NEHALEM,
28578 M_INTEL_COREI7_WESTMERE,
28579 M_INTEL_COREI7_SANDYBRIDGE,
28580 M_AMDFAM10H_BARCELONA,
28581 M_AMDFAM10H_SHANGHAI,
28582 M_AMDFAM10H_ISTANBUL,
28583 M_AMDFAM15H_BDVER1,
28584 M_AMDFAM15H_BDVER2
28585 };
28586
28587 static struct _arch_names_table
28588 {
28589 const char *const name;
28590 const enum processor_model model;
28591 }
28592 const arch_names_table[] =
28593 {
28594 {"amd", M_AMD},
28595 {"intel", M_INTEL},
28596 {"atom", M_INTEL_ATOM},
28597 {"core2", M_INTEL_CORE2},
28598 {"corei7", M_INTEL_COREI7},
28599 {"nehalem", M_INTEL_COREI7_NEHALEM},
28600 {"westmere", M_INTEL_COREI7_WESTMERE},
28601 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
28602 {"amdfam10h", M_AMDFAM10H},
28603 {"barcelona", M_AMDFAM10H_BARCELONA},
28604 {"shanghai", M_AMDFAM10H_SHANGHAI},
28605 {"istanbul", M_AMDFAM10H_ISTANBUL},
28606 {"amdfam15h", M_AMDFAM15H},
28607 {"bdver1", M_AMDFAM15H_BDVER1},
28608 {"bdver2", M_AMDFAM15H_BDVER2},
28609 };
28610
28611 static struct _isa_names_table
28612 {
28613 const char *const name;
28614 const enum processor_features feature;
28615 }
28616 const isa_names_table[] =
28617 {
28618 {"cmov", F_CMOV},
28619 {"mmx", F_MMX},
28620 {"popcnt", F_POPCNT},
28621 {"sse", F_SSE},
28622 {"sse2", F_SSE2},
28623 {"sse3", F_SSE3},
28624 {"ssse3", F_SSSE3},
28625 {"sse4.1", F_SSE4_1},
28626 {"sse4.2", F_SSE4_2},
28627 {"avx", F_AVX},
28628 {"avx2", F_AVX2}
28629 };
28630
28631 static tree __processor_model_type = NULL_TREE;
28632 static tree __cpu_model_var = NULL_TREE;
28633
28634 if (__processor_model_type == NULL_TREE)
28635 __processor_model_type = build_processor_model_struct ();
28636
28637 if (__cpu_model_var == NULL_TREE)
28638 __cpu_model_var = make_var_decl (__processor_model_type,
28639 "__cpu_model");
28640
28641 gcc_assert ((args != NULL) && (*args != NULL));
28642
28643 param_string_cst = *args;
28644 while (param_string_cst
28645 && TREE_CODE (param_string_cst) != STRING_CST)
28646 {
28647 /* *args must be an expr that can contain other EXPRs leading to a
28648 STRING_CST. */
28649 if (!EXPR_P (param_string_cst))
28650 {
28651 error ("Parameter to builtin must be a string constant or literal");
28652 return integer_zero_node;
28653 }
28654 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
28655 }
28656
28657 gcc_assert (param_string_cst);
28658
28659 if (fn_code == IX86_BUILTIN_CPU_IS)
28660 {
28661 tree ref;
28662 tree field;
28663 unsigned int field_val = 0;
28664 unsigned int NUM_ARCH_NAMES
28665 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
28666
28667 for (i = 0; i < NUM_ARCH_NAMES; i++)
28668 if (strcmp (arch_names_table[i].name,
28669 TREE_STRING_POINTER (param_string_cst)) == 0)
28670 break;
28671
28672 if (i == NUM_ARCH_NAMES)
28673 {
28674 error ("Parameter to builtin not valid: %s",
28675 TREE_STRING_POINTER (param_string_cst));
28676 return integer_zero_node;
28677 }
28678
28679 field = TYPE_FIELDS (__processor_model_type);
28680 field_val = arch_names_table[i].model;
28681
28682 /* CPU types are stored in the next field. */
28683 if (field_val > M_CPU_TYPE_START
28684 && field_val < M_CPU_SUBTYPE_START)
28685 {
28686 field = DECL_CHAIN (field);
28687 field_val -= M_CPU_TYPE_START;
28688 }
28689
28690 /* CPU subtypes are stored in the next field. */
28691 if (field_val > M_CPU_SUBTYPE_START)
28692 {
28693 field = DECL_CHAIN (DECL_CHAIN (field));
28694 field_val -= M_CPU_SUBTYPE_START;
28695 }
28696
28697 /* Get the appropriate field in __cpu_model. */
28698 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28699 field, NULL_TREE);
28700
28701 /* Check the value. */
28702 return build2 (EQ_EXPR, unsigned_type_node, ref,
28703 build_int_cstu (unsigned_type_node, field_val));
28704 }
28705 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28706 {
28707 tree ref;
28708 tree array_elt;
28709 tree field;
28710 unsigned int field_val = 0;
28711 unsigned int NUM_ISA_NAMES
28712 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28713
28714 for (i = 0; i < NUM_ISA_NAMES; i++)
28715 if (strcmp (isa_names_table[i].name,
28716 TREE_STRING_POINTER (param_string_cst)) == 0)
28717 break;
28718
28719 if (i == NUM_ISA_NAMES)
28720 {
28721 error ("Parameter to builtin not valid: %s",
28722 TREE_STRING_POINTER (param_string_cst));
28723 return integer_zero_node;
28724 }
28725
28726 field = TYPE_FIELDS (__processor_model_type);
28727 /* Get the last field, which is __cpu_features. */
28728 while (DECL_CHAIN (field))
28729 field = DECL_CHAIN (field);
28730
28731 /* Get the appropriate field: __cpu_model.__cpu_features */
28732 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28733 field, NULL_TREE);
28734
28735 /* Access the 0th element of __cpu_features array. */
28736 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28737 integer_zero_node, NULL_TREE, NULL_TREE);
28738
28739 field_val = (1 << isa_names_table[i].feature);
28740 /* Return __cpu_model.__cpu_features[0] & field_val */
28741 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28742 build_int_cstu (unsigned_type_node, field_val));
28743 }
28744 gcc_unreachable ();
28745 }
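/* Folding sketch (hypothetical calls, folded into the trees built above):

     __builtin_cpu_is ("corei7")
       -> __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("avx2")
       -> __cpu_model.__cpu_features[0] & (1 << F_AVX2)  */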
28746
28747 static tree
28748 ix86_fold_builtin (tree fndecl, int n_args,
28749 tree *args, bool ignore ATTRIBUTE_UNUSED)
28750 {
28751 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28752 {
28753 enum ix86_builtins fn_code = (enum ix86_builtins)
28754 DECL_FUNCTION_CODE (fndecl);
28755 if (fn_code == IX86_BUILTIN_CPU_IS
28756 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28757 {
28758 gcc_assert (n_args == 1);
28759 return fold_builtin_cpu (fndecl, args);
28760 }
28761 }
28762
28763 #ifdef SUBTARGET_FOLD_BUILTIN
28764 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
28765 #endif
28766
28767 return NULL_TREE;
28768 }
28769
28770 /* Make builtins to detect cpu type and features supported. NAME is
28771 the builtin name, CODE is the builtin code, and FTYPE is the function
28772 type of the builtin. */
28773
28774 static void
28775 make_cpu_type_builtin (const char* name, int code,
28776 enum ix86_builtin_func_type ftype, bool is_const)
28777 {
28778 tree decl;
28779 tree type;
28780
28781 type = ix86_get_builtin_func_type (ftype);
28782 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28783 NULL, NULL_TREE);
28784 gcc_assert (decl != NULL_TREE);
28785 ix86_builtins[(int) code] = decl;
28786 TREE_READONLY (decl) = is_const;
28787 }
28788
28789 /* Make builtins to get CPU type and features supported. The created
28790 builtins are:
28791
28792 __builtin_cpu_init (), to detect cpu type and features,
28793 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28794 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28795 */
28796
28797 static void
28798 ix86_init_platform_type_builtins (void)
28799 {
28800 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28801 INT_FTYPE_VOID, false);
28802 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28803 INT_FTYPE_PCCHAR, true);
28804 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28805 INT_FTYPE_PCCHAR, true);
28806 }
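/* Usage sketch (hypothetical user code): __builtin_cpu_init () fills in
   the CPU model data, and the two query builtins are folded against it
   by fold_builtin_cpu above, e.g.

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("amdfam15h"))
       ...
     if (__builtin_cpu_supports ("sse4.2"))
       ...  */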
28807
28808 /* Internal method for ix86_init_builtins. */
28809
28810 static void
28811 ix86_init_builtins_va_builtins_abi (void)
28812 {
28813 tree ms_va_ref, sysv_va_ref;
28814 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28815 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28816 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28817 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28818
28819 if (!TARGET_64BIT)
28820 return;
28821 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28822 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28823 ms_va_ref = build_reference_type (ms_va_list_type_node);
28824 sysv_va_ref =
28825 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28826
28827 fnvoid_va_end_ms =
28828 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28829 fnvoid_va_start_ms =
28830 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28831 fnvoid_va_end_sysv =
28832 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28833 fnvoid_va_start_sysv =
28834 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28835 NULL_TREE);
28836 fnvoid_va_copy_ms =
28837 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28838 NULL_TREE);
28839 fnvoid_va_copy_sysv =
28840 build_function_type_list (void_type_node, sysv_va_ref,
28841 sysv_va_ref, NULL_TREE);
28842
28843 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28844 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28845 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28846 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28847 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28848 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28849 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28850 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28851 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28852 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28853 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28854 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28855 }
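/* Usage sketch (hypothetical user code on a 64-bit target): the ms_abi
   variants registered above pair with the __builtin_ms_va_list type, e.g.

     __builtin_ms_va_list ap;
     __builtin_ms_va_start (ap, last_named_parm);
     int v = __builtin_va_arg (ap, int);
     __builtin_ms_va_end (ap);  */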
28856
28857 static void
28858 ix86_init_builtin_types (void)
28859 {
28860 tree float128_type_node, float80_type_node;
28861
28862 /* The __float80 type. */
28863 float80_type_node = long_double_type_node;
28864 if (TYPE_MODE (float80_type_node) != XFmode)
28865 {
28866 /* The __float80 type. */
28867 float80_type_node = make_node (REAL_TYPE);
28868
28869 TYPE_PRECISION (float80_type_node) = 80;
28870 layout_type (float80_type_node);
28871 }
28872 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28873
28874 /* The __float128 type. */
28875 float128_type_node = make_node (REAL_TYPE);
28876 TYPE_PRECISION (float128_type_node) = 128;
28877 layout_type (float128_type_node);
28878 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28879
28880 /* This macro is built by i386-builtin-types.awk. */
28881 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28882 }
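/* Usage sketch (hypothetical user code) for the types registered above;
   constants take the w/W suffix for __float80 and q/Q for __float128:

     __float80  x = 1.5w;
     __float128 y = 2.5q;  */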
28883
28884 static void
28885 ix86_init_builtins (void)
28886 {
28887 tree t;
28888
28889 ix86_init_builtin_types ();
28890
28891 /* Builtins to get CPU type and features. */
28892 ix86_init_platform_type_builtins ();
28893
28894 /* TFmode support builtins. */
28895 def_builtin_const (0, "__builtin_infq",
28896 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28897 def_builtin_const (0, "__builtin_huge_valq",
28898 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28899
28900 /* We will expand them to a normal call if SSE isn't available, since
28901 they are used by libgcc. */
28902 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28903 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28904 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28905 TREE_READONLY (t) = 1;
28906 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28907
28908 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28909 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28910 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28911 TREE_READONLY (t) = 1;
28912 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28913
28914 ix86_init_tm_builtins ();
28915 ix86_init_mmx_sse_builtins ();
28916
28917 if (TARGET_LP64)
28918 ix86_init_builtins_va_builtins_abi ();
28919
28920 #ifdef SUBTARGET_INIT_BUILTINS
28921 SUBTARGET_INIT_BUILTINS;
28922 #endif
28923 }
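/* Usage sketch (hypothetical user code) for the TFmode builtins defined
   above; when SSE isn't available, __builtin_fabsq and __builtin_copysignq
   fall back to the __fabstf2 / __copysigntf3 calls in libgcc:

     __float128 inf = __builtin_infq ();
     __float128 mag = __builtin_fabsq (x);
     __float128 r = __builtin_copysignq (mag, y);  */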
28924
28925 /* Return the ix86 builtin for CODE. */
28926
28927 static tree
28928 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28929 {
28930 if (code >= IX86_BUILTIN_MAX)
28931 return error_mark_node;
28932
28933 return ix86_builtins[code];
28934 }
28935
28936 /* Errors in the source file can cause expand_expr to return const0_rtx
28937 where we expect a vector. To avoid crashing, use one of the vector
28938 clear instructions. */
28939 static rtx
28940 safe_vector_operand (rtx x, enum machine_mode mode)
28941 {
28942 if (x == const0_rtx)
28943 x = CONST0_RTX (mode);
28944 return x;
28945 }
28946
28947 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28948
28949 static rtx
28950 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28951 {
28952 rtx pat;
28953 tree arg0 = CALL_EXPR_ARG (exp, 0);
28954 tree arg1 = CALL_EXPR_ARG (exp, 1);
28955 rtx op0 = expand_normal (arg0);
28956 rtx op1 = expand_normal (arg1);
28957 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28958 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28959 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28960
28961 if (VECTOR_MODE_P (mode0))
28962 op0 = safe_vector_operand (op0, mode0);
28963 if (VECTOR_MODE_P (mode1))
28964 op1 = safe_vector_operand (op1, mode1);
28965
28966 if (optimize || !target
28967 || GET_MODE (target) != tmode
28968 || !insn_data[icode].operand[0].predicate (target, tmode))
28969 target = gen_reg_rtx (tmode);
28970
28971 if (GET_MODE (op1) == SImode && mode1 == TImode)
28972 {
28973 rtx x = gen_reg_rtx (V4SImode);
28974 emit_insn (gen_sse2_loadd (x, op1));
28975 op1 = gen_lowpart (TImode, x);
28976 }
28977
28978 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28979 op0 = copy_to_mode_reg (mode0, op0);
28980 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28981 op1 = copy_to_mode_reg (mode1, op1);
28982
28983 pat = GEN_FCN (icode) (target, op0, op1);
28984 if (! pat)
28985 return 0;
28986
28987 emit_insn (pat);
28988
28989 return target;
28990 }
28991
28992 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28993
28994 static rtx
28995 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28996 enum ix86_builtin_func_type m_type,
28997 enum rtx_code sub_code)
28998 {
28999 rtx pat;
29000 int i;
29001 int nargs;
29002 bool comparison_p = false;
29003 bool tf_p = false;
29004 bool last_arg_constant = false;
29005 int num_memory = 0;
29006 struct {
29007 rtx op;
29008 enum machine_mode mode;
29009 } args[4];
29010
29011 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29012
29013 switch (m_type)
29014 {
29015 case MULTI_ARG_4_DF2_DI_I:
29016 case MULTI_ARG_4_DF2_DI_I1:
29017 case MULTI_ARG_4_SF2_SI_I:
29018 case MULTI_ARG_4_SF2_SI_I1:
29019 nargs = 4;
29020 last_arg_constant = true;
29021 break;
29022
29023 case MULTI_ARG_3_SF:
29024 case MULTI_ARG_3_DF:
29025 case MULTI_ARG_3_SF2:
29026 case MULTI_ARG_3_DF2:
29027 case MULTI_ARG_3_DI:
29028 case MULTI_ARG_3_SI:
29029 case MULTI_ARG_3_SI_DI:
29030 case MULTI_ARG_3_HI:
29031 case MULTI_ARG_3_HI_SI:
29032 case MULTI_ARG_3_QI:
29033 case MULTI_ARG_3_DI2:
29034 case MULTI_ARG_3_SI2:
29035 case MULTI_ARG_3_HI2:
29036 case MULTI_ARG_3_QI2:
29037 nargs = 3;
29038 break;
29039
29040 case MULTI_ARG_2_SF:
29041 case MULTI_ARG_2_DF:
29042 case MULTI_ARG_2_DI:
29043 case MULTI_ARG_2_SI:
29044 case MULTI_ARG_2_HI:
29045 case MULTI_ARG_2_QI:
29046 nargs = 2;
29047 break;
29048
29049 case MULTI_ARG_2_DI_IMM:
29050 case MULTI_ARG_2_SI_IMM:
29051 case MULTI_ARG_2_HI_IMM:
29052 case MULTI_ARG_2_QI_IMM:
29053 nargs = 2;
29054 last_arg_constant = true;
29055 break;
29056
29057 case MULTI_ARG_1_SF:
29058 case MULTI_ARG_1_DF:
29059 case MULTI_ARG_1_SF2:
29060 case MULTI_ARG_1_DF2:
29061 case MULTI_ARG_1_DI:
29062 case MULTI_ARG_1_SI:
29063 case MULTI_ARG_1_HI:
29064 case MULTI_ARG_1_QI:
29065 case MULTI_ARG_1_SI_DI:
29066 case MULTI_ARG_1_HI_DI:
29067 case MULTI_ARG_1_HI_SI:
29068 case MULTI_ARG_1_QI_DI:
29069 case MULTI_ARG_1_QI_SI:
29070 case MULTI_ARG_1_QI_HI:
29071 nargs = 1;
29072 break;
29073
29074 case MULTI_ARG_2_DI_CMP:
29075 case MULTI_ARG_2_SI_CMP:
29076 case MULTI_ARG_2_HI_CMP:
29077 case MULTI_ARG_2_QI_CMP:
29078 nargs = 2;
29079 comparison_p = true;
29080 break;
29081
29082 case MULTI_ARG_2_SF_TF:
29083 case MULTI_ARG_2_DF_TF:
29084 case MULTI_ARG_2_DI_TF:
29085 case MULTI_ARG_2_SI_TF:
29086 case MULTI_ARG_2_HI_TF:
29087 case MULTI_ARG_2_QI_TF:
29088 nargs = 2;
29089 tf_p = true;
29090 break;
29091
29092 default:
29093 gcc_unreachable ();
29094 }
29095
29096 if (optimize || !target
29097 || GET_MODE (target) != tmode
29098 || !insn_data[icode].operand[0].predicate (target, tmode))
29099 target = gen_reg_rtx (tmode);
29100
29101 gcc_assert (nargs <= 4);
29102
29103 for (i = 0; i < nargs; i++)
29104 {
29105 tree arg = CALL_EXPR_ARG (exp, i);
29106 rtx op = expand_normal (arg);
29107 int adjust = (comparison_p) ? 1 : 0;
29108 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
29109
29110 if (last_arg_constant && i == nargs - 1)
29111 {
29112 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
29113 {
29114 enum insn_code new_icode = icode;
29115 switch (icode)
29116 {
29117 case CODE_FOR_xop_vpermil2v2df3:
29118 case CODE_FOR_xop_vpermil2v4sf3:
29119 case CODE_FOR_xop_vpermil2v4df3:
29120 case CODE_FOR_xop_vpermil2v8sf3:
29121 error ("the last argument must be a 2-bit immediate");
29122 return gen_reg_rtx (tmode);
29123 case CODE_FOR_xop_rotlv2di3:
29124 new_icode = CODE_FOR_rotlv2di3;
29125 goto xop_rotl;
29126 case CODE_FOR_xop_rotlv4si3:
29127 new_icode = CODE_FOR_rotlv4si3;
29128 goto xop_rotl;
29129 case CODE_FOR_xop_rotlv8hi3:
29130 new_icode = CODE_FOR_rotlv8hi3;
29131 goto xop_rotl;
29132 case CODE_FOR_xop_rotlv16qi3:
29133 new_icode = CODE_FOR_rotlv16qi3;
29134 xop_rotl:
29135 if (CONST_INT_P (op))
29136 {
29137 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
29138 op = GEN_INT (INTVAL (op) & mask);
29139 gcc_checking_assert
29140 (insn_data[icode].operand[i + 1].predicate (op, mode));
29141 }
29142 else
29143 {
29144 gcc_checking_assert
29145 (nargs == 2
29146 && insn_data[new_icode].operand[0].mode == tmode
29147 && insn_data[new_icode].operand[1].mode == tmode
29148 && insn_data[new_icode].operand[2].mode == mode
29149 && insn_data[new_icode].operand[0].predicate
29150 == insn_data[icode].operand[0].predicate
29151 && insn_data[new_icode].operand[1].predicate
29152 == insn_data[icode].operand[1].predicate);
29153 icode = new_icode;
29154 goto non_constant;
29155 }
29156 break;
29157 default:
29158 gcc_unreachable ();
29159 }
29160 }
29161 }
29162 else
29163 {
29164 non_constant:
29165 if (VECTOR_MODE_P (mode))
29166 op = safe_vector_operand (op, mode);
29167
29168 /* If we aren't optimizing, only allow one memory operand to be
29169 generated. */
29170 if (memory_operand (op, mode))
29171 num_memory++;
29172
29173 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
29174
29175 if (optimize
29176 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
29177 || num_memory > 1)
29178 op = force_reg (mode, op);
29179 }
29180
29181 args[i].op = op;
29182 args[i].mode = mode;
29183 }
29184
29185 switch (nargs)
29186 {
29187 case 1:
29188 pat = GEN_FCN (icode) (target, args[0].op);
29189 break;
29190
29191 case 2:
29192 if (tf_p)
29193 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
29194 GEN_INT ((int)sub_code));
29195 else if (! comparison_p)
29196 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29197 else
29198 {
29199 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
29200 args[0].op,
29201 args[1].op);
29202
29203 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
29204 }
29205 break;
29206
29207 case 3:
29208 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29209 break;
29210
29211 case 4:
29212 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
29213 break;
29214
29215 default:
29216 gcc_unreachable ();
29217 }
29218
29219 if (! pat)
29220 return 0;
29221
29222 emit_insn (pat);
29223 return target;
29224 }
29225
29226 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
29227 insns with vec_merge. */
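   These are the *_VEC_MERGE builtins (e.g. __builtin_ia32_rcpss or
   __builtin_ia32_sqrtsd): the unary operation is applied to the low
   element and merged with the untouched upper elements of the same
   source, which is why OP1 below is simply a copy of OP0.  */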
29228
29229 static rtx
29230 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
29231 rtx target)
29232 {
29233 rtx pat;
29234 tree arg0 = CALL_EXPR_ARG (exp, 0);
29235 rtx op1, op0 = expand_normal (arg0);
29236 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29237 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29238
29239 if (optimize || !target
29240 || GET_MODE (target) != tmode
29241 || !insn_data[icode].operand[0].predicate (target, tmode))
29242 target = gen_reg_rtx (tmode);
29243
29244 if (VECTOR_MODE_P (mode0))
29245 op0 = safe_vector_operand (op0, mode0);
29246
29247 if ((optimize && !register_operand (op0, mode0))
29248 || !insn_data[icode].operand[1].predicate (op0, mode0))
29249 op0 = copy_to_mode_reg (mode0, op0);
29250
29251 op1 = op0;
29252 if (!insn_data[icode].operand[2].predicate (op1, mode0))
29253 op1 = copy_to_mode_reg (mode0, op1);
29254
29255 pat = GEN_FCN (icode) (target, op0, op1);
29256 if (! pat)
29257 return 0;
29258 emit_insn (pat);
29259 return target;
29260 }
29261
29262 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
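/* D->COMPARISON is folded into the pattern as an extra comparison
   operand; when SWAP is set the operands are exchanged first, so a
   predicate the hardware lacks (e.g. GT) can be expanded through the
   available swapped form (LT).  */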
29263
29264 static rtx
29265 ix86_expand_sse_compare (const struct builtin_description *d,
29266 tree exp, rtx target, bool swap)
29267 {
29268 rtx pat;
29269 tree arg0 = CALL_EXPR_ARG (exp, 0);
29270 tree arg1 = CALL_EXPR_ARG (exp, 1);
29271 rtx op0 = expand_normal (arg0);
29272 rtx op1 = expand_normal (arg1);
29273 rtx op2;
29274 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29275 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29276 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29277 enum rtx_code comparison = d->comparison;
29278
29279 if (VECTOR_MODE_P (mode0))
29280 op0 = safe_vector_operand (op0, mode0);
29281 if (VECTOR_MODE_P (mode1))
29282 op1 = safe_vector_operand (op1, mode1);
29283
29284 /* Swap operands if we have a comparison that isn't available in
29285 hardware. */
29286 if (swap)
29287 {
29288 rtx tmp = gen_reg_rtx (mode1);
29289 emit_move_insn (tmp, op1);
29290 op1 = op0;
29291 op0 = tmp;
29292 }
29293
29294 if (optimize || !target
29295 || GET_MODE (target) != tmode
29296 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29297 target = gen_reg_rtx (tmode);
29298
29299 if ((optimize && !register_operand (op0, mode0))
29300 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
29301 op0 = copy_to_mode_reg (mode0, op0);
29302 if ((optimize && !register_operand (op1, mode1))
29303 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
29304 op1 = copy_to_mode_reg (mode1, op1);
29305
29306 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
29307 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29308 if (! pat)
29309 return 0;
29310 emit_insn (pat);
29311 return target;
29312 }
29313
29314 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
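/* comiss/comisd only set EFLAGS.  The expansion below zeroes an SImode
   pseudo and then sets its low QImode part from the D->COMPARISON flag
   condition, yielding a 0/1 result.  */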
29315
29316 static rtx
29317 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
29318 rtx target)
29319 {
29320 rtx pat;
29321 tree arg0 = CALL_EXPR_ARG (exp, 0);
29322 tree arg1 = CALL_EXPR_ARG (exp, 1);
29323 rtx op0 = expand_normal (arg0);
29324 rtx op1 = expand_normal (arg1);
29325 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29326 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29327 enum rtx_code comparison = d->comparison;
29328
29329 if (VECTOR_MODE_P (mode0))
29330 op0 = safe_vector_operand (op0, mode0);
29331 if (VECTOR_MODE_P (mode1))
29332 op1 = safe_vector_operand (op1, mode1);
29333
29334 /* Swap operands if we have a comparison that isn't available in
29335 hardware. */
29336 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
29337 {
29338 rtx tmp = op1;
29339 op1 = op0;
29340 op0 = tmp;
29341 }
29342
29343 target = gen_reg_rtx (SImode);
29344 emit_move_insn (target, const0_rtx);
29345 target = gen_rtx_SUBREG (QImode, target, 0);
29346
29347 if ((optimize && !register_operand (op0, mode0))
29348 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29349 op0 = copy_to_mode_reg (mode0, op0);
29350 if ((optimize && !register_operand (op1, mode1))
29351 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29352 op1 = copy_to_mode_reg (mode1, op1);
29353
29354 pat = GEN_FCN (d->icode) (op0, op1);
29355 if (! pat)
29356 return 0;
29357 emit_insn (pat);
29358 emit_insn (gen_rtx_SET (VOIDmode,
29359 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29360 gen_rtx_fmt_ee (comparison, QImode,
29361 SET_DEST (pat),
29362 const0_rtx)));
29363
29364 return SUBREG_REG (target);
29365 }
29366
29367 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
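/* For the *_ROUND builtins D->COMPARISON does not hold a comparison
   code at all; it carries the rounding-mode immediate (e.g. ROUND_FLOOR)
   that becomes the last operand of the pattern.  */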
29368
29369 static rtx
29370 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
29371 rtx target)
29372 {
29373 rtx pat;
29374 tree arg0 = CALL_EXPR_ARG (exp, 0);
29375 rtx op1, op0 = expand_normal (arg0);
29376 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29377 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29378
29379 if (optimize || target == 0
29380 || GET_MODE (target) != tmode
29381 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29382 target = gen_reg_rtx (tmode);
29383
29384 if (VECTOR_MODE_P (mode0))
29385 op0 = safe_vector_operand (op0, mode0);
29386
29387 if ((optimize && !register_operand (op0, mode0))
29388 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29389 op0 = copy_to_mode_reg (mode0, op0);
29390
29391 op1 = GEN_INT (d->comparison);
29392
29393 pat = GEN_FCN (d->icode) (target, op0, op1);
29394 if (! pat)
29395 return 0;
29396 emit_insn (pat);
29397 return target;
29398 }
29399
29400 static rtx
29401 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
29402 tree exp, rtx target)
29403 {
29404 rtx pat;
29405 tree arg0 = CALL_EXPR_ARG (exp, 0);
29406 tree arg1 = CALL_EXPR_ARG (exp, 1);
29407 rtx op0 = expand_normal (arg0);
29408 rtx op1 = expand_normal (arg1);
29409 rtx op2;
29410 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29411 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29412 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29413
29414 if (optimize || target == 0
29415 || GET_MODE (target) != tmode
29416 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29417 target = gen_reg_rtx (tmode);
29418
29419 op0 = safe_vector_operand (op0, mode0);
29420 op1 = safe_vector_operand (op1, mode1);
29421
29422 if ((optimize && !register_operand (op0, mode0))
29423 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29424 op0 = copy_to_mode_reg (mode0, op0);
29425 if ((optimize && !register_operand (op1, mode1))
29426 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29427 op1 = copy_to_mode_reg (mode1, op1);
29428
29429 op2 = GEN_INT (d->comparison);
29430
29431 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29432 if (! pat)
29433 return 0;
29434 emit_insn (pat);
29435 return target;
29436 }
29437
29438 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
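/* The ptest/vtestp[sd] patterns only set the flags; D->COMPARISON
   (e.g. EQ for ptestz, LTU for ptestc) selects the flag bit that is
   materialized as a 0/1 value, using the same STRICT_LOW_PART idiom as
   the comi expander above.  */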
29439
29440 static rtx
29441 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
29442 rtx target)
29443 {
29444 rtx pat;
29445 tree arg0 = CALL_EXPR_ARG (exp, 0);
29446 tree arg1 = CALL_EXPR_ARG (exp, 1);
29447 rtx op0 = expand_normal (arg0);
29448 rtx op1 = expand_normal (arg1);
29449 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29450 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29451 enum rtx_code comparison = d->comparison;
29452
29453 if (VECTOR_MODE_P (mode0))
29454 op0 = safe_vector_operand (op0, mode0);
29455 if (VECTOR_MODE_P (mode1))
29456 op1 = safe_vector_operand (op1, mode1);
29457
29458 target = gen_reg_rtx (SImode);
29459 emit_move_insn (target, const0_rtx);
29460 target = gen_rtx_SUBREG (QImode, target, 0);
29461
29462 if ((optimize && !register_operand (op0, mode0))
29463 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29464 op0 = copy_to_mode_reg (mode0, op0);
29465 if ((optimize && !register_operand (op1, mode1))
29466 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29467 op1 = copy_to_mode_reg (mode1, op1);
29468
29469 pat = GEN_FCN (d->icode) (op0, op1);
29470 if (! pat)
29471 return 0;
29472 emit_insn (pat);
29473 emit_insn (gen_rtx_SET (VOIDmode,
29474 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29475 gen_rtx_fmt_ee (comparison, QImode,
29476 SET_DEST (pat),
29477 const0_rtx)));
29478
29479 return SUBREG_REG (target);
29480 }
29481
29482 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
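/* The pcmpestr patterns produce an index result (TMODE0), a mask result
   (TMODE1) and the flags.  D->CODE selects which of the two register
   results becomes the builtin's value; the pcmpestr[acosz] variants
   instead return a single EFLAGS bit via D->FLAG.  The fifth argument
   is the 8-bit mode-control immediate.  */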
29483
29484 static rtx
29485 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
29486 tree exp, rtx target)
29487 {
29488 rtx pat;
29489 tree arg0 = CALL_EXPR_ARG (exp, 0);
29490 tree arg1 = CALL_EXPR_ARG (exp, 1);
29491 tree arg2 = CALL_EXPR_ARG (exp, 2);
29492 tree arg3 = CALL_EXPR_ARG (exp, 3);
29493 tree arg4 = CALL_EXPR_ARG (exp, 4);
29494 rtx scratch0, scratch1;
29495 rtx op0 = expand_normal (arg0);
29496 rtx op1 = expand_normal (arg1);
29497 rtx op2 = expand_normal (arg2);
29498 rtx op3 = expand_normal (arg3);
29499 rtx op4 = expand_normal (arg4);
29500 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
29501
29502 tmode0 = insn_data[d->icode].operand[0].mode;
29503 tmode1 = insn_data[d->icode].operand[1].mode;
29504 modev2 = insn_data[d->icode].operand[2].mode;
29505 modei3 = insn_data[d->icode].operand[3].mode;
29506 modev4 = insn_data[d->icode].operand[4].mode;
29507 modei5 = insn_data[d->icode].operand[5].mode;
29508 modeimm = insn_data[d->icode].operand[6].mode;
29509
29510 if (VECTOR_MODE_P (modev2))
29511 op0 = safe_vector_operand (op0, modev2);
29512 if (VECTOR_MODE_P (modev4))
29513 op2 = safe_vector_operand (op2, modev4);
29514
29515 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29516 op0 = copy_to_mode_reg (modev2, op0);
29517 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
29518 op1 = copy_to_mode_reg (modei3, op1);
29519 if ((optimize && !register_operand (op2, modev4))
29520 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
29521 op2 = copy_to_mode_reg (modev4, op2);
29522 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
29523 op3 = copy_to_mode_reg (modei5, op3);
29524
29525 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
29526 {
29527 error ("the fifth argument must be an 8-bit immediate");
29528 return const0_rtx;
29529 }
29530
29531 if (d->code == IX86_BUILTIN_PCMPESTRI128)
29532 {
29533 if (optimize || !target
29534 || GET_MODE (target) != tmode0
29535 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29536 target = gen_reg_rtx (tmode0);
29537
29538 scratch1 = gen_reg_rtx (tmode1);
29539
29540 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
29541 }
29542 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
29543 {
29544 if (optimize || !target
29545 || GET_MODE (target) != tmode1
29546 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29547 target = gen_reg_rtx (tmode1);
29548
29549 scratch0 = gen_reg_rtx (tmode0);
29550
29551 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
29552 }
29553 else
29554 {
29555 gcc_assert (d->flag);
29556
29557 scratch0 = gen_reg_rtx (tmode0);
29558 scratch1 = gen_reg_rtx (tmode1);
29559
29560 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
29561 }
29562
29563 if (! pat)
29564 return 0;
29565
29566 emit_insn (pat);
29567
29568 if (d->flag)
29569 {
29570 target = gen_reg_rtx (SImode);
29571 emit_move_insn (target, const0_rtx);
29572 target = gen_rtx_SUBREG (QImode, target, 0);
29573
29574 emit_insn
29575 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29576 gen_rtx_fmt_ee (EQ, QImode,
29577 gen_rtx_REG ((enum machine_mode) d->flag,
29578 FLAGS_REG),
29579 const0_rtx)));
29580 return SUBREG_REG (target);
29581 }
29582 else
29583 return target;
29584 }
29585
29586
29587 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
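/* Same structure as the pcmpestr expander above, but these are the
   implicit-length forms, so there are no explicit length operands.  */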
29588
29589 static rtx
29590 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
29591 tree exp, rtx target)
29592 {
29593 rtx pat;
29594 tree arg0 = CALL_EXPR_ARG (exp, 0);
29595 tree arg1 = CALL_EXPR_ARG (exp, 1);
29596 tree arg2 = CALL_EXPR_ARG (exp, 2);
29597 rtx scratch0, scratch1;
29598 rtx op0 = expand_normal (arg0);
29599 rtx op1 = expand_normal (arg1);
29600 rtx op2 = expand_normal (arg2);
29601 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
29602
29603 tmode0 = insn_data[d->icode].operand[0].mode;
29604 tmode1 = insn_data[d->icode].operand[1].mode;
29605 modev2 = insn_data[d->icode].operand[2].mode;
29606 modev3 = insn_data[d->icode].operand[3].mode;
29607 modeimm = insn_data[d->icode].operand[4].mode;
29608
29609 if (VECTOR_MODE_P (modev2))
29610 op0 = safe_vector_operand (op0, modev2);
29611 if (VECTOR_MODE_P (modev3))
29612 op1 = safe_vector_operand (op1, modev3);
29613
29614 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29615 op0 = copy_to_mode_reg (modev2, op0);
29616 if ((optimize && !register_operand (op1, modev3))
29617 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
29618 op1 = copy_to_mode_reg (modev3, op1);
29619
29620 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
29621 {
29622 error ("the third argument must be an 8-bit immediate");
29623 return const0_rtx;
29624 }
29625
29626 if (d->code == IX86_BUILTIN_PCMPISTRI128)
29627 {
29628 if (optimize || !target
29629 || GET_MODE (target) != tmode0
29630 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29631 target = gen_reg_rtx (tmode0);
29632
29633 scratch1 = gen_reg_rtx (tmode1);
29634
29635 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
29636 }
29637 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
29638 {
29639 if (optimize || !target
29640 || GET_MODE (target) != tmode1
29641 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29642 target = gen_reg_rtx (tmode1);
29643
29644 scratch0 = gen_reg_rtx (tmode0);
29645
29646 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
29647 }
29648 else
29649 {
29650 gcc_assert (d->flag);
29651
29652 scratch0 = gen_reg_rtx (tmode0);
29653 scratch1 = gen_reg_rtx (tmode1);
29654
29655 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
29656 }
29657
29658 if (! pat)
29659 return 0;
29660
29661 emit_insn (pat);
29662
29663 if (d->flag)
29664 {
29665 target = gen_reg_rtx (SImode);
29666 emit_move_insn (target, const0_rtx);
29667 target = gen_rtx_SUBREG (QImode, target, 0);
29668
29669 emit_insn
29670 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29671 gen_rtx_fmt_ee (EQ, QImode,
29672 gen_rtx_REG ((enum machine_mode) d->flag,
29673 FLAGS_REG),
29674 const0_rtx)));
29675 return SUBREG_REG (target);
29676 }
29677 else
29678 return target;
29679 }
29680
29681 /* Subroutine of ix86_expand_builtin to take care of insns with
29682 variable number of operands. */
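/* D->FLAG holds an ix86_builtin_func_type.  The switch below derives the
   argument count from it, records how many trailing arguments must be
   immediates (NARGS_CONSTANT), and forwards the special ROUND, PTEST,
   compare and VEC_MERGE variants to the dedicated helpers above.  */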
29683
29684 static rtx
29685 ix86_expand_args_builtin (const struct builtin_description *d,
29686 tree exp, rtx target)
29687 {
29688 rtx pat, real_target;
29689 unsigned int i, nargs;
29690 unsigned int nargs_constant = 0;
29691 int num_memory = 0;
29692 struct
29693 {
29694 rtx op;
29695 enum machine_mode mode;
29696 } args[4];
29697 bool last_arg_count = false;
29698 enum insn_code icode = d->icode;
29699 const struct insn_data_d *insn_p = &insn_data[icode];
29700 enum machine_mode tmode = insn_p->operand[0].mode;
29701 enum machine_mode rmode = VOIDmode;
29702 bool swap = false;
29703 enum rtx_code comparison = d->comparison;
29704
29705 switch ((enum ix86_builtin_func_type) d->flag)
29706 {
29707 case V2DF_FTYPE_V2DF_ROUND:
29708 case V4DF_FTYPE_V4DF_ROUND:
29709 case V4SF_FTYPE_V4SF_ROUND:
29710 case V8SF_FTYPE_V8SF_ROUND:
29711 case V4SI_FTYPE_V4SF_ROUND:
29712 case V8SI_FTYPE_V8SF_ROUND:
29713 return ix86_expand_sse_round (d, exp, target);
29714 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29715 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29716 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29717 case INT_FTYPE_V8SF_V8SF_PTEST:
29718 case INT_FTYPE_V4DI_V4DI_PTEST:
29719 case INT_FTYPE_V4DF_V4DF_PTEST:
29720 case INT_FTYPE_V4SF_V4SF_PTEST:
29721 case INT_FTYPE_V2DI_V2DI_PTEST:
29722 case INT_FTYPE_V2DF_V2DF_PTEST:
29723 return ix86_expand_sse_ptest (d, exp, target);
29724 case FLOAT128_FTYPE_FLOAT128:
29725 case FLOAT_FTYPE_FLOAT:
29726 case INT_FTYPE_INT:
29727 case UINT64_FTYPE_INT:
29728 case UINT16_FTYPE_UINT16:
29729 case INT64_FTYPE_INT64:
29730 case INT64_FTYPE_V4SF:
29731 case INT64_FTYPE_V2DF:
29732 case INT_FTYPE_V16QI:
29733 case INT_FTYPE_V8QI:
29734 case INT_FTYPE_V8SF:
29735 case INT_FTYPE_V4DF:
29736 case INT_FTYPE_V4SF:
29737 case INT_FTYPE_V2DF:
29738 case INT_FTYPE_V32QI:
29739 case V16QI_FTYPE_V16QI:
29740 case V8SI_FTYPE_V8SF:
29741 case V8SI_FTYPE_V4SI:
29742 case V8HI_FTYPE_V8HI:
29743 case V8HI_FTYPE_V16QI:
29744 case V8QI_FTYPE_V8QI:
29745 case V8SF_FTYPE_V8SF:
29746 case V8SF_FTYPE_V8SI:
29747 case V8SF_FTYPE_V4SF:
29748 case V8SF_FTYPE_V8HI:
29749 case V4SI_FTYPE_V4SI:
29750 case V4SI_FTYPE_V16QI:
29751 case V4SI_FTYPE_V4SF:
29752 case V4SI_FTYPE_V8SI:
29753 case V4SI_FTYPE_V8HI:
29754 case V4SI_FTYPE_V4DF:
29755 case V4SI_FTYPE_V2DF:
29756 case V4HI_FTYPE_V4HI:
29757 case V4DF_FTYPE_V4DF:
29758 case V4DF_FTYPE_V4SI:
29759 case V4DF_FTYPE_V4SF:
29760 case V4DF_FTYPE_V2DF:
29761 case V4SF_FTYPE_V4SF:
29762 case V4SF_FTYPE_V4SI:
29763 case V4SF_FTYPE_V8SF:
29764 case V4SF_FTYPE_V4DF:
29765 case V4SF_FTYPE_V8HI:
29766 case V4SF_FTYPE_V2DF:
29767 case V2DI_FTYPE_V2DI:
29768 case V2DI_FTYPE_V16QI:
29769 case V2DI_FTYPE_V8HI:
29770 case V2DI_FTYPE_V4SI:
29771 case V2DF_FTYPE_V2DF:
29772 case V2DF_FTYPE_V4SI:
29773 case V2DF_FTYPE_V4DF:
29774 case V2DF_FTYPE_V4SF:
29775 case V2DF_FTYPE_V2SI:
29776 case V2SI_FTYPE_V2SI:
29777 case V2SI_FTYPE_V4SF:
29778 case V2SI_FTYPE_V2SF:
29779 case V2SI_FTYPE_V2DF:
29780 case V2SF_FTYPE_V2SF:
29781 case V2SF_FTYPE_V2SI:
29782 case V32QI_FTYPE_V32QI:
29783 case V32QI_FTYPE_V16QI:
29784 case V16HI_FTYPE_V16HI:
29785 case V16HI_FTYPE_V8HI:
29786 case V8SI_FTYPE_V8SI:
29787 case V16HI_FTYPE_V16QI:
29788 case V8SI_FTYPE_V16QI:
29789 case V4DI_FTYPE_V16QI:
29790 case V8SI_FTYPE_V8HI:
29791 case V4DI_FTYPE_V8HI:
29792 case V4DI_FTYPE_V4SI:
29793 case V4DI_FTYPE_V2DI:
29794 nargs = 1;
29795 break;
29796 case V4SF_FTYPE_V4SF_VEC_MERGE:
29797 case V2DF_FTYPE_V2DF_VEC_MERGE:
29798 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29799 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29800 case V16QI_FTYPE_V16QI_V16QI:
29801 case V16QI_FTYPE_V8HI_V8HI:
29802 case V8QI_FTYPE_V8QI_V8QI:
29803 case V8QI_FTYPE_V4HI_V4HI:
29804 case V8HI_FTYPE_V8HI_V8HI:
29805 case V8HI_FTYPE_V16QI_V16QI:
29806 case V8HI_FTYPE_V4SI_V4SI:
29807 case V8SF_FTYPE_V8SF_V8SF:
29808 case V8SF_FTYPE_V8SF_V8SI:
29809 case V4SI_FTYPE_V4SI_V4SI:
29810 case V4SI_FTYPE_V8HI_V8HI:
29811 case V4SI_FTYPE_V4SF_V4SF:
29812 case V4SI_FTYPE_V2DF_V2DF:
29813 case V4HI_FTYPE_V4HI_V4HI:
29814 case V4HI_FTYPE_V8QI_V8QI:
29815 case V4HI_FTYPE_V2SI_V2SI:
29816 case V4DF_FTYPE_V4DF_V4DF:
29817 case V4DF_FTYPE_V4DF_V4DI:
29818 case V4SF_FTYPE_V4SF_V4SF:
29819 case V4SF_FTYPE_V4SF_V4SI:
29820 case V4SF_FTYPE_V4SF_V2SI:
29821 case V4SF_FTYPE_V4SF_V2DF:
29822 case V4SF_FTYPE_V4SF_DI:
29823 case V4SF_FTYPE_V4SF_SI:
29824 case V2DI_FTYPE_V2DI_V2DI:
29825 case V2DI_FTYPE_V16QI_V16QI:
29826 case V2DI_FTYPE_V4SI_V4SI:
29827 case V2UDI_FTYPE_V4USI_V4USI:
29828 case V2DI_FTYPE_V2DI_V16QI:
29829 case V2DI_FTYPE_V2DF_V2DF:
29830 case V2SI_FTYPE_V2SI_V2SI:
29831 case V2SI_FTYPE_V4HI_V4HI:
29832 case V2SI_FTYPE_V2SF_V2SF:
29833 case V2DF_FTYPE_V2DF_V2DF:
29834 case V2DF_FTYPE_V2DF_V4SF:
29835 case V2DF_FTYPE_V2DF_V2DI:
29836 case V2DF_FTYPE_V2DF_DI:
29837 case V2DF_FTYPE_V2DF_SI:
29838 case V2SF_FTYPE_V2SF_V2SF:
29839 case V1DI_FTYPE_V1DI_V1DI:
29840 case V1DI_FTYPE_V8QI_V8QI:
29841 case V1DI_FTYPE_V2SI_V2SI:
29842 case V32QI_FTYPE_V16HI_V16HI:
29843 case V16HI_FTYPE_V8SI_V8SI:
29844 case V32QI_FTYPE_V32QI_V32QI:
29845 case V16HI_FTYPE_V32QI_V32QI:
29846 case V16HI_FTYPE_V16HI_V16HI:
29847 case V8SI_FTYPE_V4DF_V4DF:
29848 case V8SI_FTYPE_V8SI_V8SI:
29849 case V8SI_FTYPE_V16HI_V16HI:
29850 case V4DI_FTYPE_V4DI_V4DI:
29851 case V4DI_FTYPE_V8SI_V8SI:
29852 case V4UDI_FTYPE_V8USI_V8USI:
29853 if (comparison == UNKNOWN)
29854 return ix86_expand_binop_builtin (icode, exp, target);
29855 nargs = 2;
29856 break;
29857 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29858 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29859 gcc_assert (comparison != UNKNOWN);
29860 nargs = 2;
29861 swap = true;
29862 break;
29863 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29864 case V16HI_FTYPE_V16HI_SI_COUNT:
29865 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29866 case V8SI_FTYPE_V8SI_SI_COUNT:
29867 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29868 case V4DI_FTYPE_V4DI_INT_COUNT:
29869 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29870 case V8HI_FTYPE_V8HI_SI_COUNT:
29871 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29872 case V4SI_FTYPE_V4SI_SI_COUNT:
29873 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29874 case V4HI_FTYPE_V4HI_SI_COUNT:
29875 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29876 case V2DI_FTYPE_V2DI_SI_COUNT:
29877 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29878 case V2SI_FTYPE_V2SI_SI_COUNT:
29879 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29880 case V1DI_FTYPE_V1DI_SI_COUNT:
29881 nargs = 2;
29882 last_arg_count = true;
29883 break;
29884 case UINT64_FTYPE_UINT64_UINT64:
29885 case UINT_FTYPE_UINT_UINT:
29886 case UINT_FTYPE_UINT_USHORT:
29887 case UINT_FTYPE_UINT_UCHAR:
29888 case UINT16_FTYPE_UINT16_INT:
29889 case UINT8_FTYPE_UINT8_INT:
29890 nargs = 2;
29891 break;
29892 case V2DI_FTYPE_V2DI_INT_CONVERT:
29893 nargs = 2;
29894 rmode = V1TImode;
29895 nargs_constant = 1;
29896 break;
29897 case V4DI_FTYPE_V4DI_INT_CONVERT:
29898 nargs = 2;
29899 rmode = V2TImode;
29900 nargs_constant = 1;
29901 break;
29902 case V8HI_FTYPE_V8HI_INT:
29903 case V8HI_FTYPE_V8SF_INT:
29904 case V8HI_FTYPE_V4SF_INT:
29905 case V8SF_FTYPE_V8SF_INT:
29906 case V4SI_FTYPE_V4SI_INT:
29907 case V4SI_FTYPE_V8SI_INT:
29908 case V4HI_FTYPE_V4HI_INT:
29909 case V4DF_FTYPE_V4DF_INT:
29910 case V4SF_FTYPE_V4SF_INT:
29911 case V4SF_FTYPE_V8SF_INT:
29912 case V2DI_FTYPE_V2DI_INT:
29913 case V2DF_FTYPE_V2DF_INT:
29914 case V2DF_FTYPE_V4DF_INT:
29915 case V16HI_FTYPE_V16HI_INT:
29916 case V8SI_FTYPE_V8SI_INT:
29917 case V4DI_FTYPE_V4DI_INT:
29918 case V2DI_FTYPE_V4DI_INT:
29919 nargs = 2;
29920 nargs_constant = 1;
29921 break;
29922 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29923 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29924 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29925 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29926 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29927 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29928 nargs = 3;
29929 break;
29930 case V32QI_FTYPE_V32QI_V32QI_INT:
29931 case V16HI_FTYPE_V16HI_V16HI_INT:
29932 case V16QI_FTYPE_V16QI_V16QI_INT:
29933 case V4DI_FTYPE_V4DI_V4DI_INT:
29934 case V8HI_FTYPE_V8HI_V8HI_INT:
29935 case V8SI_FTYPE_V8SI_V8SI_INT:
29936 case V8SI_FTYPE_V8SI_V4SI_INT:
29937 case V8SF_FTYPE_V8SF_V8SF_INT:
29938 case V8SF_FTYPE_V8SF_V4SF_INT:
29939 case V4SI_FTYPE_V4SI_V4SI_INT:
29940 case V4DF_FTYPE_V4DF_V4DF_INT:
29941 case V4DF_FTYPE_V4DF_V2DF_INT:
29942 case V4SF_FTYPE_V4SF_V4SF_INT:
29943 case V2DI_FTYPE_V2DI_V2DI_INT:
29944 case V4DI_FTYPE_V4DI_V2DI_INT:
29945 case V2DF_FTYPE_V2DF_V2DF_INT:
29946 nargs = 3;
29947 nargs_constant = 1;
29948 break;
29949 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29950 nargs = 3;
29951 rmode = V4DImode;
29952 nargs_constant = 1;
29953 break;
29954 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29955 nargs = 3;
29956 rmode = V2DImode;
29957 nargs_constant = 1;
29958 break;
29959 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29960 nargs = 3;
29961 rmode = DImode;
29962 nargs_constant = 1;
29963 break;
29964 case V2DI_FTYPE_V2DI_UINT_UINT:
29965 nargs = 3;
29966 nargs_constant = 2;
29967 break;
29968 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29969 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29970 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29971 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29972 nargs = 4;
29973 nargs_constant = 1;
29974 break;
29975 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29976 nargs = 4;
29977 nargs_constant = 2;
29978 break;
29979 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
29980 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
29981 nargs = 4;
29982 break;
29983 default:
29984 gcc_unreachable ();
29985 }
29986
29987 gcc_assert (nargs <= ARRAY_SIZE (args));
29988
29989 if (comparison != UNKNOWN)
29990 {
29991 gcc_assert (nargs == 2);
29992 return ix86_expand_sse_compare (d, exp, target, swap);
29993 }
29994
29995 if (rmode == VOIDmode || rmode == tmode)
29996 {
29997 if (optimize
29998 || target == 0
29999 || GET_MODE (target) != tmode
30000 || !insn_p->operand[0].predicate (target, tmode))
30001 target = gen_reg_rtx (tmode);
30002 real_target = target;
30003 }
30004 else
30005 {
30006 target = gen_reg_rtx (rmode);
30007 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30008 }
30009
30010 for (i = 0; i < nargs; i++)
30011 {
30012 tree arg = CALL_EXPR_ARG (exp, i);
30013 rtx op = expand_normal (arg);
30014 enum machine_mode mode = insn_p->operand[i + 1].mode;
30015 bool match = insn_p->operand[i + 1].predicate (op, mode);
30016
30017 if (last_arg_count && (i + 1) == nargs)
30018 {
30019	  /* SIMD shift insns take either an 8-bit immediate or a
30020	     register as the count.  But the builtin functions take an
30021	     int as the count; if it doesn't match, we put it in a register.  */
30022 if (!match)
30023 {
30024 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
30025 if (!insn_p->operand[i + 1].predicate (op, mode))
30026 op = copy_to_reg (op);
30027 }
30028 }
30029 else if ((nargs - i) <= nargs_constant)
30030 {
30031 if (!match)
30032 switch (icode)
30033 {
30034 case CODE_FOR_avx2_inserti128:
30035 case CODE_FOR_avx2_extracti128:
30036		  error ("the last argument must be a 1-bit immediate");
30037 return const0_rtx;
30038
30039 case CODE_FOR_sse4_1_roundsd:
30040 case CODE_FOR_sse4_1_roundss:
30041
30042 case CODE_FOR_sse4_1_roundpd:
30043 case CODE_FOR_sse4_1_roundps:
30044 case CODE_FOR_avx_roundpd256:
30045 case CODE_FOR_avx_roundps256:
30046
30047 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
30048 case CODE_FOR_sse4_1_roundps_sfix:
30049 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
30050 case CODE_FOR_avx_roundps_sfix256:
30051
30052 case CODE_FOR_sse4_1_blendps:
30053 case CODE_FOR_avx_blendpd256:
30054 case CODE_FOR_avx_vpermilv4df:
30055 error ("the last argument must be a 4-bit immediate");
30056 return const0_rtx;
30057
30058 case CODE_FOR_sse4_1_blendpd:
30059 case CODE_FOR_avx_vpermilv2df:
30060 case CODE_FOR_xop_vpermil2v2df3:
30061 case CODE_FOR_xop_vpermil2v4sf3:
30062 case CODE_FOR_xop_vpermil2v4df3:
30063 case CODE_FOR_xop_vpermil2v8sf3:
30064 error ("the last argument must be a 2-bit immediate");
30065 return const0_rtx;
30066
30067 case CODE_FOR_avx_vextractf128v4df:
30068 case CODE_FOR_avx_vextractf128v8sf:
30069 case CODE_FOR_avx_vextractf128v8si:
30070 case CODE_FOR_avx_vinsertf128v4df:
30071 case CODE_FOR_avx_vinsertf128v8sf:
30072 case CODE_FOR_avx_vinsertf128v8si:
30073 error ("the last argument must be a 1-bit immediate");
30074 return const0_rtx;
30075
30076 case CODE_FOR_avx_vmcmpv2df3:
30077 case CODE_FOR_avx_vmcmpv4sf3:
30078 case CODE_FOR_avx_cmpv2df3:
30079 case CODE_FOR_avx_cmpv4sf3:
30080 case CODE_FOR_avx_cmpv4df3:
30081 case CODE_FOR_avx_cmpv8sf3:
30082 error ("the last argument must be a 5-bit immediate");
30083 return const0_rtx;
30084
30085 default:
30086 switch (nargs_constant)
30087 {
30088 case 2:
30089 if ((nargs - i) == nargs_constant)
30090 {
30091 error ("the next to last argument must be an 8-bit immediate");
30092 break;
30093 }
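		  /* FALLTHRU */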
30094 case 1:
30095 error ("the last argument must be an 8-bit immediate");
30096 break;
30097 default:
30098 gcc_unreachable ();
30099 }
30100 return const0_rtx;
30101 }
30102 }
30103 else
30104 {
30105 if (VECTOR_MODE_P (mode))
30106 op = safe_vector_operand (op, mode);
30107
30108 /* If we aren't optimizing, only allow one memory operand to
30109 be generated. */
30110 if (memory_operand (op, mode))
30111 num_memory++;
30112
30113 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
30114 {
30115 if (optimize || !match || num_memory > 1)
30116 op = copy_to_mode_reg (mode, op);
30117 }
30118 else
30119 {
30120 op = copy_to_reg (op);
30121 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
30122 }
30123 }
30124
30125 args[i].op = op;
30126 args[i].mode = mode;
30127 }
30128
30129 switch (nargs)
30130 {
30131 case 1:
30132 pat = GEN_FCN (icode) (real_target, args[0].op);
30133 break;
30134 case 2:
30135 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
30136 break;
30137 case 3:
30138 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30139 args[2].op);
30140 break;
30141 case 4:
30142 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30143 args[2].op, args[3].op);
30144 break;
30145 default:
30146 gcc_unreachable ();
30147 }
30148
30149 if (! pat)
30150 return 0;
30151
30152 emit_insn (pat);
30153 return target;
30154 }
30155
30156 /* Subroutine of ix86_expand_builtin to take care of special insns
30157 with variable number of operands. */
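/* Most of these builtins load from or store through a pointer argument.
   KLASS says which; for stores the pointed-to location becomes the
   destination (TARGET).  MEMORY is the index of the operand that must be
   wrapped in a MEM, with ARRAY_SIZE (args) meaning that none of the
   remaining operands is a memory reference.  */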
30158
30159 static rtx
30160 ix86_expand_special_args_builtin (const struct builtin_description *d,
30161 tree exp, rtx target)
30162 {
30163 tree arg;
30164 rtx pat, op;
30165 unsigned int i, nargs, arg_adjust, memory;
30166 struct
30167 {
30168 rtx op;
30169 enum machine_mode mode;
30170 } args[3];
30171 enum insn_code icode = d->icode;
30172 bool last_arg_constant = false;
30173 const struct insn_data_d *insn_p = &insn_data[icode];
30174 enum machine_mode tmode = insn_p->operand[0].mode;
30175 enum { load, store } klass;
30176
30177 switch ((enum ix86_builtin_func_type) d->flag)
30178 {
30179 case VOID_FTYPE_VOID:
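      /* The vzeroupper pattern carries an operand describing its origin;
	 marking explicit intrinsic uses here presumably lets the
	 vzeroupper placement code (move_or_delete_vzeroupper) treat them
	 differently from compiler-generated ones.  */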
30180 if (icode == CODE_FOR_avx_vzeroupper)
30181 target = GEN_INT (vzeroupper_intrinsic);
30182 emit_insn (GEN_FCN (icode) (target));
30183 return 0;
30184 case VOID_FTYPE_UINT64:
30185 case VOID_FTYPE_UNSIGNED:
30186 nargs = 0;
30187 klass = store;
30188 memory = 0;
30189 break;
30190
30191 case INT_FTYPE_VOID:
30192 case UINT64_FTYPE_VOID:
30193 case UNSIGNED_FTYPE_VOID:
30194 nargs = 0;
30195 klass = load;
30196 memory = 0;
30197 break;
30198 case UINT64_FTYPE_PUNSIGNED:
30199 case V2DI_FTYPE_PV2DI:
30200 case V4DI_FTYPE_PV4DI:
30201 case V32QI_FTYPE_PCCHAR:
30202 case V16QI_FTYPE_PCCHAR:
30203 case V8SF_FTYPE_PCV4SF:
30204 case V8SF_FTYPE_PCFLOAT:
30205 case V4SF_FTYPE_PCFLOAT:
30206 case V4DF_FTYPE_PCV2DF:
30207 case V4DF_FTYPE_PCDOUBLE:
30208 case V2DF_FTYPE_PCDOUBLE:
30209 case VOID_FTYPE_PVOID:
30210 nargs = 1;
30211 klass = load;
30212 memory = 0;
30213 break;
30214 case VOID_FTYPE_PV2SF_V4SF:
30215 case VOID_FTYPE_PV4DI_V4DI:
30216 case VOID_FTYPE_PV2DI_V2DI:
30217 case VOID_FTYPE_PCHAR_V32QI:
30218 case VOID_FTYPE_PCHAR_V16QI:
30219 case VOID_FTYPE_PFLOAT_V8SF:
30220 case VOID_FTYPE_PFLOAT_V4SF:
30221 case VOID_FTYPE_PDOUBLE_V4DF:
30222 case VOID_FTYPE_PDOUBLE_V2DF:
30223 case VOID_FTYPE_PLONGLONG_LONGLONG:
30224 case VOID_FTYPE_PULONGLONG_ULONGLONG:
30225 case VOID_FTYPE_PINT_INT:
30226 nargs = 1;
30227 klass = store;
30228 /* Reserve memory operand for target. */
30229 memory = ARRAY_SIZE (args);
30230 break;
30231 case V4SF_FTYPE_V4SF_PCV2SF:
30232 case V2DF_FTYPE_V2DF_PCDOUBLE:
30233 nargs = 2;
30234 klass = load;
30235 memory = 1;
30236 break;
30237 case V8SF_FTYPE_PCV8SF_V8SI:
30238 case V4DF_FTYPE_PCV4DF_V4DI:
30239 case V4SF_FTYPE_PCV4SF_V4SI:
30240 case V2DF_FTYPE_PCV2DF_V2DI:
30241 case V8SI_FTYPE_PCV8SI_V8SI:
30242 case V4DI_FTYPE_PCV4DI_V4DI:
30243 case V4SI_FTYPE_PCV4SI_V4SI:
30244 case V2DI_FTYPE_PCV2DI_V2DI:
30245 nargs = 2;
30246 klass = load;
30247 memory = 0;
30248 break;
30249 case VOID_FTYPE_PV8SF_V8SI_V8SF:
30250 case VOID_FTYPE_PV4DF_V4DI_V4DF:
30251 case VOID_FTYPE_PV4SF_V4SI_V4SF:
30252 case VOID_FTYPE_PV2DF_V2DI_V2DF:
30253 case VOID_FTYPE_PV8SI_V8SI_V8SI:
30254 case VOID_FTYPE_PV4DI_V4DI_V4DI:
30255 case VOID_FTYPE_PV4SI_V4SI_V4SI:
30256 case VOID_FTYPE_PV2DI_V2DI_V2DI:
30257 nargs = 2;
30258 klass = store;
30259 /* Reserve memory operand for target. */
30260 memory = ARRAY_SIZE (args);
30261 break;
30262 case VOID_FTYPE_UINT_UINT_UINT:
30263 case VOID_FTYPE_UINT64_UINT_UINT:
30264 case UCHAR_FTYPE_UINT_UINT_UINT:
30265 case UCHAR_FTYPE_UINT64_UINT_UINT:
30266 nargs = 3;
30267 klass = load;
30268 memory = ARRAY_SIZE (args);
30269 last_arg_constant = true;
30270 break;
30271 default:
30272 gcc_unreachable ();
30273 }
30274
30275 gcc_assert (nargs <= ARRAY_SIZE (args));
30276
30277 if (klass == store)
30278 {
30279 arg = CALL_EXPR_ARG (exp, 0);
30280 op = expand_normal (arg);
30281 gcc_assert (target == 0);
30282 if (memory)
30283 {
30284 if (GET_MODE (op) != Pmode)
30285 op = convert_to_mode (Pmode, op, 1);
30286 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
30287 }
30288 else
30289 target = force_reg (tmode, op);
30290 arg_adjust = 1;
30291 }
30292 else
30293 {
30294 arg_adjust = 0;
30295 if (optimize
30296 || target == 0
30297 || !register_operand (target, tmode)
30298 || GET_MODE (target) != tmode)
30299 target = gen_reg_rtx (tmode);
30300 }
30301
30302 for (i = 0; i < nargs; i++)
30303 {
30304 enum machine_mode mode = insn_p->operand[i + 1].mode;
30305 bool match;
30306
30307 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
30308 op = expand_normal (arg);
30309 match = insn_p->operand[i + 1].predicate (op, mode);
30310
30311 if (last_arg_constant && (i + 1) == nargs)
30312 {
30313 if (!match)
30314 {
30315 if (icode == CODE_FOR_lwp_lwpvalsi3
30316 || icode == CODE_FOR_lwp_lwpinssi3
30317 || icode == CODE_FOR_lwp_lwpvaldi3
30318 || icode == CODE_FOR_lwp_lwpinsdi3)
30319 error ("the last argument must be a 32-bit immediate");
30320 else
30321 error ("the last argument must be an 8-bit immediate");
30322 return const0_rtx;
30323 }
30324 }
30325 else
30326 {
30327 if (i == memory)
30328 {
30329 /* This must be the memory operand. */
30330 if (GET_MODE (op) != Pmode)
30331 op = convert_to_mode (Pmode, op, 1);
30332 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
30333 gcc_assert (GET_MODE (op) == mode
30334 || GET_MODE (op) == VOIDmode);
30335 }
30336 else
30337 {
30338	      /* This must be a register.  */
30339 if (VECTOR_MODE_P (mode))
30340 op = safe_vector_operand (op, mode);
30341
30342 gcc_assert (GET_MODE (op) == mode
30343 || GET_MODE (op) == VOIDmode);
30344 op = copy_to_mode_reg (mode, op);
30345 }
30346 }
30347
30348 args[i].op = op;
30349 args[i].mode = mode;
30350 }
30351
30352 switch (nargs)
30353 {
30354 case 0:
30355 pat = GEN_FCN (icode) (target);
30356 break;
30357 case 1:
30358 pat = GEN_FCN (icode) (target, args[0].op);
30359 break;
30360 case 2:
30361 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30362 break;
30363 case 3:
30364 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30365 break;
30366 default:
30367 gcc_unreachable ();
30368 }
30369
30370 if (! pat)
30371 return 0;
30372 emit_insn (pat);
30373 return klass == store ? 0 : target;
30374 }
30375
30376 /* Return the integer constant in ARG. Constrain it to be in the range
30377 of the subparts of VEC_TYPE; issue an error if not. */
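/* For example, with a V4SF vector type MAX is 3, so a selector of 4 is
   diagnosed and 0 is returned in its place.  */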
30378
30379 static int
30380 get_element_number (tree vec_type, tree arg)
30381 {
30382 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
30383
30384 if (!host_integerp (arg, 1)
30385 || (elt = tree_low_cst (arg, 1), elt > max))
30386 {
30387 error ("selector must be an integer constant in the range 0..%wi", max);
30388 return 0;
30389 }
30390
30391 return elt;
30392 }
30393
30394 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30395 ix86_expand_vector_init. We DO have language-level syntax for this, in
30396 the form of (type){ init-list }. Except that since we can't place emms
30397 instructions from inside the compiler, we can't allow the use of MMX
30398 registers unless the user explicitly asks for it. So we do *not* define
30399 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
30400 we have builtins invoked by mmintrin.h that gives us license to emit
30401 these sorts of instructions. */
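   Each call argument supplies one vector element; e.g.
   __builtin_ia32_vec_init_v2si takes two ints and yields a V2SImode
   value (mmintrin.h's _mm_set_pi32 and friends are built on it).  */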
30402
30403 static rtx
30404 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
30405 {
30406 enum machine_mode tmode = TYPE_MODE (type);
30407 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
30408 int i, n_elt = GET_MODE_NUNITS (tmode);
30409 rtvec v = rtvec_alloc (n_elt);
30410
30411 gcc_assert (VECTOR_MODE_P (tmode));
30412 gcc_assert (call_expr_nargs (exp) == n_elt);
30413
30414 for (i = 0; i < n_elt; ++i)
30415 {
30416 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
30417 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
30418 }
30419
30420 if (!target || !register_operand (target, tmode))
30421 target = gen_reg_rtx (tmode);
30422
30423 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
30424 return target;
30425 }
30426
30427 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30428 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
30429 had a language-level syntax for referencing vector elements. */
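   The second call argument is the constant element index, validated by
   get_element_number above; e.g. __builtin_ia32_vec_ext_v4sf (x, 0)
   yields the low SFmode element of X.  */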
30430
30431 static rtx
30432 ix86_expand_vec_ext_builtin (tree exp, rtx target)
30433 {
30434 enum machine_mode tmode, mode0;
30435 tree arg0, arg1;
30436 int elt;
30437 rtx op0;
30438
30439 arg0 = CALL_EXPR_ARG (exp, 0);
30440 arg1 = CALL_EXPR_ARG (exp, 1);
30441
30442 op0 = expand_normal (arg0);
30443 elt = get_element_number (TREE_TYPE (arg0), arg1);
30444
30445 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30446 mode0 = TYPE_MODE (TREE_TYPE (arg0));
30447 gcc_assert (VECTOR_MODE_P (mode0));
30448
30449 op0 = force_reg (mode0, op0);
30450
30451 if (optimize || !target || !register_operand (target, tmode))
30452 target = gen_reg_rtx (tmode);
30453
30454 ix86_expand_vector_extract (true, target, op0, elt);
30455
30456 return target;
30457 }
30458
30459 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30460 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
30461 a language-level syntax for referencing vector elements. */
30462
30463 static rtx
30464 ix86_expand_vec_set_builtin (tree exp)
30465 {
30466 enum machine_mode tmode, mode1;
30467 tree arg0, arg1, arg2;
30468 int elt;
30469 rtx op0, op1, target;
30470
30471 arg0 = CALL_EXPR_ARG (exp, 0);
30472 arg1 = CALL_EXPR_ARG (exp, 1);
30473 arg2 = CALL_EXPR_ARG (exp, 2);
30474
30475 tmode = TYPE_MODE (TREE_TYPE (arg0));
30476 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30477 gcc_assert (VECTOR_MODE_P (tmode));
30478
30479 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
30480 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
30481 elt = get_element_number (TREE_TYPE (arg0), arg2);
30482
30483 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
30484 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
30485
30486 op0 = force_reg (tmode, op0);
30487 op1 = force_reg (mode1, op1);
30488
30489 /* OP0 is the source of these builtin functions and shouldn't be
30490 modified. Create a copy, use it and return it as target. */
30491 target = gen_reg_rtx (tmode);
30492 emit_move_insn (target, op0);
30493 ix86_expand_vector_set (true, target, op1, elt);
30494
30495 return target;
30496 }
30497
30498 /* Expand an expression EXP that calls a built-in function,
30499 with result going to TARGET if that's convenient
30500 (and in mode MODE if that's convenient).
30501 SUBTARGET may be used as the target for computing one of EXP's operands.
30502 IGNORE is nonzero if the value is to be ignored. */
30503
30504 static rtx
30505 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
30506 enum machine_mode mode ATTRIBUTE_UNUSED,
30507 int ignore ATTRIBUTE_UNUSED)
30508 {
30509 const struct builtin_description *d;
30510 size_t i;
30511 enum insn_code icode;
30512 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
30513 tree arg0, arg1, arg2, arg3, arg4;
30514 rtx op0, op1, op2, op3, op4, pat, insn;
30515 enum machine_mode mode0, mode1, mode2, mode3, mode4;
30516 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
30517
30518 /* For CPU builtins that can be folded, fold first and expand the fold. */
30519 switch (fcode)
30520 {
30521 case IX86_BUILTIN_CPU_INIT:
30522 {
30523 /* Make it call __cpu_indicator_init in libgcc. */
30524 tree call_expr, fndecl, type;
30525 type = build_function_type_list (integer_type_node, NULL_TREE);
30526 fndecl = build_fn_decl ("__cpu_indicator_init", type);
30527 call_expr = build_call_expr (fndecl, 0);
30528 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
30529 }
30530 case IX86_BUILTIN_CPU_IS:
30531 case IX86_BUILTIN_CPU_SUPPORTS:
30532 {
30533 tree arg0 = CALL_EXPR_ARG (exp, 0);
30534 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
30535 gcc_assert (fold_expr != NULL_TREE);
30536 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
30537 }
30538 }
30539
30540 /* Determine whether the builtin function is available under the current ISA.
30541 Originally the builtin was not created if it wasn't applicable to the
30542 current ISA based on the command line switches. With function specific
30543 options, we need to check in the context of the function making the call
30544 whether it is supported. */
30545 if (ix86_builtins_isa[fcode].isa
30546 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
30547 {
30548 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
30549 NULL, (enum fpmath_unit) 0, false);
30550
30551 if (!opts)
30552 error ("%qE needs unknown isa option", fndecl);
30553 else
30554 {
30555 gcc_assert (opts != NULL);
30556 error ("%qE needs isa option %s", fndecl, opts);
30557 free (opts);
30558 }
30559 return const0_rtx;
30560 }
30561
30562 switch (fcode)
30563 {
30564 case IX86_BUILTIN_MASKMOVQ:
30565 case IX86_BUILTIN_MASKMOVDQU:
30566 icode = (fcode == IX86_BUILTIN_MASKMOVQ
30567 ? CODE_FOR_mmx_maskmovq
30568 : CODE_FOR_sse2_maskmovdqu);
30569 /* Note the arg order is different from the operand order. */
30570 arg1 = CALL_EXPR_ARG (exp, 0);
30571 arg2 = CALL_EXPR_ARG (exp, 1);
30572 arg0 = CALL_EXPR_ARG (exp, 2);
30573 op0 = expand_normal (arg0);
30574 op1 = expand_normal (arg1);
30575 op2 = expand_normal (arg2);
30576 mode0 = insn_data[icode].operand[0].mode;
30577 mode1 = insn_data[icode].operand[1].mode;
30578 mode2 = insn_data[icode].operand[2].mode;
30579
30580 if (GET_MODE (op0) != Pmode)
30581 op0 = convert_to_mode (Pmode, op0, 1);
30582 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
30583
30584 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30585 op0 = copy_to_mode_reg (mode0, op0);
30586 if (!insn_data[icode].operand[1].predicate (op1, mode1))
30587 op1 = copy_to_mode_reg (mode1, op1);
30588 if (!insn_data[icode].operand[2].predicate (op2, mode2))
30589 op2 = copy_to_mode_reg (mode2, op2);
30590 pat = GEN_FCN (icode) (op0, op1, op2);
30591 if (! pat)
30592 return 0;
30593 emit_insn (pat);
30594 return 0;
30595
30596 case IX86_BUILTIN_LDMXCSR:
30597 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
30598 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30599 emit_move_insn (target, op0);
30600 emit_insn (gen_sse_ldmxcsr (target));
30601 return 0;
30602
30603 case IX86_BUILTIN_STMXCSR:
30604 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30605 emit_insn (gen_sse_stmxcsr (target));
30606 return copy_to_mode_reg (SImode, target);
30607
30608 case IX86_BUILTIN_CLFLUSH:
30609 arg0 = CALL_EXPR_ARG (exp, 0);
30610 op0 = expand_normal (arg0);
30611 icode = CODE_FOR_sse2_clflush;
30612 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30613 {
30614 if (GET_MODE (op0) != Pmode)
30615 op0 = convert_to_mode (Pmode, op0, 1);
30616 op0 = force_reg (Pmode, op0);
30617 }
30618
30619 emit_insn (gen_sse2_clflush (op0));
30620 return 0;
30621
30622 case IX86_BUILTIN_MONITOR:
30623 arg0 = CALL_EXPR_ARG (exp, 0);
30624 arg1 = CALL_EXPR_ARG (exp, 1);
30625 arg2 = CALL_EXPR_ARG (exp, 2);
30626 op0 = expand_normal (arg0);
30627 op1 = expand_normal (arg1);
30628 op2 = expand_normal (arg2);
30629 if (!REG_P (op0))
30630 {
30631 if (GET_MODE (op0) != Pmode)
30632 op0 = convert_to_mode (Pmode, op0, 1);
30633 op0 = force_reg (Pmode, op0);
30634 }
30635 if (!REG_P (op1))
30636 op1 = copy_to_mode_reg (SImode, op1);
30637 if (!REG_P (op2))
30638 op2 = copy_to_mode_reg (SImode, op2);
30639 emit_insn (ix86_gen_monitor (op0, op1, op2));
30640 return 0;
30641
30642 case IX86_BUILTIN_MWAIT:
30643 arg0 = CALL_EXPR_ARG (exp, 0);
30644 arg1 = CALL_EXPR_ARG (exp, 1);
30645 op0 = expand_normal (arg0);
30646 op1 = expand_normal (arg1);
30647 if (!REG_P (op0))
30648 op0 = copy_to_mode_reg (SImode, op0);
30649 if (!REG_P (op1))
30650 op1 = copy_to_mode_reg (SImode, op1);
30651 emit_insn (gen_sse3_mwait (op0, op1));
30652 return 0;
30653
30654 case IX86_BUILTIN_VEC_INIT_V2SI:
30655 case IX86_BUILTIN_VEC_INIT_V4HI:
30656 case IX86_BUILTIN_VEC_INIT_V8QI:
30657 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
30658
30659 case IX86_BUILTIN_VEC_EXT_V2DF:
30660 case IX86_BUILTIN_VEC_EXT_V2DI:
30661 case IX86_BUILTIN_VEC_EXT_V4SF:
30662 case IX86_BUILTIN_VEC_EXT_V4SI:
30663 case IX86_BUILTIN_VEC_EXT_V8HI:
30664 case IX86_BUILTIN_VEC_EXT_V2SI:
30665 case IX86_BUILTIN_VEC_EXT_V4HI:
30666 case IX86_BUILTIN_VEC_EXT_V16QI:
30667 return ix86_expand_vec_ext_builtin (exp, target);
30668
30669 case IX86_BUILTIN_VEC_SET_V2DI:
30670 case IX86_BUILTIN_VEC_SET_V4SF:
30671 case IX86_BUILTIN_VEC_SET_V4SI:
30672 case IX86_BUILTIN_VEC_SET_V8HI:
30673 case IX86_BUILTIN_VEC_SET_V4HI:
30674 case IX86_BUILTIN_VEC_SET_V16QI:
30675 return ix86_expand_vec_set_builtin (exp);
30676
30677 case IX86_BUILTIN_INFQ:
30678 case IX86_BUILTIN_HUGE_VALQ:
30679 {
30680 REAL_VALUE_TYPE inf;
30681 rtx tmp;
30682
30683 real_inf (&inf);
30684 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
30685
30686 tmp = validize_mem (force_const_mem (mode, tmp));
30687
30688 if (target == 0)
30689 target = gen_reg_rtx (mode);
30690
30691 emit_move_insn (target, tmp);
30692 return target;
30693 }
30694
30695 case IX86_BUILTIN_RDPMC:
30696 case IX86_BUILTIN_RDTSC:
30697 case IX86_BUILTIN_RDTSCP:
30698
30699 op0 = gen_reg_rtx (DImode);
30700 op1 = gen_reg_rtx (DImode);
30701
30702 if (fcode == IX86_BUILTIN_RDPMC)
30703 {
30704 arg0 = CALL_EXPR_ARG (exp, 0);
30705 op2 = expand_normal (arg0);
30706 if (!register_operand (op2, SImode))
30707 op2 = copy_to_mode_reg (SImode, op2);
30708
30709 insn = (TARGET_64BIT
30710 ? gen_rdpmc_rex64 (op0, op1, op2)
30711 : gen_rdpmc (op0, op2));
30712 emit_insn (insn);
30713 }
30714 else if (fcode == IX86_BUILTIN_RDTSC)
30715 {
30716 insn = (TARGET_64BIT
30717 ? gen_rdtsc_rex64 (op0, op1)
30718 : gen_rdtsc (op0));
30719 emit_insn (insn);
30720 }
30721 else
30722 {
30723 op2 = gen_reg_rtx (SImode);
30724
30725 insn = (TARGET_64BIT
30726 ? gen_rdtscp_rex64 (op0, op1, op2)
30727 : gen_rdtscp (op0, op2));
30728 emit_insn (insn);
30729
30730 arg0 = CALL_EXPR_ARG (exp, 0);
30731 op4 = expand_normal (arg0);
30732 if (!address_operand (op4, VOIDmode))
30733 {
30734 op4 = convert_memory_address (Pmode, op4);
30735 op4 = copy_addr_to_reg (op4);
30736 }
30737 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
30738 }
30739
30740 if (target == 0)
30741 target = gen_reg_rtx (mode);
30742
30743 if (TARGET_64BIT)
30744 {
30745 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
30746 op1, 1, OPTAB_DIRECT);
30747 op0 = expand_simple_binop (DImode, IOR, op0, op1,
30748 op0, 1, OPTAB_DIRECT);
30749 }
30750
30751 emit_move_insn (target, op0);
30752 return target;
30753
30754 case IX86_BUILTIN_FXSAVE:
30755 case IX86_BUILTIN_FXRSTOR:
30756 case IX86_BUILTIN_FXSAVE64:
30757 case IX86_BUILTIN_FXRSTOR64:
30758 switch (fcode)
30759 {
30760 case IX86_BUILTIN_FXSAVE:
30761 icode = CODE_FOR_fxsave;
30762 break;
30763 case IX86_BUILTIN_FXRSTOR:
30764 icode = CODE_FOR_fxrstor;
30765 break;
30766 case IX86_BUILTIN_FXSAVE64:
30767 icode = CODE_FOR_fxsave64;
30768 break;
30769 case IX86_BUILTIN_FXRSTOR64:
30770 icode = CODE_FOR_fxrstor64;
30771 break;
30772 default:
30773 gcc_unreachable ();
30774 }
30775
30776 arg0 = CALL_EXPR_ARG (exp, 0);
30777 op0 = expand_normal (arg0);
30778
30779 if (!address_operand (op0, VOIDmode))
30780 {
30781 op0 = convert_memory_address (Pmode, op0);
30782 op0 = copy_addr_to_reg (op0);
30783 }
30784 op0 = gen_rtx_MEM (BLKmode, op0);
30785
30786 pat = GEN_FCN (icode) (op0);
30787 if (pat)
30788 emit_insn (pat);
30789 return 0;
30790
30791 case IX86_BUILTIN_XSAVE:
30792 case IX86_BUILTIN_XRSTOR:
30793 case IX86_BUILTIN_XSAVE64:
30794 case IX86_BUILTIN_XRSTOR64:
30795 case IX86_BUILTIN_XSAVEOPT:
30796 case IX86_BUILTIN_XSAVEOPT64:
30797 arg0 = CALL_EXPR_ARG (exp, 0);
30798 arg1 = CALL_EXPR_ARG (exp, 1);
30799 op0 = expand_normal (arg0);
30800 op1 = expand_normal (arg1);
30801
30802 if (!address_operand (op0, VOIDmode))
30803 {
30804 op0 = convert_memory_address (Pmode, op0);
30805 op0 = copy_addr_to_reg (op0);
30806 }
30807 op0 = gen_rtx_MEM (BLKmode, op0);
30808
30809 op1 = force_reg (DImode, op1);
30810
30811 if (TARGET_64BIT)
30812 {
30813 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
30814 NULL, 1, OPTAB_DIRECT);
30815 switch (fcode)
30816 {
30817 case IX86_BUILTIN_XSAVE:
30818 icode = CODE_FOR_xsave_rex64;
30819 break;
30820 case IX86_BUILTIN_XRSTOR:
30821 icode = CODE_FOR_xrstor_rex64;
30822 break;
30823 case IX86_BUILTIN_XSAVE64:
30824 icode = CODE_FOR_xsave64;
30825 break;
30826 case IX86_BUILTIN_XRSTOR64:
30827 icode = CODE_FOR_xrstor64;
30828 break;
30829 case IX86_BUILTIN_XSAVEOPT:
30830 icode = CODE_FOR_xsaveopt_rex64;
30831 break;
30832 case IX86_BUILTIN_XSAVEOPT64:
30833 icode = CODE_FOR_xsaveopt64;
30834 break;
30835 default:
30836 gcc_unreachable ();
30837 }
30838
30839 op2 = gen_lowpart (SImode, op2);
30840 op1 = gen_lowpart (SImode, op1);
30841 pat = GEN_FCN (icode) (op0, op1, op2);
30842 }
30843 else
30844 {
30845 switch (fcode)
30846 {
30847 case IX86_BUILTIN_XSAVE:
30848 icode = CODE_FOR_xsave;
30849 break;
30850 case IX86_BUILTIN_XRSTOR:
30851 icode = CODE_FOR_xrstor;
30852 break;
30853 case IX86_BUILTIN_XSAVEOPT:
30854 icode = CODE_FOR_xsaveopt;
30855 break;
30856 default:
30857 gcc_unreachable ();
30858 }
30859 pat = GEN_FCN (icode) (op0, op1);
30860 }
30861
30862 if (pat)
30863 emit_insn (pat);
30864 return 0;
30865
30866 case IX86_BUILTIN_LLWPCB:
30867 arg0 = CALL_EXPR_ARG (exp, 0);
30868 op0 = expand_normal (arg0);
30869 icode = CODE_FOR_lwp_llwpcb;
30870 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30871 {
30872 if (GET_MODE (op0) != Pmode)
30873 op0 = convert_to_mode (Pmode, op0, 1);
30874 op0 = force_reg (Pmode, op0);
30875 }
30876 emit_insn (gen_lwp_llwpcb (op0));
30877 return 0;
30878
30879 case IX86_BUILTIN_SLWPCB:
30880 icode = CODE_FOR_lwp_slwpcb;
30881 if (!target
30882 || !insn_data[icode].operand[0].predicate (target, Pmode))
30883 target = gen_reg_rtx (Pmode);
30884 emit_insn (gen_lwp_slwpcb (target));
30885 return target;
30886
30887 case IX86_BUILTIN_BEXTRI32:
30888 case IX86_BUILTIN_BEXTRI64:
30889 arg0 = CALL_EXPR_ARG (exp, 0);
30890 arg1 = CALL_EXPR_ARG (exp, 1);
30891 op0 = expand_normal (arg0);
30892 op1 = expand_normal (arg1);
30893 icode = (fcode == IX86_BUILTIN_BEXTRI32
30894 ? CODE_FOR_tbm_bextri_si
30895 : CODE_FOR_tbm_bextri_di);
30896 if (!CONST_INT_P (op1))
30897 {
30898 error ("last argument must be an immediate");
30899 return const0_rtx;
30900 }
30901 else
30902 {
30903 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30904 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30905 op1 = GEN_INT (length);
30906 op2 = GEN_INT (lsb_index);
30907 pat = GEN_FCN (icode) (target, op0, op1, op2);
30908 if (pat)
30909 emit_insn (pat);
30910 return target;
30911 }
30912
30913 case IX86_BUILTIN_RDRAND16_STEP:
30914 icode = CODE_FOR_rdrandhi_1;
30915 mode0 = HImode;
30916 goto rdrand_step;
30917
30918 case IX86_BUILTIN_RDRAND32_STEP:
30919 icode = CODE_FOR_rdrandsi_1;
30920 mode0 = SImode;
30921 goto rdrand_step;
30922
30923 case IX86_BUILTIN_RDRAND64_STEP:
30924 icode = CODE_FOR_rdranddi_1;
30925 mode0 = DImode;
30926
30927 rdrand_step:
30928 op0 = gen_reg_rtx (mode0);
30929 emit_insn (GEN_FCN (icode) (op0));
30930
30931 arg0 = CALL_EXPR_ARG (exp, 0);
30932 op1 = expand_normal (arg0);
30933 if (!address_operand (op1, VOIDmode))
30934 {
30935 op1 = convert_memory_address (Pmode, op1);
30936 op1 = copy_addr_to_reg (op1);
30937 }
30938 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30939
30940 op1 = gen_reg_rtx (SImode);
30941 emit_move_insn (op1, CONST1_RTX (SImode));
30942
30943 /* Emit SImode conditional move. */
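      /* rdrand clears its destination register when it fails (CF clear),
	 so selecting OP2 (the zero-extended result) when GEU holds yields
	 0 on failure, while OP1 supplies the 1 returned on success.  */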
30944 if (mode0 == HImode)
30945 {
30946 op2 = gen_reg_rtx (SImode);
30947 emit_insn (gen_zero_extendhisi2 (op2, op0));
30948 }
30949 else if (mode0 == SImode)
30950 op2 = op0;
30951 else
30952 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30953
30954 if (target == 0)
30955 target = gen_reg_rtx (SImode);
30956
30957 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30958 const0_rtx);
30959 emit_insn (gen_rtx_SET (VOIDmode, target,
30960 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30961 return target;
30962
30963 case IX86_BUILTIN_RDSEED16_STEP:
30964 icode = CODE_FOR_rdseedhi_1;
30965 mode0 = HImode;
30966 goto rdseed_step;
30967
30968 case IX86_BUILTIN_RDSEED32_STEP:
30969 icode = CODE_FOR_rdseedsi_1;
30970 mode0 = SImode;
30971 goto rdseed_step;
30972
30973 case IX86_BUILTIN_RDSEED64_STEP:
30974 icode = CODE_FOR_rdseeddi_1;
30975 mode0 = DImode;
30976
30977 rdseed_step:
30978 op0 = gen_reg_rtx (mode0);
30979 emit_insn (GEN_FCN (icode) (op0));
30980
30981 arg0 = CALL_EXPR_ARG (exp, 0);
30982 op1 = expand_normal (arg0);
30983 if (!address_operand (op1, VOIDmode))
30984 {
30985 op1 = convert_memory_address (Pmode, op1);
30986 op1 = copy_addr_to_reg (op1);
30987 }
30988 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30989
30990 op2 = gen_reg_rtx (QImode);
30991
30992 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
30993 const0_rtx);
30994 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
30995
30996 if (target == 0)
30997 target = gen_reg_rtx (SImode);
30998
30999 emit_insn (gen_zero_extendqisi2 (target, op2));
31000 return target;
31001
31002 case IX86_BUILTIN_ADDCARRYX32:
31003 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31004 mode0 = SImode;
31005 goto addcarryx;
31006
31007 case IX86_BUILTIN_ADDCARRYX64:
31008 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31009 mode0 = DImode;
31010
31011 addcarryx:
31012 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31013 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31014 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31015 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31016
31017 op0 = gen_reg_rtx (QImode);
31018
31019 /* Generate CF from input operand. */
31020 op1 = expand_normal (arg0);
31021 if (GET_MODE (op1) != QImode)
31022 op1 = convert_to_mode (QImode, op1, 1);
31023 op1 = copy_to_mode_reg (QImode, op1);
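      /* Descriptive note: adding 0xff (-1) to the carry-in byte sets CF
	 exactly when that byte is non-zero, turning the unsigned char c_in
	 argument into a carry flag for the add-with-carry below.  */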
31024 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
31025
31026 /* Generate the ADCX (or ADC) instruction to compute X+Y+CF. */
31027 op2 = expand_normal (arg1);
31028 op3 = expand_normal (arg2);
31029
31030 if (!REG_P (op2))
31031 op2 = copy_to_mode_reg (mode0, op2);
31032 if (!REG_P (op3))
31033 op3 = copy_to_mode_reg (mode0, op3);
31034
31035 op0 = gen_reg_rtx (mode0);
31036
31037 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
31038 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
31039 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
31040
31041 /* Store the result. */
31042 op4 = expand_normal (arg3);
31043 if (!address_operand (op4, VOIDmode))
31044 {
31045 op4 = convert_memory_address (Pmode, op4);
31046 op4 = copy_addr_to_reg (op4);
31047 }
31048 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
31049
31050 /* Return current CF value. */
31051 if (target == 0)
31052 target = gen_reg_rtx (QImode);
31053
31054 PUT_MODE (pat, QImode);
31055 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
31056 return target;
31057
31058 case IX86_BUILTIN_GATHERSIV2DF:
31059 icode = CODE_FOR_avx2_gathersiv2df;
31060 goto gather_gen;
31061 case IX86_BUILTIN_GATHERSIV4DF:
31062 icode = CODE_FOR_avx2_gathersiv4df;
31063 goto gather_gen;
31064 case IX86_BUILTIN_GATHERDIV2DF:
31065 icode = CODE_FOR_avx2_gatherdiv2df;
31066 goto gather_gen;
31067 case IX86_BUILTIN_GATHERDIV4DF:
31068 icode = CODE_FOR_avx2_gatherdiv4df;
31069 goto gather_gen;
31070 case IX86_BUILTIN_GATHERSIV4SF:
31071 icode = CODE_FOR_avx2_gathersiv4sf;
31072 goto gather_gen;
31073 case IX86_BUILTIN_GATHERSIV8SF:
31074 icode = CODE_FOR_avx2_gathersiv8sf;
31075 goto gather_gen;
31076 case IX86_BUILTIN_GATHERDIV4SF:
31077 icode = CODE_FOR_avx2_gatherdiv4sf;
31078 goto gather_gen;
31079 case IX86_BUILTIN_GATHERDIV8SF:
31080 icode = CODE_FOR_avx2_gatherdiv8sf;
31081 goto gather_gen;
31082 case IX86_BUILTIN_GATHERSIV2DI:
31083 icode = CODE_FOR_avx2_gathersiv2di;
31084 goto gather_gen;
31085 case IX86_BUILTIN_GATHERSIV4DI:
31086 icode = CODE_FOR_avx2_gathersiv4di;
31087 goto gather_gen;
31088 case IX86_BUILTIN_GATHERDIV2DI:
31089 icode = CODE_FOR_avx2_gatherdiv2di;
31090 goto gather_gen;
31091 case IX86_BUILTIN_GATHERDIV4DI:
31092 icode = CODE_FOR_avx2_gatherdiv4di;
31093 goto gather_gen;
31094 case IX86_BUILTIN_GATHERSIV4SI:
31095 icode = CODE_FOR_avx2_gathersiv4si;
31096 goto gather_gen;
31097 case IX86_BUILTIN_GATHERSIV8SI:
31098 icode = CODE_FOR_avx2_gathersiv8si;
31099 goto gather_gen;
31100 case IX86_BUILTIN_GATHERDIV4SI:
31101 icode = CODE_FOR_avx2_gatherdiv4si;
31102 goto gather_gen;
31103 case IX86_BUILTIN_GATHERDIV8SI:
31104 icode = CODE_FOR_avx2_gatherdiv8si;
31105 goto gather_gen;
31106 case IX86_BUILTIN_GATHERALTSIV4DF:
31107 icode = CODE_FOR_avx2_gathersiv4df;
31108 goto gather_gen;
31109 case IX86_BUILTIN_GATHERALTDIV8SF:
31110 icode = CODE_FOR_avx2_gatherdiv8sf;
31111 goto gather_gen;
31112 case IX86_BUILTIN_GATHERALTSIV4DI:
31113 icode = CODE_FOR_avx2_gathersiv4di;
31114 goto gather_gen;
31115 case IX86_BUILTIN_GATHERALTDIV8SI:
31116 icode = CODE_FOR_avx2_gatherdiv8si;
31117 goto gather_gen;
31118
31119 gather_gen:
31120 arg0 = CALL_EXPR_ARG (exp, 0);
31121 arg1 = CALL_EXPR_ARG (exp, 1);
31122 arg2 = CALL_EXPR_ARG (exp, 2);
31123 arg3 = CALL_EXPR_ARG (exp, 3);
31124 arg4 = CALL_EXPR_ARG (exp, 4);
31125 op0 = expand_normal (arg0);
31126 op1 = expand_normal (arg1);
31127 op2 = expand_normal (arg2);
31128 op3 = expand_normal (arg3);
31129 op4 = expand_normal (arg4);
31130 /* Note the arg order is different from the operand order. */
31131 mode0 = insn_data[icode].operand[1].mode;
31132 mode2 = insn_data[icode].operand[3].mode;
31133 mode3 = insn_data[icode].operand[4].mode;
31134 mode4 = insn_data[icode].operand[5].mode;
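      /* Descriptive note: arguments 0..4 (source/merge vector, base pointer,
	 index vector, mask vector, scale) correspond to insn operands 1..5;
	 operand 0 is the destination and the base pointer operand is always
	 Pmode, so only the remaining operand modes are recorded here.  */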
31135
31136 if (target == NULL_RTX
31137 || GET_MODE (target) != insn_data[icode].operand[0].mode)
31138 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
31139 else
31140 subtarget = target;
31141
31142 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
31143 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
31144 {
31145 rtx half = gen_reg_rtx (V4SImode);
31146 if (!nonimmediate_operand (op2, V8SImode))
31147 op2 = copy_to_mode_reg (V8SImode, op2);
31148 emit_insn (gen_vec_extract_lo_v8si (half, op2));
31149 op2 = half;
31150 }
31151 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
31152 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
31153 {
31154 rtx (*gen) (rtx, rtx);
31155 rtx half = gen_reg_rtx (mode0);
31156 if (mode0 == V4SFmode)
31157 gen = gen_vec_extract_lo_v8sf;
31158 else
31159 gen = gen_vec_extract_lo_v8si;
31160 if (!nonimmediate_operand (op0, GET_MODE (op0)))
31161 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
31162 emit_insn (gen (half, op0));
31163 op0 = half;
31164 if (!nonimmediate_operand (op3, GET_MODE (op3)))
31165 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
31166 emit_insn (gen (half, op3));
31167 op3 = half;
31168 }
31169
31170 /* Force the memory operand to use only a base register here. But we
31171 don't want to do that for the memory operands of other builtin
31172 functions. */
31173 if (GET_MODE (op1) != Pmode)
31174 op1 = convert_to_mode (Pmode, op1, 1);
31175 op1 = force_reg (Pmode, op1);
31176
31177 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31178 op0 = copy_to_mode_reg (mode0, op0);
31179 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
31180 op1 = copy_to_mode_reg (Pmode, op1);
31181 if (!insn_data[icode].operand[3].predicate (op2, mode2))
31182 op2 = copy_to_mode_reg (mode2, op2);
31183 if (!insn_data[icode].operand[4].predicate (op3, mode3))
31184 op3 = copy_to_mode_reg (mode3, op3);
31185 if (!insn_data[icode].operand[5].predicate (op4, mode4))
31186 {
31187 error ("last argument must be scale 1, 2, 4, 8");
31188 return const0_rtx;
31189 }
31190
31191 /* Optimize. If mask is known to have all high bits set,
31192 replace op0 with pc_rtx to signal that the instruction
31193 overwrites the whole destination and doesn't use its
31194 previous contents. */
31195 if (optimize)
31196 {
31197 if (TREE_CODE (arg3) == VECTOR_CST)
31198 {
31199 unsigned int negative = 0;
31200 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
31201 {
31202 tree cst = VECTOR_CST_ELT (arg3, i);
31203 if (TREE_CODE (cst) == INTEGER_CST
31204 && tree_int_cst_sign_bit (cst))
31205 negative++;
31206 else if (TREE_CODE (cst) == REAL_CST
31207 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
31208 negative++;
31209 }
31210 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
31211 op0 = pc_rtx;
31212 }
31213 else if (TREE_CODE (arg3) == SSA_NAME)
31214 {
31215 /* Also recognize when the mask is like:
31216 __v2df src = _mm_setzero_pd ();
31217 __v2df mask = _mm_cmpeq_pd (src, src);
31218 or
31219 __v8sf src = _mm256_setzero_ps ();
31220 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
31221 as that is a cheaper way to load all ones into
31222 a register than having to load a constant from
31223 memory. */
31224 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
31225 if (is_gimple_call (def_stmt))
31226 {
31227 tree fndecl = gimple_call_fndecl (def_stmt);
31228 if (fndecl
31229 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31230 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
31231 {
31232 case IX86_BUILTIN_CMPPD:
31233 case IX86_BUILTIN_CMPPS:
31234 case IX86_BUILTIN_CMPPD256:
31235 case IX86_BUILTIN_CMPPS256:
31236 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
31237 break;
31238 /* FALLTHRU */
31239 case IX86_BUILTIN_CMPEQPD:
31240 case IX86_BUILTIN_CMPEQPS:
31241 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
31242 && initializer_zerop (gimple_call_arg (def_stmt,
31243 1)))
31244 op0 = pc_rtx;
31245 break;
31246 default:
31247 break;
31248 }
31249 }
31250 }
31251 }
31252
31253 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
31254 if (! pat)
31255 return const0_rtx;
31256 emit_insn (pat);
31257
31258 if (fcode == IX86_BUILTIN_GATHERDIV8SF
31259 || fcode == IX86_BUILTIN_GATHERDIV8SI)
31260 {
31261 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
31262 ? V4SFmode : V4SImode;
31263 if (target == NULL_RTX)
31264 target = gen_reg_rtx (tmode);
31265 if (tmode == V4SFmode)
31266 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
31267 else
31268 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
31269 }
31270 else
31271 target = subtarget;
31272
31273 return target;
31274
31275 case IX86_BUILTIN_XABORT:
31276 icode = CODE_FOR_xabort;
31277 arg0 = CALL_EXPR_ARG (exp, 0);
31278 op0 = expand_normal (arg0);
31279 mode0 = insn_data[icode].operand[0].mode;
31280 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31281 {
31282 error ("the xabort's argument must be an 8-bit immediate");
31283 return const0_rtx;
31284 }
31285 emit_insn (gen_xabort (op0));
31286 return 0;
31287
31288 default:
31289 break;
31290 }
31291
31292 for (i = 0, d = bdesc_special_args;
31293 i < ARRAY_SIZE (bdesc_special_args);
31294 i++, d++)
31295 if (d->code == fcode)
31296 return ix86_expand_special_args_builtin (d, exp, target);
31297
31298 for (i = 0, d = bdesc_args;
31299 i < ARRAY_SIZE (bdesc_args);
31300 i++, d++)
31301 if (d->code == fcode)
31302 switch (fcode)
31303 {
31304 case IX86_BUILTIN_FABSQ:
31305 case IX86_BUILTIN_COPYSIGNQ:
31306 if (!TARGET_SSE)
31307 /* Emit a normal call if SSE isn't available. */
31308 return expand_call (exp, target, ignore);
31309 default:
31310 return ix86_expand_args_builtin (d, exp, target);
31311 }
31312
31313 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31314 if (d->code == fcode)
31315 return ix86_expand_sse_comi (d, exp, target);
31316
31317 for (i = 0, d = bdesc_pcmpestr;
31318 i < ARRAY_SIZE (bdesc_pcmpestr);
31319 i++, d++)
31320 if (d->code == fcode)
31321 return ix86_expand_sse_pcmpestr (d, exp, target);
31322
31323 for (i = 0, d = bdesc_pcmpistr;
31324 i < ARRAY_SIZE (bdesc_pcmpistr);
31325 i++, d++)
31326 if (d->code == fcode)
31327 return ix86_expand_sse_pcmpistr (d, exp, target);
31328
31329 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31330 if (d->code == fcode)
31331 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
31332 (enum ix86_builtin_func_type)
31333 d->flag, d->comparison);
31334
31335 gcc_unreachable ();
31336 }
31337
31338 /* Returns a function decl for a vectorized version of the builtin function
31339 FNDECL with output vector type TYPE_OUT and input vector type TYPE_IN,
31340 or NULL_TREE if such a version is not available. */
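/* For instance (illustrative), when the vectorizer asks for a V2DF variant
   of the scalar __builtin_sqrt, the switch below returns the decl stored in
   ix86_builtins[IX86_BUILTIN_SQRTPD].  */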
31341
31342 static tree
31343 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
31344 tree type_in)
31345 {
31346 enum machine_mode in_mode, out_mode;
31347 int in_n, out_n;
31348 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
31349
31350 if (TREE_CODE (type_out) != VECTOR_TYPE
31351 || TREE_CODE (type_in) != VECTOR_TYPE
31352 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
31353 return NULL_TREE;
31354
31355 out_mode = TYPE_MODE (TREE_TYPE (type_out));
31356 out_n = TYPE_VECTOR_SUBPARTS (type_out);
31357 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31358 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31359
31360 switch (fn)
31361 {
31362 case BUILT_IN_SQRT:
31363 if (out_mode == DFmode && in_mode == DFmode)
31364 {
31365 if (out_n == 2 && in_n == 2)
31366 return ix86_builtins[IX86_BUILTIN_SQRTPD];
31367 else if (out_n == 4 && in_n == 4)
31368 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
31369 }
31370 break;
31371
31372 case BUILT_IN_SQRTF:
31373 if (out_mode == SFmode && in_mode == SFmode)
31374 {
31375 if (out_n == 4 && in_n == 4)
31376 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
31377 else if (out_n == 8 && in_n == 8)
31378 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
31379 }
31380 break;
31381
31382 case BUILT_IN_IFLOOR:
31383 case BUILT_IN_LFLOOR:
31384 case BUILT_IN_LLFLOOR:
31385 /* The round insn does not trap on denormals. */
31386 if (flag_trapping_math || !TARGET_ROUND)
31387 break;
31388
31389 if (out_mode == SImode && in_mode == DFmode)
31390 {
31391 if (out_n == 4 && in_n == 2)
31392 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
31393 else if (out_n == 8 && in_n == 4)
31394 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
31395 }
31396 break;
31397
31398 case BUILT_IN_IFLOORF:
31399 case BUILT_IN_LFLOORF:
31400 case BUILT_IN_LLFLOORF:
31401 /* The round insn does not trap on denormals. */
31402 if (flag_trapping_math || !TARGET_ROUND)
31403 break;
31404
31405 if (out_mode == SImode && in_mode == SFmode)
31406 {
31407 if (out_n == 4 && in_n == 4)
31408 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
31409 else if (out_n == 8 && in_n == 8)
31410 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
31411 }
31412 break;
31413
31414 case BUILT_IN_ICEIL:
31415 case BUILT_IN_LCEIL:
31416 case BUILT_IN_LLCEIL:
31417 /* The round insn does not trap on denormals. */
31418 if (flag_trapping_math || !TARGET_ROUND)
31419 break;
31420
31421 if (out_mode == SImode && in_mode == DFmode)
31422 {
31423 if (out_n == 4 && in_n == 2)
31424 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
31425 else if (out_n == 8 && in_n == 4)
31426 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
31427 }
31428 break;
31429
31430 case BUILT_IN_ICEILF:
31431 case BUILT_IN_LCEILF:
31432 case BUILT_IN_LLCEILF:
31433 /* The round insn does not trap on denormals. */
31434 if (flag_trapping_math || !TARGET_ROUND)
31435 break;
31436
31437 if (out_mode == SImode && in_mode == SFmode)
31438 {
31439 if (out_n == 4 && in_n == 4)
31440 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
31441 else if (out_n == 8 && in_n == 8)
31442 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
31443 }
31444 break;
31445
31446 case BUILT_IN_IRINT:
31447 case BUILT_IN_LRINT:
31448 case BUILT_IN_LLRINT:
31449 if (out_mode == SImode && in_mode == DFmode)
31450 {
31451 if (out_n == 4 && in_n == 2)
31452 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
31453 else if (out_n == 8 && in_n == 4)
31454 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
31455 }
31456 break;
31457
31458 case BUILT_IN_IRINTF:
31459 case BUILT_IN_LRINTF:
31460 case BUILT_IN_LLRINTF:
31461 if (out_mode == SImode && in_mode == SFmode)
31462 {
31463 if (out_n == 4 && in_n == 4)
31464 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
31465 else if (out_n == 8 && in_n == 8)
31466 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
31467 }
31468 break;
31469
31470 case BUILT_IN_IROUND:
31471 case BUILT_IN_LROUND:
31472 case BUILT_IN_LLROUND:
31473 /* The round insn does not trap on denormals. */
31474 if (flag_trapping_math || !TARGET_ROUND)
31475 break;
31476
31477 if (out_mode == SImode && in_mode == DFmode)
31478 {
31479 if (out_n == 4 && in_n == 2)
31480 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
31481 else if (out_n == 8 && in_n == 4)
31482 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
31483 }
31484 break;
31485
31486 case BUILT_IN_IROUNDF:
31487 case BUILT_IN_LROUNDF:
31488 case BUILT_IN_LLROUNDF:
31489 /* The round insn does not trap on denormals. */
31490 if (flag_trapping_math || !TARGET_ROUND)
31491 break;
31492
31493 if (out_mode == SImode && in_mode == SFmode)
31494 {
31495 if (out_n == 4 && in_n == 4)
31496 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
31497 else if (out_n == 8 && in_n == 8)
31498 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
31499 }
31500 break;
31501
31502 case BUILT_IN_COPYSIGN:
31503 if (out_mode == DFmode && in_mode == DFmode)
31504 {
31505 if (out_n == 2 && in_n == 2)
31506 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
31507 else if (out_n == 4 && in_n == 4)
31508 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
31509 }
31510 break;
31511
31512 case BUILT_IN_COPYSIGNF:
31513 if (out_mode == SFmode && in_mode == SFmode)
31514 {
31515 if (out_n == 4 && in_n == 4)
31516 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
31517 else if (out_n == 8 && in_n == 8)
31518 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
31519 }
31520 break;
31521
31522 case BUILT_IN_FLOOR:
31523 /* The round insn does not trap on denormals. */
31524 if (flag_trapping_math || !TARGET_ROUND)
31525 break;
31526
31527 if (out_mode == DFmode && in_mode == DFmode)
31528 {
31529 if (out_n == 2 && in_n == 2)
31530 return ix86_builtins[IX86_BUILTIN_FLOORPD];
31531 else if (out_n == 4 && in_n == 4)
31532 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
31533 }
31534 break;
31535
31536 case BUILT_IN_FLOORF:
31537 /* The round insn does not trap on denormals. */
31538 if (flag_trapping_math || !TARGET_ROUND)
31539 break;
31540
31541 if (out_mode == SFmode && in_mode == SFmode)
31542 {
31543 if (out_n == 4 && in_n == 4)
31544 return ix86_builtins[IX86_BUILTIN_FLOORPS];
31545 else if (out_n == 8 && in_n == 8)
31546 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
31547 }
31548 break;
31549
31550 case BUILT_IN_CEIL:
31551 /* The round insn does not trap on denormals. */
31552 if (flag_trapping_math || !TARGET_ROUND)
31553 break;
31554
31555 if (out_mode == DFmode && in_mode == DFmode)
31556 {
31557 if (out_n == 2 && in_n == 2)
31558 return ix86_builtins[IX86_BUILTIN_CEILPD];
31559 else if (out_n == 4 && in_n == 4)
31560 return ix86_builtins[IX86_BUILTIN_CEILPD256];
31561 }
31562 break;
31563
31564 case BUILT_IN_CEILF:
31565 /* The round insn does not trap on denormals. */
31566 if (flag_trapping_math || !TARGET_ROUND)
31567 break;
31568
31569 if (out_mode == SFmode && in_mode == SFmode)
31570 {
31571 if (out_n == 4 && in_n == 4)
31572 return ix86_builtins[IX86_BUILTIN_CEILPS];
31573 else if (out_n == 8 && in_n == 8)
31574 return ix86_builtins[IX86_BUILTIN_CEILPS256];
31575 }
31576 break;
31577
31578 case BUILT_IN_TRUNC:
31579 /* The round insn does not trap on denormals. */
31580 if (flag_trapping_math || !TARGET_ROUND)
31581 break;
31582
31583 if (out_mode == DFmode && in_mode == DFmode)
31584 {
31585 if (out_n == 2 && in_n == 2)
31586 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
31587 else if (out_n == 4 && in_n == 4)
31588 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
31589 }
31590 break;
31591
31592 case BUILT_IN_TRUNCF:
31593 /* The round insn does not trap on denormals. */
31594 if (flag_trapping_math || !TARGET_ROUND)
31595 break;
31596
31597 if (out_mode == SFmode && in_mode == SFmode)
31598 {
31599 if (out_n == 4 && in_n == 4)
31600 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
31601 else if (out_n == 8 && in_n == 8)
31602 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
31603 }
31604 break;
31605
31606 case BUILT_IN_RINT:
31607 /* The round insn does not trap on denormals. */
31608 if (flag_trapping_math || !TARGET_ROUND)
31609 break;
31610
31611 if (out_mode == DFmode && in_mode == DFmode)
31612 {
31613 if (out_n == 2 && in_n == 2)
31614 return ix86_builtins[IX86_BUILTIN_RINTPD];
31615 else if (out_n == 4 && in_n == 4)
31616 return ix86_builtins[IX86_BUILTIN_RINTPD256];
31617 }
31618 break;
31619
31620 case BUILT_IN_RINTF:
31621 /* The round insn does not trap on denormals. */
31622 if (flag_trapping_math || !TARGET_ROUND)
31623 break;
31624
31625 if (out_mode == SFmode && in_mode == SFmode)
31626 {
31627 if (out_n == 4 && in_n == 4)
31628 return ix86_builtins[IX86_BUILTIN_RINTPS];
31629 else if (out_n == 8 && in_n == 8)
31630 return ix86_builtins[IX86_BUILTIN_RINTPS256];
31631 }
31632 break;
31633
31634 case BUILT_IN_ROUND:
31635 /* The round insn does not trap on denormals. */
31636 if (flag_trapping_math || !TARGET_ROUND)
31637 break;
31638
31639 if (out_mode == DFmode && in_mode == DFmode)
31640 {
31641 if (out_n == 2 && in_n == 2)
31642 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
31643 else if (out_n == 4 && in_n == 4)
31644 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
31645 }
31646 break;
31647
31648 case BUILT_IN_ROUNDF:
31649 /* The round insn does not trap on denormals. */
31650 if (flag_trapping_math || !TARGET_ROUND)
31651 break;
31652
31653 if (out_mode == SFmode && in_mode == SFmode)
31654 {
31655 if (out_n == 4 && in_n == 4)
31656 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
31657 else if (out_n == 8 && in_n == 8)
31658 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
31659 }
31660 break;
31661
31662 case BUILT_IN_FMA:
31663 if (out_mode == DFmode && in_mode == DFmode)
31664 {
31665 if (out_n == 2 && in_n == 2)
31666 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
31667 if (out_n == 4 && in_n == 4)
31668 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
31669 }
31670 break;
31671
31672 case BUILT_IN_FMAF:
31673 if (out_mode == SFmode && in_mode == SFmode)
31674 {
31675 if (out_n == 4 && in_n == 4)
31676 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
31677 if (out_n == 8 && in_n == 8)
31678 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
31679 }
31680 break;
31681
31682 default:
31683 break;
31684 }
31685
31686 /* Dispatch to a handler for a vectorization library. */
31687 if (ix86_veclib_handler)
31688 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
31689 type_in);
31690
31691 return NULL_TREE;
31692 }
31693
31694 /* Handler for an SVML-style interface to
31695 a library with vectorized intrinsics. */
31696
31697 static tree
31698 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
31699 {
31700 char name[20];
31701 tree fntype, new_fndecl, args;
31702 unsigned arity;
31703 const char *bname;
31704 enum machine_mode el_mode, in_mode;
31705 int n, in_n;
31706
31707 /* The SVML library is suitable for unsafe math only. */
31708 if (!flag_unsafe_math_optimizations)
31709 return NULL_TREE;
31710
31711 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31712 n = TYPE_VECTOR_SUBPARTS (type_out);
31713 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31714 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31715 if (el_mode != in_mode
31716 || n != in_n)
31717 return NULL_TREE;
31718
31719 switch (fn)
31720 {
31721 case BUILT_IN_EXP:
31722 case BUILT_IN_LOG:
31723 case BUILT_IN_LOG10:
31724 case BUILT_IN_POW:
31725 case BUILT_IN_TANH:
31726 case BUILT_IN_TAN:
31727 case BUILT_IN_ATAN:
31728 case BUILT_IN_ATAN2:
31729 case BUILT_IN_ATANH:
31730 case BUILT_IN_CBRT:
31731 case BUILT_IN_SINH:
31732 case BUILT_IN_SIN:
31733 case BUILT_IN_ASINH:
31734 case BUILT_IN_ASIN:
31735 case BUILT_IN_COSH:
31736 case BUILT_IN_COS:
31737 case BUILT_IN_ACOSH:
31738 case BUILT_IN_ACOS:
31739 if (el_mode != DFmode || n != 2)
31740 return NULL_TREE;
31741 break;
31742
31743 case BUILT_IN_EXPF:
31744 case BUILT_IN_LOGF:
31745 case BUILT_IN_LOG10F:
31746 case BUILT_IN_POWF:
31747 case BUILT_IN_TANHF:
31748 case BUILT_IN_TANF:
31749 case BUILT_IN_ATANF:
31750 case BUILT_IN_ATAN2F:
31751 case BUILT_IN_ATANHF:
31752 case BUILT_IN_CBRTF:
31753 case BUILT_IN_SINHF:
31754 case BUILT_IN_SINF:
31755 case BUILT_IN_ASINHF:
31756 case BUILT_IN_ASINF:
31757 case BUILT_IN_COSHF:
31758 case BUILT_IN_COSF:
31759 case BUILT_IN_ACOSHF:
31760 case BUILT_IN_ACOSF:
31761 if (el_mode != SFmode || n != 4)
31762 return NULL_TREE;
31763 break;
31764
31765 default:
31766 return NULL_TREE;
31767 }
31768
31769 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31770
31771 if (fn == BUILT_IN_LOGF)
31772 strcpy (name, "vmlsLn4");
31773 else if (fn == BUILT_IN_LOG)
31774 strcpy (name, "vmldLn2");
31775 else if (n == 4)
31776 {
31777 sprintf (name, "vmls%s", bname+10);
31778 name[strlen (name)-1] = '4';
31779 }
31780 else
31781 sprintf (name, "vmld%s2", bname+10);
31782
31783 /* Convert to uppercase. */
31784 name[4] &= ~0x20;
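  /* For example (illustrative): BUILT_IN_SINF with 4-wide vectors produces
     "vmlssin4" above, and masking name[4] turns it into the SVML entry
     point "vmlsSin4".  */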
31785
31786 arity = 0;
31787 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31788 args;
31789 args = TREE_CHAIN (args))
31790 arity++;
31791
31792 if (arity == 1)
31793 fntype = build_function_type_list (type_out, type_in, NULL);
31794 else
31795 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31796
31797 /* Build a function declaration for the vectorized function. */
31798 new_fndecl = build_decl (BUILTINS_LOCATION,
31799 FUNCTION_DECL, get_identifier (name), fntype);
31800 TREE_PUBLIC (new_fndecl) = 1;
31801 DECL_EXTERNAL (new_fndecl) = 1;
31802 DECL_IS_NOVOPS (new_fndecl) = 1;
31803 TREE_READONLY (new_fndecl) = 1;
31804
31805 return new_fndecl;
31806 }
31807
31808 /* Handler for an ACML-style interface to
31809 a library with vectorized intrinsics. */
31810
31811 static tree
31812 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
31813 {
31814 char name[20] = "__vr.._";
31815 tree fntype, new_fndecl, args;
31816 unsigned arity;
31817 const char *bname;
31818 enum machine_mode el_mode, in_mode;
31819 int n, in_n;
31820
31821 /* The ACML library is 64-bit only and suitable for unsafe math only,
31822 as it does not correctly support parts of IEEE arithmetic, such as
31823 denormals, with the required precision. */
31824 if (!TARGET_64BIT
31825 || !flag_unsafe_math_optimizations)
31826 return NULL_TREE;
31827
31828 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31829 n = TYPE_VECTOR_SUBPARTS (type_out);
31830 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31831 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31832 if (el_mode != in_mode
31833 || n != in_n)
31834 return NULL_TREE;
31835
31836 switch (fn)
31837 {
31838 case BUILT_IN_SIN:
31839 case BUILT_IN_COS:
31840 case BUILT_IN_EXP:
31841 case BUILT_IN_LOG:
31842 case BUILT_IN_LOG2:
31843 case BUILT_IN_LOG10:
31844 name[4] = 'd';
31845 name[5] = '2';
31846 if (el_mode != DFmode
31847 || n != 2)
31848 return NULL_TREE;
31849 break;
31850
31851 case BUILT_IN_SINF:
31852 case BUILT_IN_COSF:
31853 case BUILT_IN_EXPF:
31854 case BUILT_IN_POWF:
31855 case BUILT_IN_LOGF:
31856 case BUILT_IN_LOG2F:
31857 case BUILT_IN_LOG10F:
31858 name[4] = 's';
31859 name[5] = '4';
31860 if (el_mode != SFmode
31861 || n != 4)
31862 return NULL_TREE;
31863 break;
31864
31865 default:
31866 return NULL_TREE;
31867 }
31868
31869 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31870 sprintf (name + 7, "%s", bname+10);
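  /* For example (illustrative): BUILT_IN_SIN yields "__vrd2_sin" and
     BUILT_IN_LOG2F yields "__vrs4_log2f", matching the ACML naming scheme.  */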
31871
31872 arity = 0;
31873 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31874 args;
31875 args = TREE_CHAIN (args))
31876 arity++;
31877
31878 if (arity == 1)
31879 fntype = build_function_type_list (type_out, type_in, NULL);
31880 else
31881 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31882
31883 /* Build a function declaration for the vectorized function. */
31884 new_fndecl = build_decl (BUILTINS_LOCATION,
31885 FUNCTION_DECL, get_identifier (name), fntype);
31886 TREE_PUBLIC (new_fndecl) = 1;
31887 DECL_EXTERNAL (new_fndecl) = 1;
31888 DECL_IS_NOVOPS (new_fndecl) = 1;
31889 TREE_READONLY (new_fndecl) = 1;
31890
31891 return new_fndecl;
31892 }
31893
31894 /* Returns a decl of a function that implements a gather load with
31895 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
31896 Returns NULL_TREE if it is not available. */
31897
31898 static tree
31899 ix86_vectorize_builtin_gather (const_tree mem_vectype,
31900 const_tree index_type, int scale)
31901 {
31902 bool si;
31903 enum ix86_builtins code;
31904
31905 if (! TARGET_AVX2)
31906 return NULL_TREE;
31907
31908 if ((TREE_CODE (index_type) != INTEGER_TYPE
31909 && !POINTER_TYPE_P (index_type))
31910 || (TYPE_MODE (index_type) != SImode
31911 && TYPE_MODE (index_type) != DImode))
31912 return NULL_TREE;
31913
31914 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
31915 return NULL_TREE;
31916
31917 /* The v*gather* insns sign-extend the index to pointer mode. */
31918 if (TYPE_PRECISION (index_type) < POINTER_SIZE
31919 && TYPE_UNSIGNED (index_type))
31920 return NULL_TREE;
31921
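  /* Descriptive note: the AVX2 gather instructions only accept a scale of
     1, 2, 4 or 8, i.e. a power of two no larger than 8.  */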
31922 if (scale <= 0
31923 || scale > 8
31924 || (scale & (scale - 1)) != 0)
31925 return NULL_TREE;
31926
31927 si = TYPE_MODE (index_type) == SImode;
31928 switch (TYPE_MODE (mem_vectype))
31929 {
31930 case V2DFmode:
31931 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
31932 break;
31933 case V4DFmode:
31934 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
31935 break;
31936 case V2DImode:
31937 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
31938 break;
31939 case V4DImode:
31940 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
31941 break;
31942 case V4SFmode:
31943 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
31944 break;
31945 case V8SFmode:
31946 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
31947 break;
31948 case V4SImode:
31949 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
31950 break;
31951 case V8SImode:
31952 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
31953 break;
31954 default:
31955 return NULL_TREE;
31956 }
31957
31958 return ix86_builtins[code];
31959 }
31960
31961 /* Returns the decl of a target-specific builtin that implements the
31962 reciprocal of the function FN, or NULL_TREE if not available. */
31963
31964 static tree
31965 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31966 bool sqrt ATTRIBUTE_UNUSED)
31967 {
31968 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31969 && flag_finite_math_only && !flag_trapping_math
31970 && flag_unsafe_math_optimizations))
31971 return NULL_TREE;
31972
31973 if (md_fn)
31974 /* Machine dependent builtins. */
31975 switch (fn)
31976 {
31977 /* Vectorized version of sqrt to rsqrt conversion. */
31978 case IX86_BUILTIN_SQRTPS_NR:
31979 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31980
31981 case IX86_BUILTIN_SQRTPS_NR256:
31982 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31983
31984 default:
31985 return NULL_TREE;
31986 }
31987 else
31988 /* Normal builtins. */
31989 switch (fn)
31990 {
31991 /* Sqrt to rsqrt conversion. */
31992 case BUILT_IN_SQRTF:
31993 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31994
31995 default:
31996 return NULL_TREE;
31997 }
31998 }
31999 \f
32000 /* Helper for avx_vpermilps256_operand et al. This is also used by
32001 the expansion functions to turn the parallel back into a mask.
32002 The return value is 0 for no match and the imm8+1 for a match. */
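/* For example (illustrative): for V4SFmode the parallel (1 0 3 2) gives
   mask = 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1, so the function returns
   0xb2, i.e. the imm8 0xb1 plus one.  */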
32003
32004 int
32005 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32006 {
32007 unsigned i, nelt = GET_MODE_NUNITS (mode);
32008 unsigned mask = 0;
32009 unsigned char ipar[8];
32010
32011 if (XVECLEN (par, 0) != (int) nelt)
32012 return 0;
32013
32014 /* Validate that all of the elements are constants, and not totally
32015 out of range. Copy the data into an integral array to make the
32016 subsequent checks easier. */
32017 for (i = 0; i < nelt; ++i)
32018 {
32019 rtx er = XVECEXP (par, 0, i);
32020 unsigned HOST_WIDE_INT ei;
32021
32022 if (!CONST_INT_P (er))
32023 return 0;
32024 ei = INTVAL (er);
32025 if (ei >= nelt)
32026 return 0;
32027 ipar[i] = ei;
32028 }
32029
32030 switch (mode)
32031 {
32032 case V4DFmode:
32033 /* In the 256-bit DFmode case, we can only move elements within
32034 a 128-bit lane. */
32035 for (i = 0; i < 2; ++i)
32036 {
32037 if (ipar[i] >= 2)
32038 return 0;
32039 mask |= ipar[i] << i;
32040 }
32041 for (i = 2; i < 4; ++i)
32042 {
32043 if (ipar[i] < 2)
32044 return 0;
32045 mask |= (ipar[i] - 2) << i;
32046 }
32047 break;
32048
32049 case V8SFmode:
32050 /* In the 256-bit SFmode case, we have full freedom of movement
32051 within the low 128-bit lane, but the high 128-bit lane must
32052 mirror the exact same pattern. */
32053 for (i = 0; i < 4; ++i)
32054 if (ipar[i] + 4 != ipar[i + 4])
32055 return 0;
32056 nelt = 4;
32057 /* FALLTHRU */
32058
32059 case V2DFmode:
32060 case V4SFmode:
32061 /* In the 128-bit case, we have full freedom in the placement of
32062 the elements from the source operand. */
32063 for (i = 0; i < nelt; ++i)
32064 mask |= ipar[i] << (i * (nelt / 2));
32065 break;
32066
32067 default:
32068 gcc_unreachable ();
32069 }
32070
32071 /* Make sure success has a non-zero value by adding one. */
32072 return mask + 1;
32073 }
32074
32075 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
32076 the expansion functions to turn the parallel back into a mask.
32077 The return value is 0 for no match and the imm8+1 for a match. */
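/* For example (illustrative): for V8SFmode the parallel
   (4 5 6 7 12 13 14 15) selects the high half of each source operand,
   giving imm8 = 0x31 and a return value of 0x32.  */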
32078
32079 int
32080 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
32081 {
32082 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
32083 unsigned mask = 0;
32084 unsigned char ipar[8];
32085
32086 if (XVECLEN (par, 0) != (int) nelt)
32087 return 0;
32088
32089 /* Validate that all of the elements are constants, and not totally
32090 out of range. Copy the data into an integral array to make the
32091 subsequent checks easier. */
32092 for (i = 0; i < nelt; ++i)
32093 {
32094 rtx er = XVECEXP (par, 0, i);
32095 unsigned HOST_WIDE_INT ei;
32096
32097 if (!CONST_INT_P (er))
32098 return 0;
32099 ei = INTVAL (er);
32100 if (ei >= 2 * nelt)
32101 return 0;
32102 ipar[i] = ei;
32103 }
32104
32105 /* Validate that each half of the permute selects consecutive elements,
32105 i.e. forms a contiguous half of one source operand. */
32106 for (i = 0; i < nelt2 - 1; ++i)
32107 if (ipar[i] + 1 != ipar[i + 1])
32108 return 0;
32109 for (i = nelt2; i < nelt - 1; ++i)
32110 if (ipar[i] + 1 != ipar[i + 1])
32111 return 0;
32112
32113 /* Reconstruct the mask. */
32114 for (i = 0; i < 2; ++i)
32115 {
32116 unsigned e = ipar[i * nelt2];
32117 if (e % nelt2)
32118 return 0;
32119 e /= nelt2;
32120 mask |= e << (i * 4);
32121 }
32122
32123 /* Make sure success has a non-zero value by adding one. */
32124 return mask + 1;
32125 }
32126 \f
32127 /* Store OPERAND to memory after reload has completed. This means
32128 that we can't easily use assign_stack_local. */
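/* Descriptive note: when no red zone is available the value is pushed via a
   PRE_DEC of the stack pointer, so the slot must later be released with
   ix86_free_from_memory.  */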
32129 rtx
32130 ix86_force_to_memory (enum machine_mode mode, rtx operand)
32131 {
32132 rtx result;
32133
32134 gcc_assert (reload_completed);
32135 if (ix86_using_red_zone ())
32136 {
32137 result = gen_rtx_MEM (mode,
32138 gen_rtx_PLUS (Pmode,
32139 stack_pointer_rtx,
32140 GEN_INT (-RED_ZONE_SIZE)));
32141 emit_move_insn (result, operand);
32142 }
32143 else if (TARGET_64BIT)
32144 {
32145 switch (mode)
32146 {
32147 case HImode:
32148 case SImode:
32149 operand = gen_lowpart (DImode, operand);
32150 /* FALLTHRU */
32151 case DImode:
32152 emit_insn (
32153 gen_rtx_SET (VOIDmode,
32154 gen_rtx_MEM (DImode,
32155 gen_rtx_PRE_DEC (DImode,
32156 stack_pointer_rtx)),
32157 operand));
32158 break;
32159 default:
32160 gcc_unreachable ();
32161 }
32162 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32163 }
32164 else
32165 {
32166 switch (mode)
32167 {
32168 case DImode:
32169 {
32170 rtx operands[2];
32171 split_double_mode (mode, &operand, 1, operands, operands + 1);
32172 emit_insn (
32173 gen_rtx_SET (VOIDmode,
32174 gen_rtx_MEM (SImode,
32175 gen_rtx_PRE_DEC (Pmode,
32176 stack_pointer_rtx)),
32177 operands[1]));
32178 emit_insn (
32179 gen_rtx_SET (VOIDmode,
32180 gen_rtx_MEM (SImode,
32181 gen_rtx_PRE_DEC (Pmode,
32182 stack_pointer_rtx)),
32183 operands[0]));
32184 }
32185 break;
32186 case HImode:
32187 /* Store HImodes as SImodes. */
32188 operand = gen_lowpart (SImode, operand);
32189 /* FALLTHRU */
32190 case SImode:
32191 emit_insn (
32192 gen_rtx_SET (VOIDmode,
32193 gen_rtx_MEM (GET_MODE (operand),
32194 gen_rtx_PRE_DEC (SImode,
32195 stack_pointer_rtx)),
32196 operand));
32197 break;
32198 default:
32199 gcc_unreachable ();
32200 }
32201 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32202 }
32203 return result;
32204 }
32205
32206 /* Free the operand from memory, releasing the stack slot allocated by
32206 ix86_force_to_memory. */
32207 void
32208 ix86_free_from_memory (enum machine_mode mode)
32209 {
32210 if (!ix86_using_red_zone ())
32211 {
32212 int size;
32213
32214 if (mode == DImode || TARGET_64BIT)
32215 size = 8;
32216 else
32217 size = 4;
32218 /* Use LEA to deallocate stack space. In peephole2 it will be converted
32219 to a pop or add instruction if registers are available. */
32220 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
32221 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
32222 GEN_INT (size))));
32223 }
32224 }
32225
32226 /* Return true if we use LRA instead of reload pass. */
32227 static bool
32228 ix86_lra_p (void)
32229 {
32230 return true;
32231 }
32232
32233 /* Return a register priority for hard reg REGNO. */
32234 static int
32235 ix86_register_priority (int hard_regno)
32236 {
32237 /* ebp and r13 as the base always want a displacement, and r12 as the
32238 base always wants an index. So discourage their use in an
32239 address. */
32240 if (hard_regno == R12_REG || hard_regno == R13_REG)
32241 return 0;
32242 if (hard_regno == BP_REG)
32243 return 1;
32244 /* New x86-64 int registers result in bigger code size. Discourage
32245 them. */
32246 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
32247 return 2;
32248 /* New x86-64 SSE registers result in bigger code size. Discourage
32249 them. */
32250 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
32251 return 2;
32252 /* Usage of AX register results in smaller code. Prefer it. */
32253 if (hard_regno == 0)
32254 return 4;
32255 return 3;
32256 }
32257
32258 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
32259
32260 Put float CONST_DOUBLE in the constant pool instead of fp regs.
32261 QImode must go into class Q_REGS.
32262 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
32263 movdf to do mem-to-mem moves through integer regs. */
32264
32265 static reg_class_t
32266 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
32267 {
32268 enum machine_mode mode = GET_MODE (x);
32269
32270 /* We're only allowed to return a subclass of CLASS. Many of the
32271 following checks fail for NO_REGS, so eliminate that early. */
32272 if (regclass == NO_REGS)
32273 return NO_REGS;
32274
32275 /* All classes can load zeros. */
32276 if (x == CONST0_RTX (mode))
32277 return regclass;
32278
32279 /* Force constants into memory if we are loading a (nonzero) constant into
32280 an MMX or SSE register. This is because there are no MMX/SSE instructions
32281 to load from a constant. */
32282 if (CONSTANT_P (x)
32283 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
32284 return NO_REGS;
32285
32286 /* Prefer SSE regs only, if we can use them for math. */
32287 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
32288 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
32289
32290 /* Floating-point constants need more complex checks. */
32291 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
32292 {
32293 /* General regs can load everything. */
32294 if (reg_class_subset_p (regclass, GENERAL_REGS))
32295 return regclass;
32296
32297 /* Floats can load 0 and 1 plus some others. Note that we eliminated
32298 zero above. We only want to wind up preferring 80387 registers if
32299 we plan on doing computation with them. */
32300 if (TARGET_80387
32301 && standard_80387_constant_p (x) > 0)
32302 {
32303 /* Limit class to non-sse. */
32304 if (regclass == FLOAT_SSE_REGS)
32305 return FLOAT_REGS;
32306 if (regclass == FP_TOP_SSE_REGS)
32307 return FP_TOP_REG;
32308 if (regclass == FP_SECOND_SSE_REGS)
32309 return FP_SECOND_REG;
32310 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
32311 return regclass;
32312 }
32313
32314 return NO_REGS;
32315 }
32316
32317 /* Generally when we see PLUS here, it's the function invariant
32318 (plus soft-fp const_int), which can only be computed into general
32319 regs. */
32320 if (GET_CODE (x) == PLUS)
32321 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
32322
32323 /* QImode constants are easy to load, but non-constant QImode data
32324 must go into Q_REGS. */
32325 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
32326 {
32327 if (reg_class_subset_p (regclass, Q_REGS))
32328 return regclass;
32329 if (reg_class_subset_p (Q_REGS, regclass))
32330 return Q_REGS;
32331 return NO_REGS;
32332 }
32333
32334 return regclass;
32335 }
32336
32337 /* Discourage putting floating-point values in SSE registers unless
32338 SSE math is being used, and likewise for the 387 registers. */
32339 static reg_class_t
32340 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
32341 {
32342 enum machine_mode mode = GET_MODE (x);
32343
32344 /* Restrict the output reload class to the register bank that we are doing
32345 math on. If we would prefer not to return a subset of CLASS, reject this
32346 alternative: if reload cannot do this, it will still use its choice. */
32347 mode = GET_MODE (x);
32348 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
32349 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
32350
32351 if (X87_FLOAT_MODE_P (mode))
32352 {
32353 if (regclass == FP_TOP_SSE_REGS)
32354 return FP_TOP_REG;
32355 else if (regclass == FP_SECOND_SSE_REGS)
32356 return FP_SECOND_REG;
32357 else
32358 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
32359 }
32360
32361 return regclass;
32362 }
32363
32364 static reg_class_t
32365 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
32366 enum machine_mode mode, secondary_reload_info *sri)
32367 {
32368 /* Double-word spills from general registers to non-offsettable memory
32369 references (zero-extended addresses) require special handling. */
32370 if (TARGET_64BIT
32371 && MEM_P (x)
32372 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
32373 && rclass == GENERAL_REGS
32374 && !offsettable_memref_p (x))
32375 {
32376 sri->icode = (in_p
32377 ? CODE_FOR_reload_noff_load
32378 : CODE_FOR_reload_noff_store);
32379 /* Add the cost of moving address to a temporary. */
32380 sri->extra_cost = 1;
32381
32382 return NO_REGS;
32383 }
32384
32385 /* QImode spills from non-QI registers require an
32386 intermediate register on 32-bit targets. */
32387 if (!TARGET_64BIT
32388 && !in_p && mode == QImode
32389 && (rclass == GENERAL_REGS
32390 || rclass == LEGACY_REGS
32391 || rclass == NON_Q_REGS
32392 || rclass == SIREG
32393 || rclass == DIREG
32394 || rclass == INDEX_REGS))
32395 {
32396 int regno;
32397
32398 if (REG_P (x))
32399 regno = REGNO (x);
32400 else
32401 regno = -1;
32402
32403 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
32404 regno = true_regnum (x);
32405
32406 /* Return Q_REGS if the operand is in memory. */
32407 if (regno == -1)
32408 return Q_REGS;
32409 }
32410
32411 /* This condition handles the corner case where an expression involving
32412 pointers gets vectorized. We're trying to use the address of a
32413 stack slot as a vector initializer.
32414
32415 (set (reg:V2DI 74 [ vect_cst_.2 ])
32416 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
32417
32418 Eventually frame gets turned into sp+offset like this:
32419
32420 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32421 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32422 (const_int 392 [0x188]))))
32423
32424 That later gets turned into:
32425
32426 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32427 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32428 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
32429
32430 We'll have the following reload recorded:
32431
32432 Reload 0: reload_in (DI) =
32433 (plus:DI (reg/f:DI 7 sp)
32434 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
32435 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32436 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
32437 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
32438 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32439 reload_reg_rtx: (reg:V2DI 22 xmm1)
32440
32441 Which isn't going to work since SSE instructions can't handle scalar
32442 additions. Returning GENERAL_REGS forces the addition into integer
32443 register and reload can handle subsequent reloads without problems. */
32444
32445 if (in_p && GET_CODE (x) == PLUS
32446 && SSE_CLASS_P (rclass)
32447 && SCALAR_INT_MODE_P (mode))
32448 return GENERAL_REGS;
32449
32450 return NO_REGS;
32451 }
32452
32453 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
32454
32455 static bool
32456 ix86_class_likely_spilled_p (reg_class_t rclass)
32457 {
32458 switch (rclass)
32459 {
32460 case AREG:
32461 case DREG:
32462 case CREG:
32463 case BREG:
32464 case AD_REGS:
32465 case SIREG:
32466 case DIREG:
32467 case SSE_FIRST_REG:
32468 case FP_TOP_REG:
32469 case FP_SECOND_REG:
32470 return true;
32471
32472 default:
32473 break;
32474 }
32475
32476 return false;
32477 }
32478
32479 /* If we are copying between general and FP registers, we need a memory
32480 location. The same is true for SSE and MMX registers.
32481
32482 To optimize register_move_cost performance, allow inline variant.
32483
32484 The macro can't work reliably when one of the CLASSES is a class containing
32485 registers from multiple units (SSE, MMX, integer). We avoid this by never
32486 combining those units in a single alternative in the machine description.
32487 Ensure that this constraint holds to avoid unexpected surprises.
32488
32489 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
32490 enforce these sanity checks. */
32491
32492 static inline bool
32493 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32494 enum machine_mode mode, int strict)
32495 {
32496 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
32497 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
32498 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
32499 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
32500 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
32501 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
32502 {
32503 gcc_assert (!strict || lra_in_progress);
32504 return true;
32505 }
32506
32507 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
32508 return true;
32509
32510 /* ??? This is a lie. We do have moves between mmx/general, and between
32511 mmx/sse2. But by saying we need secondary memory we discourage the
32512 register allocator from using the mmx registers unless needed. */
32513 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
32514 return true;
32515
32516 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32517 {
32518 /* SSE1 doesn't have any direct moves from other classes. */
32519 if (!TARGET_SSE2)
32520 return true;
32521
32522 /* If the target says that inter-unit moves are more expensive
32523 than moving through memory, then don't generate them. */
32524 if (!TARGET_INTER_UNIT_MOVES)
32525 return true;
32526
32527 /* Between SSE and general, we have moves no larger than word size. */
32528 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32529 return true;
32530 }
32531
32532 return false;
32533 }
32534
32535 bool
32536 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32537 enum machine_mode mode, int strict)
32538 {
32539 return inline_secondary_memory_needed (class1, class2, mode, strict);
32540 }
32541
32542 /* Implement the TARGET_CLASS_MAX_NREGS hook.
32543
32544 On the 80386, this is the size of MODE in words,
32545 except in the FP regs, where a single reg is always enough. */
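/* For example (illustrative): XFmode needs 3 word-sized integer registers
   with 32-bit words (2 with 64-bit words), while for the FP/SSE/MMX classes
   a single register suffices (two for complex modes).  */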
32546
32547 static unsigned char
32548 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
32549 {
32550 if (MAYBE_INTEGER_CLASS_P (rclass))
32551 {
32552 if (mode == XFmode)
32553 return (TARGET_64BIT ? 2 : 3);
32554 else if (mode == XCmode)
32555 return (TARGET_64BIT ? 4 : 6);
32556 else
32557 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
32558 }
32559 else
32560 {
32561 if (COMPLEX_MODE_P (mode))
32562 return 2;
32563 else
32564 return 1;
32565 }
32566 }
32567
32568 /* Return true if the registers in CLASS cannot represent the change from
32569 modes FROM to TO. */
32570
32571 bool
32572 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
32573 enum reg_class regclass)
32574 {
32575 if (from == to)
32576 return false;
32577
32578 /* x87 registers can't do subreg at all, as all values are reformatted
32579 to extended precision. */
32580 if (MAYBE_FLOAT_CLASS_P (regclass))
32581 return true;
32582
32583 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
32584 {
32585 /* Vector registers do not support QI or HImode loads. If we don't
32586 disallow a change to these modes, reload will assume it's ok to
32587 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
32588 the vec_dupv4hi pattern. */
32589 if (GET_MODE_SIZE (from) < 4)
32590 return true;
32591
32592 /* Vector registers do not support subreg with nonzero offsets, which
32593 are otherwise valid for integer registers. Since we can't see
32594 whether we have a nonzero offset from here, prohibit all
32595 nonparadoxical subregs changing size. */
32596 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
32597 return true;
32598 }
32599
32600 return false;
32601 }
32602
32603 /* Return the cost of moving data of mode MODE between a
32604 register and memory. A value of 2 is the default; this cost is
32605 relative to those in `REGISTER_MOVE_COST'.
32606 
32607 This function is used extensively by register_move_cost, which is used to
32608 build tables at startup. Make it inline in this case.
32609 When IN is 2, return the maximum of the in and out move costs.
32610 
32611 If moving between registers and memory is more expensive than
32612 between two registers, you should define this macro to express the
32613 relative cost.
32614 
32615 Also model the increased cost of moving QImode registers in non
32616 Q_REGS classes.
32617 */
32618 static inline int
32619 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
32620 int in)
32621 {
32622 int cost;
32623 if (FLOAT_CLASS_P (regclass))
32624 {
32625 int index;
32626 switch (mode)
32627 {
32628 case SFmode:
32629 index = 0;
32630 break;
32631 case DFmode:
32632 index = 1;
32633 break;
32634 case XFmode:
32635 index = 2;
32636 break;
32637 default:
32638 return 100;
32639 }
32640 if (in == 2)
32641 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
32642 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
32643 }
32644 if (SSE_CLASS_P (regclass))
32645 {
32646 int index;
32647 switch (GET_MODE_SIZE (mode))
32648 {
32649 case 4:
32650 index = 0;
32651 break;
32652 case 8:
32653 index = 1;
32654 break;
32655 case 16:
32656 index = 2;
32657 break;
32658 default:
32659 return 100;
32660 }
32661 if (in == 2)
32662 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
32663 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
32664 }
32665 if (MMX_CLASS_P (regclass))
32666 {
32667 int index;
32668 switch (GET_MODE_SIZE (mode))
32669 {
32670 case 4:
32671 index = 0;
32672 break;
32673 case 8:
32674 index = 1;
32675 break;
32676 default:
32677 return 100;
32678 }
32679 if (in == 2)
32680 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
32681 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
32682 }
32683 switch (GET_MODE_SIZE (mode))
32684 {
32685 case 1:
32686 if (Q_CLASS_P (regclass) || TARGET_64BIT)
32687 {
32688 if (!in)
32689 return ix86_cost->int_store[0];
32690 if (TARGET_PARTIAL_REG_DEPENDENCY
32691 && optimize_function_for_speed_p (cfun))
32692 cost = ix86_cost->movzbl_load;
32693 else
32694 cost = ix86_cost->int_load[0];
32695 if (in == 2)
32696 return MAX (cost, ix86_cost->int_store[0]);
32697 return cost;
32698 }
32699 else
32700 {
32701 if (in == 2)
32702 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
32703 if (in)
32704 return ix86_cost->movzbl_load;
32705 else
32706 return ix86_cost->int_store[0] + 4;
32707 }
32708 break;
32709 case 2:
32710 if (in == 2)
32711 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
32712 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
32713 default:
32714 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
32715 if (mode == TFmode)
32716 mode = XFmode;
32717 if (in == 2)
32718 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
32719 else if (in)
32720 cost = ix86_cost->int_load[2];
32721 else
32722 cost = ix86_cost->int_store[2];
32723 return (cost * (((int) GET_MODE_SIZE (mode)
32724 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
32725 }
32726 }
32727
32728 static int
32729 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
32730 bool in)
32731 {
32732 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
32733 }
32734
32735
32736 /* Return the cost of moving data from a register in class CLASS1 to
32737 one in class CLASS2.
32738
32739 It is not required that the cost always equal 2 when FROM is the same as TO;
32740 on some machines it is expensive to move between registers if they are not
32741 general registers. */
32742
32743 static int
32744 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
32745 reg_class_t class2_i)
32746 {
32747 enum reg_class class1 = (enum reg_class) class1_i;
32748 enum reg_class class2 = (enum reg_class) class2_i;
32749
32750 /* In case we require secondary memory, compute cost of the store followed
32751 by load. In order to avoid bad register allocation choices, we need
32752 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
32753
32754 if (inline_secondary_memory_needed (class1, class2, mode, 0))
32755 {
32756 int cost = 1;
32757
32758 cost += inline_memory_move_cost (mode, class1, 2);
32759 cost += inline_memory_move_cost (mode, class2, 2);
32760
32761 /* When copying from a general purpose register we may emit multiple
32762 stores followed by a single load, causing a memory size mismatch stall.
32763 Count this as an arbitrarily high cost of 20. */
32764 if (targetm.class_max_nregs (class1, mode)
32765 > targetm.class_max_nregs (class2, mode))
32766 cost += 20;
32767
32768 /* In the case of FP/MMX moves, the registers actually overlap, and we
32769 have to switch modes in order to treat them differently. */
32770 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
32771 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
32772 cost += 20;
32773
32774 return cost;
32775 }
32776
32777 /* Moves between SSE/MMX and integer unit are expensive. */
32778 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
32779 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32780
32781 /* ??? By keeping the returned value relatively high, we limit the number
32782 of moves between integer and MMX/SSE registers for all targets.
32783 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
32784 where integer modes in MMX/SSE registers are not tieable
32785 because of missing QImode and HImode moves to, from or between
32786 MMX/SSE registers. */
32787 return MAX (8, ix86_cost->mmxsse_to_integer);
32788
32789 if (MAYBE_FLOAT_CLASS_P (class1))
32790 return ix86_cost->fp_move;
32791 if (MAYBE_SSE_CLASS_P (class1))
32792 return ix86_cost->sse_move;
32793 if (MAYBE_MMX_CLASS_P (class1))
32794 return ix86_cost->mmx_move;
32795 return 2;
32796 }
32797
32798 /* Return TRUE if hard register REGNO can hold a value of machine-mode
32799 MODE. */
32800
32801 bool
32802 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
32803 {
32804 /* The flags register, and only the flags register, can hold CCmode values. */
32805 if (CC_REGNO_P (regno))
32806 return GET_MODE_CLASS (mode) == MODE_CC;
32807 if (GET_MODE_CLASS (mode) == MODE_CC
32808 || GET_MODE_CLASS (mode) == MODE_RANDOM
32809 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
32810 return false;
32811 if (STACK_REGNO_P (regno))
32812 return VALID_FP_MODE_P (mode);
32813 if (SSE_REGNO_P (regno))
32814 {
32815 /* We implement the move patterns for all vector modes into and
32816 out of SSE registers, even when no operation instructions
32817 are available. OImode move is available only when AVX is
32818 enabled. */
32819 return ((TARGET_AVX && mode == OImode)
32820 || VALID_AVX256_REG_MODE (mode)
32821 || VALID_SSE_REG_MODE (mode)
32822 || VALID_SSE2_REG_MODE (mode)
32823 || VALID_MMX_REG_MODE (mode)
32824 || VALID_MMX_REG_MODE_3DNOW (mode));
32825 }
32826 if (MMX_REGNO_P (regno))
32827 {
32828 /* We implement the move patterns for 3DNOW modes even in MMX mode,
32829 so if the register is available at all, then we can move data of
32830 the given mode into or out of it. */
32831 return (VALID_MMX_REG_MODE (mode)
32832 || VALID_MMX_REG_MODE_3DNOW (mode));
32833 }
32834
32835 if (mode == QImode)
32836 {
32837 /* Take care with QImode values - they can live in non-QI regs,
32838 but then they may cause partial register stalls. */
32839 if (TARGET_64BIT || QI_REGNO_P (regno))
32840 return true;
32841 if (!TARGET_PARTIAL_REG_STALL)
32842 return true;
32843 return !can_create_pseudo_p ();
32844 }
32845 /* We handle both integer and floats in the general purpose registers. */
32846 else if (VALID_INT_MODE_P (mode))
32847 return true;
32848 else if (VALID_FP_MODE_P (mode))
32849 return true;
32850 else if (VALID_DFP_MODE_P (mode))
32851 return true;
32852 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
32853 on to use that value in smaller contexts, this can easily force a
32854 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
32855 supporting DImode, allow it. */
32856 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
32857 return true;
32858
32859 return false;
32860 }
32861
32862 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
32863 tieable integer mode. */
32864
32865 static bool
32866 ix86_tieable_integer_mode_p (enum machine_mode mode)
32867 {
32868 switch (mode)
32869 {
32870 case HImode:
32871 case SImode:
32872 return true;
32873
32874 case QImode:
32875 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
32876
32877 case DImode:
32878 return TARGET_64BIT;
32879
32880 default:
32881 return false;
32882 }
32883 }
32884
32885 /* Return true if MODE1 is accessible in a register that can hold MODE2
32886 without copying. That is, all register classes that can hold MODE2
32887 can also hold MODE1. */
32888
32889 bool
32890 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
32891 {
32892 if (mode1 == mode2)
32893 return true;
32894
32895 if (ix86_tieable_integer_mode_p (mode1)
32896 && ix86_tieable_integer_mode_p (mode2))
32897 return true;
32898
32899 /* MODE2 being XFmode implies fp stack or general regs, which means we
32900 can tie any smaller floating point modes to it. Note that we do not
32901 tie this with TFmode. */
32902 if (mode2 == XFmode)
32903 return mode1 == SFmode || mode1 == DFmode;
32904
32905 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
32906 that we can tie it with SFmode. */
32907 if (mode2 == DFmode)
32908 return mode1 == SFmode;
32909
32910 /* If MODE2 is only appropriate for an SSE register, then tie with
32911 any other mode acceptable to SSE registers. */
32912 if (GET_MODE_SIZE (mode2) == 32
32913 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32914 return (GET_MODE_SIZE (mode1) == 32
32915 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32916 if (GET_MODE_SIZE (mode2) == 16
32917 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32918 return (GET_MODE_SIZE (mode1) == 16
32919 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32920
32921 /* If MODE2 is appropriate for an MMX register, then tie
32922 with any other mode acceptable to MMX registers. */
32923 if (GET_MODE_SIZE (mode2) == 8
32924 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
32925 return (GET_MODE_SIZE (mode1) == 8
32926 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
32927
32928 return false;
32929 }
32930
32931 /* Return the cost of moving between two registers of mode MODE. */
32932
32933 static int
32934 ix86_set_reg_reg_cost (enum machine_mode mode)
32935 {
32936 unsigned int units = UNITS_PER_WORD;
32937
32938 switch (GET_MODE_CLASS (mode))
32939 {
32940 default:
32941 break;
32942
32943 case MODE_CC:
32944 units = GET_MODE_SIZE (CCmode);
32945 break;
32946
32947 case MODE_FLOAT:
32948 if ((TARGET_SSE && mode == TFmode)
32949 || (TARGET_80387 && mode == XFmode)
32950 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
32951 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
32952 units = GET_MODE_SIZE (mode);
32953 break;
32954
32955 case MODE_COMPLEX_FLOAT:
32956 if ((TARGET_SSE && mode == TCmode)
32957 || (TARGET_80387 && mode == XCmode)
32958 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
32959 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
32960 units = GET_MODE_SIZE (mode);
32961 break;
32962
32963 case MODE_VECTOR_INT:
32964 case MODE_VECTOR_FLOAT:
32965 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
32966 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
32967 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
32968 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
32969 units = GET_MODE_SIZE (mode);
32970 }
32971
32972 /* Return the cost of moving between two registers of mode MODE,
32973 assuming that the move will be in pieces of at most UNITS bytes. */
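/* A worked example with illustrative numbers: on a 32-bit target
UNITS_PER_WORD is 4, so a DImode general register copy has
GET_MODE_SIZE (DImode) == 8 and units == 4, giving
COSTS_N_INSNS ((8 + 4 - 1) / 4) == COSTS_N_INSNS (2), i.e. the move is
costed as two word-sized moves. */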
32974 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
32975 }
32976
32977 /* Compute a (partial) cost for rtx X. Return true if the complete
32978 cost has been computed, and false if subexpressions should be
32979 scanned. In either case, *TOTAL contains the cost result. */
32980
32981 static bool
32982 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
32983 bool speed)
32984 {
32985 enum rtx_code code = (enum rtx_code) code_i;
32986 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
32987 enum machine_mode mode = GET_MODE (x);
32988 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32989
32990 switch (code)
32991 {
32992 case SET:
32993 if (register_operand (SET_DEST (x), VOIDmode)
32994 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32995 {
32996 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
32997 return true;
32998 }
32999 return false;
33000
33001 case CONST_INT:
33002 case CONST:
33003 case LABEL_REF:
33004 case SYMBOL_REF:
33005 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33006 *total = 3;
33007 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33008 *total = 2;
33009 else if (flag_pic && SYMBOLIC_CONST (x)
33010 && (!TARGET_64BIT
33011 || (GET_CODE (x) != LABEL_REF
33012 && (GET_CODE (x) != SYMBOL_REF
33013 || !SYMBOL_REF_LOCAL_P (x)))))
33014 *total = 1;
33015 else
33016 *total = 0;
33017 return true;
33018
33019 case CONST_DOUBLE:
33020 if (mode == VOIDmode)
33021 {
33022 *total = 0;
33023 return true;
33024 }
33025 switch (standard_80387_constant_p (x))
33026 {
33027 case 1: /* 0.0 */
33028 *total = 1;
33029 return true;
33030 default: /* Other constants */
33031 *total = 2;
33032 return true;
33033 case 0:
33034 case -1:
33035 break;
33036 }
33037 if (SSE_FLOAT_MODE_P (mode))
33038 {
33039 case CONST_VECTOR:
33040 switch (standard_sse_constant_p (x))
33041 {
33042 case 0:
33043 break;
33044 case 1: /* 0: xor eliminates false dependency */
33045 *total = 0;
33046 return true;
33047 default: /* -1: cmp contains false dependency */
33048 *total = 1;
33049 return true;
33050 }
33051 }
33052 /* Fall back to (MEM (SYMBOL_REF)), since that's where
33053 it'll probably end up. Add a penalty for size. */
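/* For instance (illustrative only), a DFmode constant with -fpic on a
32-bit target is costed by the expression below as COSTS_N_INSNS (1)
plus 1 for the PIC load plus 1 for the DFmode size penalty. */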
33054 *total = (COSTS_N_INSNS (1)
33055 + (flag_pic != 0 && !TARGET_64BIT)
33056 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
33057 return true;
33058
33059 case ZERO_EXTEND:
33060 /* The zero extension is often completely free on x86_64, so make
33061 it as cheap as possible. */
33062 if (TARGET_64BIT && mode == DImode
33063 && GET_MODE (XEXP (x, 0)) == SImode)
33064 *total = 1;
33065 else if (TARGET_ZERO_EXTEND_WITH_AND)
33066 *total = cost->add;
33067 else
33068 *total = cost->movzx;
33069 return false;
33070
33071 case SIGN_EXTEND:
33072 *total = cost->movsx;
33073 return false;
33074
33075 case ASHIFT:
33076 if (SCALAR_INT_MODE_P (mode)
33077 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
33078 && CONST_INT_P (XEXP (x, 1)))
33079 {
33080 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33081 if (value == 1)
33082 {
33083 *total = cost->add;
33084 return false;
33085 }
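/* A left shift by 2 or 3 can instead be done with an address calculation,
e.g. `leal 0(,%eax,4), %eax' for a shift by 2 (a sketch of the idea,
not the emitted pattern), so prefer the LEA cost when it is no worse. */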
33086 if ((value == 2 || value == 3)
33087 && cost->lea <= cost->shift_const)
33088 {
33089 *total = cost->lea;
33090 return false;
33091 }
33092 }
33093 /* FALLTHRU */
33094
33095 case ROTATE:
33096 case ASHIFTRT:
33097 case LSHIFTRT:
33098 case ROTATERT:
33099 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33100 {
33101 /* ??? Should be SSE vector operation cost. */
33102 /* At least for published AMD latencies, this really is the same
33103 as the latency for a simple fpu operation like fabs. */
33104 /* V*QImode is emulated with 1-11 insns. */
33105 if (mode == V16QImode || mode == V32QImode)
33106 {
33107 int count = 11;
33108 if (TARGET_XOP && mode == V16QImode)
33109 {
33110 /* For XOP we use vpshab, which requires a broadcast of the
33111 value to the variable shift insn. For constants this
33112 means a V16QImode constant in memory; even when we can perform
33113 the shift with one insn, set the cost so as to prefer paddb. */
33114 if (CONSTANT_P (XEXP (x, 1)))
33115 {
33116 *total = (cost->fabs
33117 + rtx_cost (XEXP (x, 0), code, 0, speed)
33118 + (speed ? 2 : COSTS_N_BYTES (16)));
33119 return true;
33120 }
33121 count = 3;
33122 }
33123 else if (TARGET_SSSE3)
33124 count = 7;
33125 *total = cost->fabs * count;
33126 }
33127 else
33128 *total = cost->fabs;
33129 }
33130 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33131 {
33132 if (CONST_INT_P (XEXP (x, 1)))
33133 {
33134 if (INTVAL (XEXP (x, 1)) > 32)
33135 *total = cost->shift_const + COSTS_N_INSNS (2);
33136 else
33137 *total = cost->shift_const * 2;
33138 }
33139 else
33140 {
33141 if (GET_CODE (XEXP (x, 1)) == AND)
33142 *total = cost->shift_var * 2;
33143 else
33144 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
33145 }
33146 }
33147 else
33148 {
33149 if (CONST_INT_P (XEXP (x, 1)))
33150 *total = cost->shift_const;
33151 else
33152 *total = cost->shift_var;
33153 }
33154 return false;
33155
33156 case FMA:
33157 {
33158 rtx sub;
33159
33160 gcc_assert (FLOAT_MODE_P (mode));
33161 gcc_assert (TARGET_FMA || TARGET_FMA4);
33162
33163 /* ??? SSE scalar/vector cost should be used here. */
33164 /* ??? Bald assumption that fma has the same cost as fmul. */
33165 *total = cost->fmul;
33166 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
33167
33168 /* A negate in op0 or op2 is free: FMS, FNMA, FNMS. */
33169 sub = XEXP (x, 0);
33170 if (GET_CODE (sub) == NEG)
33171 sub = XEXP (sub, 0);
33172 *total += rtx_cost (sub, FMA, 0, speed);
33173
33174 sub = XEXP (x, 2);
33175 if (GET_CODE (sub) == NEG)
33176 sub = XEXP (sub, 0);
33177 *total += rtx_cost (sub, FMA, 2, speed);
33178 return true;
33179 }
33180
33181 case MULT:
33182 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33183 {
33184 /* ??? SSE scalar cost should be used here. */
33185 *total = cost->fmul;
33186 return false;
33187 }
33188 else if (X87_FLOAT_MODE_P (mode))
33189 {
33190 *total = cost->fmul;
33191 return false;
33192 }
33193 else if (FLOAT_MODE_P (mode))
33194 {
33195 /* ??? SSE vector cost should be used here. */
33196 *total = cost->fmul;
33197 return false;
33198 }
33199 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33200 {
33201 /* V*QImode is emulated with 7-13 insns. */
33202 if (mode == V16QImode || mode == V32QImode)
33203 {
33204 int extra = 11;
33205 if (TARGET_XOP && mode == V16QImode)
33206 extra = 5;
33207 else if (TARGET_SSSE3)
33208 extra = 6;
33209 *total = cost->fmul * 2 + cost->fabs * extra;
33210 }
33211 /* V*DImode is emulated with 5-8 insns. */
33212 else if (mode == V2DImode || mode == V4DImode)
33213 {
33214 if (TARGET_XOP && mode == V2DImode)
33215 *total = cost->fmul * 2 + cost->fabs * 3;
33216 else
33217 *total = cost->fmul * 3 + cost->fabs * 5;
33218 }
33219 /* Without SSE4.1, we don't have PMULLD; it's emulated with 7
33220 insns, including two PMULUDQ. */
33221 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
33222 *total = cost->fmul * 2 + cost->fabs * 5;
33223 else
33224 *total = cost->fmul;
33225 return false;
33226 }
33227 else
33228 {
33229 rtx op0 = XEXP (x, 0);
33230 rtx op1 = XEXP (x, 1);
33231 int nbits;
33232 if (CONST_INT_P (XEXP (x, 1)))
33233 {
33234 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33235 for (nbits = 0; value != 0; value &= value - 1)
33236 nbits++;
33237 }
33238 else
33239 /* This is arbitrary. */
33240 nbits = 7;
33241
33242 /* Compute costs correctly for widening multiplication. */
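/* E.g. (mult:DI (zero_extend:DI (reg:SI)) (zero_extend:DI (reg:SI)))
matches a single widening multiply, so it should be costed as an SImode
multiply rather than a DImode one (illustrative RTL). */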
33243 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
33244 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
33245 == GET_MODE_SIZE (mode))
33246 {
33247 int is_mulwiden = 0;
33248 enum machine_mode inner_mode = GET_MODE (op0);
33249
33250 if (GET_CODE (op0) == GET_CODE (op1))
33251 is_mulwiden = 1, op1 = XEXP (op1, 0);
33252 else if (CONST_INT_P (op1))
33253 {
33254 if (GET_CODE (op0) == SIGN_EXTEND)
33255 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
33256 == INTVAL (op1);
33257 else
33258 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
33259 }
33260
33261 if (is_mulwiden)
33262 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
33263 }
33264
33265 *total = (cost->mult_init[MODE_INDEX (mode)]
33266 + nbits * cost->mult_bit
33267 + rtx_cost (op0, outer_code, opno, speed)
33268 + rtx_cost (op1, outer_code, opno, speed));
33269
33270 return true;
33271 }
33272
33273 case DIV:
33274 case UDIV:
33275 case MOD:
33276 case UMOD:
33277 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33278 /* ??? SSE cost should be used here. */
33279 *total = cost->fdiv;
33280 else if (X87_FLOAT_MODE_P (mode))
33281 *total = cost->fdiv;
33282 else if (FLOAT_MODE_P (mode))
33283 /* ??? SSE vector cost should be used here. */
33284 *total = cost->fdiv;
33285 else
33286 *total = cost->divide[MODE_INDEX (mode)];
33287 return false;
33288
33289 case PLUS:
33290 if (GET_MODE_CLASS (mode) == MODE_INT
33291 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
33292 {
33293 if (GET_CODE (XEXP (x, 0)) == PLUS
33294 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
33295 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
33296 && CONSTANT_P (XEXP (x, 1)))
33297 {
33298 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
33299 if (val == 2 || val == 4 || val == 8)
33300 {
33301 *total = cost->lea;
33302 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33303 outer_code, opno, speed);
33304 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
33305 outer_code, opno, speed);
33306 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33307 return true;
33308 }
33309 }
33310 else if (GET_CODE (XEXP (x, 0)) == MULT
33311 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
33312 {
33313 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
33314 if (val == 2 || val == 4 || val == 8)
33315 {
33316 *total = cost->lea;
33317 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33318 outer_code, opno, speed);
33319 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33320 return true;
33321 }
33322 }
33323 else if (GET_CODE (XEXP (x, 0)) == PLUS)
33324 {
33325 *total = cost->lea;
33326 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33327 outer_code, opno, speed);
33328 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33329 outer_code, opno, speed);
33330 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33331 return true;
33332 }
33333 }
33334 /* FALLTHRU */
33335
33336 case MINUS:
33337 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33338 {
33339 /* ??? SSE cost should be used here. */
33340 *total = cost->fadd;
33341 return false;
33342 }
33343 else if (X87_FLOAT_MODE_P (mode))
33344 {
33345 *total = cost->fadd;
33346 return false;
33347 }
33348 else if (FLOAT_MODE_P (mode))
33349 {
33350 /* ??? SSE vector cost should be used here. */
33351 *total = cost->fadd;
33352 return false;
33353 }
33354 /* FALLTHRU */
33355
33356 case AND:
33357 case IOR:
33358 case XOR:
33359 if (GET_MODE_CLASS (mode) == MODE_INT
33360 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33361 {
33362 *total = (cost->add * 2
33363 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
33364 << (GET_MODE (XEXP (x, 0)) != DImode))
33365 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
33366 << (GET_MODE (XEXP (x, 1)) != DImode)));
33367 return true;
33368 }
33369 /* FALLTHRU */
33370
33371 case NEG:
33372 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33373 {
33374 /* ??? SSE cost should be used here. */
33375 *total = cost->fchs;
33376 return false;
33377 }
33378 else if (X87_FLOAT_MODE_P (mode))
33379 {
33380 *total = cost->fchs;
33381 return false;
33382 }
33383 else if (FLOAT_MODE_P (mode))
33384 {
33385 /* ??? SSE vector cost should be used here. */
33386 *total = cost->fchs;
33387 return false;
33388 }
33389 /* FALLTHRU */
33390
33391 case NOT:
33392 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33393 {
33394 /* ??? Should be SSE vector operation cost. */
33395 /* At least for published AMD latencies, this really is the same
33396 as the latency for a simple fpu operation like fabs. */
33397 *total = cost->fabs;
33398 }
33399 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33400 *total = cost->add * 2;
33401 else
33402 *total = cost->add;
33403 return false;
33404
33405 case COMPARE:
33406 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
33407 && XEXP (XEXP (x, 0), 1) == const1_rtx
33408 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
33409 && XEXP (x, 1) == const0_rtx)
33410 {
33411 /* This kind of construct is implemented using test[bwl].
33412 Treat it as if we had an AND. */
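/* For example, (compare (zero_extract (reg) (const_int 1) (const_int 5))
(const_int 0)) is a single-bit test, e.g. `testl $32, %eax'
(illustrative operands). */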
33413 *total = (cost->add
33414 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
33415 + rtx_cost (const1_rtx, outer_code, opno, speed));
33416 return true;
33417 }
33418 return false;
33419
33420 case FLOAT_EXTEND:
33421 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
33422 *total = 0;
33423 return false;
33424
33425 case ABS:
33426 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33427 /* ??? SSE cost should be used here. */
33428 *total = cost->fabs;
33429 else if (X87_FLOAT_MODE_P (mode))
33430 *total = cost->fabs;
33431 else if (FLOAT_MODE_P (mode))
33432 /* ??? SSE vector cost should be used here. */
33433 *total = cost->fabs;
33434 return false;
33435
33436 case SQRT:
33437 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33438 /* ??? SSE cost should be used here. */
33439 *total = cost->fsqrt;
33440 else if (X87_FLOAT_MODE_P (mode))
33441 *total = cost->fsqrt;
33442 else if (FLOAT_MODE_P (mode))
33443 /* ??? SSE vector cost should be used here. */
33444 *total = cost->fsqrt;
33445 return false;
33446
33447 case UNSPEC:
33448 if (XINT (x, 1) == UNSPEC_TP)
33449 *total = 0;
33450 return false;
33451
33452 case VEC_SELECT:
33453 case VEC_CONCAT:
33454 case VEC_MERGE:
33455 case VEC_DUPLICATE:
33456 /* ??? Assume all of these vector manipulation patterns are
33457 recognizable, in which case they all have pretty much the
33458 same cost. */
33459 *total = cost->fabs;
33460 return true;
33461
33462 default:
33463 return false;
33464 }
33465 }
33466
33467 #if TARGET_MACHO
33468
33469 static int current_machopic_label_num;
33470
33471 /* Given a symbol name and its associated stub, write out the
33472 definition of the stub. */
33473
33474 void
33475 machopic_output_stub (FILE *file, const char *symb, const char *stub)
33476 {
33477 unsigned int length;
33478 char *binder_name, *symbol_name, lazy_ptr_name[32];
33479 int label = ++current_machopic_label_num;
33480
33481 /* For 64-bit we shouldn't get here. */
33482 gcc_assert (!TARGET_64BIT);
33483
33484 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
33485 symb = targetm.strip_name_encoding (symb);
33486
33487 length = strlen (stub);
33488 binder_name = XALLOCAVEC (char, length + 32);
33489 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
33490
33491 length = strlen (symb);
33492 symbol_name = XALLOCAVEC (char, length + 32);
33493 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
33494
33495 sprintf (lazy_ptr_name, "L%d$lz", label);
33496
33497 if (MACHOPIC_ATT_STUB)
33498 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
33499 else if (MACHOPIC_PURE)
33500 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
33501 else
33502 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
33503
33504 fprintf (file, "%s:\n", stub);
33505 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33506
33507 if (MACHOPIC_ATT_STUB)
33508 {
33509 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
33510 }
33511 else if (MACHOPIC_PURE)
33512 {
33513 /* PIC stub. */
33514 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33515 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
33516 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
33517 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
33518 label, lazy_ptr_name, label);
33519 fprintf (file, "\tjmp\t*%%ecx\n");
33520 }
33521 else
33522 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
33523
33524 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
33525 it needs no stub-binding-helper. */
33526 if (MACHOPIC_ATT_STUB)
33527 return;
33528
33529 fprintf (file, "%s:\n", binder_name);
33530
33531 if (MACHOPIC_PURE)
33532 {
33533 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
33534 fprintf (file, "\tpushl\t%%ecx\n");
33535 }
33536 else
33537 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
33538
33539 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
33540
33541 /* N.B. Keep the correspondence of these
33542 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
33543 old-pic/new-pic/non-pic stubs; altering this will break
33544 compatibility with existing dylibs. */
33545 if (MACHOPIC_PURE)
33546 {
33547 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33548 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
33549 }
33550 else
33551 /* 16-byte -mdynamic-no-pic stub. */
33552 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
33553
33554 fprintf (file, "%s:\n", lazy_ptr_name);
33555 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33556 fprintf (file, ASM_LONG "%s\n", binder_name);
33557 }
33558 #endif /* TARGET_MACHO */
33559
33560 /* Order the registers for the register allocator. */
33561
33562 void
33563 x86_order_regs_for_local_alloc (void)
33564 {
33565 int pos = 0;
33566 int i;
33567
33568 /* First allocate the local general purpose registers. */
33569 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33570 if (GENERAL_REGNO_P (i) && call_used_regs[i])
33571 reg_alloc_order [pos++] = i;
33572
33573 /* Global general purpose registers. */
33574 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33575 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
33576 reg_alloc_order [pos++] = i;
33577
33578 /* x87 registers come first in case we are doing FP math
33579 using them. */
33580 if (!TARGET_SSE_MATH)
33581 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33582 reg_alloc_order [pos++] = i;
33583
33584 /* SSE registers. */
33585 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
33586 reg_alloc_order [pos++] = i;
33587 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
33588 reg_alloc_order [pos++] = i;
33589
33590 /* x87 registers. */
33591 if (TARGET_SSE_MATH)
33592 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33593 reg_alloc_order [pos++] = i;
33594
33595 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
33596 reg_alloc_order [pos++] = i;
33597
33598 /* Initialize the rest of the array, as we do not allocate some
33599 registers at all. */
33600 while (pos < FIRST_PSEUDO_REGISTER)
33601 reg_alloc_order [pos++] = 0;
33602 }
33603
33604 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
33605 in struct attribute_spec.handler. */
33606 static tree
33607 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
33608 tree args,
33609 int flags ATTRIBUTE_UNUSED,
33610 bool *no_add_attrs)
33611 {
33612 if (TREE_CODE (*node) != FUNCTION_TYPE
33613 && TREE_CODE (*node) != METHOD_TYPE
33614 && TREE_CODE (*node) != FIELD_DECL
33615 && TREE_CODE (*node) != TYPE_DECL)
33616 {
33617 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33618 name);
33619 *no_add_attrs = true;
33620 return NULL_TREE;
33621 }
33622 if (TARGET_64BIT)
33623 {
33624 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
33625 name);
33626 *no_add_attrs = true;
33627 return NULL_TREE;
33628 }
33629 if (is_attribute_p ("callee_pop_aggregate_return", name))
33630 {
33631 tree cst;
33632
33633 cst = TREE_VALUE (args);
33634 if (TREE_CODE (cst) != INTEGER_CST)
33635 {
33636 warning (OPT_Wattributes,
33637 "%qE attribute requires an integer constant argument",
33638 name);
33639 *no_add_attrs = true;
33640 }
33641 else if (compare_tree_int (cst, 0) != 0
33642 && compare_tree_int (cst, 1) != 0)
33643 {
33644 warning (OPT_Wattributes,
33645 "argument to %qE attribute is neither zero, nor one",
33646 name);
33647 *no_add_attrs = true;
33648 }
33649
33650 return NULL_TREE;
33651 }
33652
33653 return NULL_TREE;
33654 }
33655
33656 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
33657 struct attribute_spec.handler. */
33658 static tree
33659 ix86_handle_abi_attribute (tree *node, tree name,
33660 tree args ATTRIBUTE_UNUSED,
33661 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33662 {
33663 if (TREE_CODE (*node) != FUNCTION_TYPE
33664 && TREE_CODE (*node) != METHOD_TYPE
33665 && TREE_CODE (*node) != FIELD_DECL
33666 && TREE_CODE (*node) != TYPE_DECL)
33667 {
33668 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33669 name);
33670 *no_add_attrs = true;
33671 return NULL_TREE;
33672 }
33673
33674 /* Can combine regparm with all attributes but fastcall. */
33675 if (is_attribute_p ("ms_abi", name))
33676 {
33677 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
33678 {
33679 error ("ms_abi and sysv_abi attributes are not compatible");
33680 }
33681
33682 return NULL_TREE;
33683 }
33684 else if (is_attribute_p ("sysv_abi", name))
33685 {
33686 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
33687 {
33688 error ("ms_abi and sysv_abi attributes are not compatible");
33689 }
33690
33691 return NULL_TREE;
33692 }
33693
33694 return NULL_TREE;
33695 }
33696
33697 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
33698 struct attribute_spec.handler. */
33699 static tree
33700 ix86_handle_struct_attribute (tree *node, tree name,
33701 tree args ATTRIBUTE_UNUSED,
33702 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33703 {
33704 tree *type = NULL;
33705 if (DECL_P (*node))
33706 {
33707 if (TREE_CODE (*node) == TYPE_DECL)
33708 type = &TREE_TYPE (*node);
33709 }
33710 else
33711 type = node;
33712
33713 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
33714 {
33715 warning (OPT_Wattributes, "%qE attribute ignored",
33716 name);
33717 *no_add_attrs = true;
33718 }
33719
33720 else if ((is_attribute_p ("ms_struct", name)
33721 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
33722 || ((is_attribute_p ("gcc_struct", name)
33723 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
33724 {
33725 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
33726 name);
33727 *no_add_attrs = true;
33728 }
33729
33730 return NULL_TREE;
33731 }
33732
33733 static tree
33734 ix86_handle_fndecl_attribute (tree *node, tree name,
33735 tree args ATTRIBUTE_UNUSED,
33736 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33737 {
33738 if (TREE_CODE (*node) != FUNCTION_DECL)
33739 {
33740 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33741 name);
33742 *no_add_attrs = true;
33743 }
33744 return NULL_TREE;
33745 }
33746
33747 static bool
33748 ix86_ms_bitfield_layout_p (const_tree record_type)
33749 {
33750 return ((TARGET_MS_BITFIELD_LAYOUT
33751 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
33752 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
33753 }
33754
33755 /* Returns an expression indicating where the this parameter is
33756 located on entry to the FUNCTION. */
33757
33758 static rtx
33759 x86_this_parameter (tree function)
33760 {
33761 tree type = TREE_TYPE (function);
33762 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
33763 int nregs;
33764
33765 if (TARGET_64BIT)
33766 {
33767 const int *parm_regs;
33768
33769 if (ix86_function_type_abi (type) == MS_ABI)
33770 parm_regs = x86_64_ms_abi_int_parameter_registers;
33771 else
33772 parm_regs = x86_64_int_parameter_registers;
33773 return gen_rtx_REG (Pmode, parm_regs[aggr]);
33774 }
33775
33776 nregs = ix86_function_regparm (type, function);
33777
33778 if (nregs > 0 && !stdarg_p (type))
33779 {
33780 int regno;
33781 unsigned int ccvt = ix86_get_callcvt (type);
33782
33783 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
33784 regno = aggr ? DX_REG : CX_REG;
33785 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
33786 {
33787 regno = CX_REG;
33788 if (aggr)
33789 return gen_rtx_MEM (SImode,
33790 plus_constant (Pmode, stack_pointer_rtx, 4));
33791 }
33792 else
33793 {
33794 regno = AX_REG;
33795 if (aggr)
33796 {
33797 regno = DX_REG;
33798 if (nregs == 1)
33799 return gen_rtx_MEM (SImode,
33800 plus_constant (Pmode,
33801 stack_pointer_rtx, 4));
33802 }
33803 }
33804 return gen_rtx_REG (SImode, regno);
33805 }
33806
33807 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
33808 aggr ? 8 : 4));
33809 }
33810
33811 /* Determine whether x86_output_mi_thunk can succeed. */
33812
33813 static bool
33814 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
33815 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
33816 HOST_WIDE_INT vcall_offset, const_tree function)
33817 {
33818 /* 64-bit can handle anything. */
33819 if (TARGET_64BIT)
33820 return true;
33821
33822 /* For 32-bit, everything's fine if we have one free register. */
33823 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
33824 return true;
33825
33826 /* Need a free register for vcall_offset. */
33827 if (vcall_offset)
33828 return false;
33829
33830 /* Need a free register for GOT references. */
33831 if (flag_pic && !targetm.binds_local_p (function))
33832 return false;
33833
33834 /* Otherwise ok. */
33835 return true;
33836 }
33837
33838 /* Output the assembler code for a thunk function. THUNK_DECL is the
33839 declaration for the thunk function itself, FUNCTION is the decl for
33840 the target function. DELTA is an immediate constant offset to be
33841 added to THIS. If VCALL_OFFSET is nonzero, the word at
33842 *(*this + vcall_offset) should be added to THIS. */
33843
33844 static void
33845 x86_output_mi_thunk (FILE *file,
33846 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
33847 HOST_WIDE_INT vcall_offset, tree function)
33848 {
33849 rtx this_param = x86_this_parameter (function);
33850 rtx this_reg, tmp, fnaddr;
33851 unsigned int tmp_regno;
33852
33853 if (TARGET_64BIT)
33854 tmp_regno = R10_REG;
33855 else
33856 {
33857 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
33858 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
33859 tmp_regno = AX_REG;
33860 else
33861 tmp_regno = CX_REG;
33862 }
33863
33864 emit_note (NOTE_INSN_PROLOGUE_END);
33865
33866 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
33867 pull it in now and let DELTA benefit. */
33868 if (REG_P (this_param))
33869 this_reg = this_param;
33870 else if (vcall_offset)
33871 {
33872 /* Put the this parameter into %eax. */
33873 this_reg = gen_rtx_REG (Pmode, AX_REG);
33874 emit_move_insn (this_reg, this_param);
33875 }
33876 else
33877 this_reg = NULL_RTX;
33878
33879 /* Adjust the this parameter by a fixed constant. */
33880 if (delta)
33881 {
33882 rtx delta_rtx = GEN_INT (delta);
33883 rtx delta_dst = this_reg ? this_reg : this_param;
33884
33885 if (TARGET_64BIT)
33886 {
33887 if (!x86_64_general_operand (delta_rtx, Pmode))
33888 {
33889 tmp = gen_rtx_REG (Pmode, tmp_regno);
33890 emit_move_insn (tmp, delta_rtx);
33891 delta_rtx = tmp;
33892 }
33893 }
33894
33895 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
33896 }
33897
33898 /* Adjust the this parameter by a value stored in the vtable. */
33899 if (vcall_offset)
33900 {
33901 rtx vcall_addr, vcall_mem, this_mem;
33902
33903 tmp = gen_rtx_REG (Pmode, tmp_regno);
33904
33905 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
33906 if (Pmode != ptr_mode)
33907 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
33908 emit_move_insn (tmp, this_mem);
33909
33910 /* Adjust the this parameter. */
33911 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
33912 if (TARGET_64BIT
33913 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
33914 {
33915 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
33916 emit_move_insn (tmp2, GEN_INT (vcall_offset));
33917 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
33918 }
33919
33920 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
33921 if (Pmode != ptr_mode)
33922 emit_insn (gen_addsi_1_zext (this_reg,
33923 gen_rtx_REG (ptr_mode,
33924 REGNO (this_reg)),
33925 vcall_mem));
33926 else
33927 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
33928 }
33929
33930 /* If necessary, drop THIS back to its stack slot. */
33931 if (this_reg && this_reg != this_param)
33932 emit_move_insn (this_param, this_reg);
33933
33934 fnaddr = XEXP (DECL_RTL (function), 0);
33935 if (TARGET_64BIT)
33936 {
33937 if (!flag_pic || targetm.binds_local_p (function)
33938 || cfun->machine->call_abi == MS_ABI)
33939 ;
33940 else
33941 {
33942 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
33943 tmp = gen_rtx_CONST (Pmode, tmp);
33944 fnaddr = gen_rtx_MEM (Pmode, tmp);
33945 }
33946 }
33947 else
33948 {
33949 if (!flag_pic || targetm.binds_local_p (function))
33950 ;
33951 #if TARGET_MACHO
33952 else if (TARGET_MACHO)
33953 {
33954 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
33955 fnaddr = XEXP (fnaddr, 0);
33956 }
33957 #endif /* TARGET_MACHO */
33958 else
33959 {
33960 tmp = gen_rtx_REG (Pmode, CX_REG);
33961 output_set_got (tmp, NULL_RTX);
33962
33963 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
33964 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
33965 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
33966 }
33967 }
33968
33969 /* Our sibling call patterns do not allow memories, because we have no
33970 predicate that can distinguish between frame and non-frame memory.
33971 For our purposes here, we can get away with (ab)using a jump pattern,
33972 because we're going to do no optimization. */
33973 if (MEM_P (fnaddr))
33974 emit_jump_insn (gen_indirect_jump (fnaddr));
33975 else
33976 {
33977 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
33978 fnaddr = legitimize_pic_address (fnaddr,
33979 gen_rtx_REG (Pmode, tmp_regno));
33980
33981 if (!sibcall_insn_operand (fnaddr, word_mode))
33982 {
33983 tmp = gen_rtx_REG (word_mode, tmp_regno);
33984 if (GET_MODE (fnaddr) != word_mode)
33985 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
33986 emit_move_insn (tmp, fnaddr);
33987 fnaddr = tmp;
33988 }
33989
33990 tmp = gen_rtx_MEM (QImode, fnaddr);
33991 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
33992 tmp = emit_call_insn (tmp);
33993 SIBLING_CALL_P (tmp) = 1;
33994 }
33995 emit_barrier ();
33996
33997 /* Emit just enough of rest_of_compilation to get the insns emitted.
33998 Note that use_thunk calls assemble_start_function et al. */
33999 tmp = get_insns ();
34000 shorten_branches (tmp);
34001 final_start_function (tmp, file, 1);
34002 final (tmp, file, 1);
34003 final_end_function ();
34004 }
34005
34006 static void
34007 x86_file_start (void)
34008 {
34009 default_file_start ();
34010 #if TARGET_MACHO
34011 darwin_file_start ();
34012 #endif
34013 if (X86_FILE_START_VERSION_DIRECTIVE)
34014 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34015 if (X86_FILE_START_FLTUSED)
34016 fputs ("\t.global\t__fltused\n", asm_out_file);
34017 if (ix86_asm_dialect == ASM_INTEL)
34018 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34019 }
34020
34021 int
34022 x86_field_alignment (tree field, int computed)
34023 {
34024 enum machine_mode mode;
34025 tree type = TREE_TYPE (field);
34026
34027 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34028 return computed;
34029 mode = TYPE_MODE (strip_array_types (type));
34030 if (mode == DFmode || mode == DCmode
34031 || GET_MODE_CLASS (mode) == MODE_INT
34032 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34033 return MIN (32, computed);
34034 return computed;
34035 }
34036
34037 /* Output assembler code to FILE to increment profiler label # LABELNO
34038 for profiling a function entry. */
34039 void
34040 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34041 {
34042 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34043 : MCOUNT_NAME);
34044
34045 if (TARGET_64BIT)
34046 {
34047 #ifndef NO_PROFILE_COUNTERS
34048 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
34049 #endif
34050
34051 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
34052 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
34053 else
34054 fprintf (file, "\tcall\t%s\n", mcount_name);
34055 }
34056 else if (flag_pic)
34057 {
34058 #ifndef NO_PROFILE_COUNTERS
34059 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
34060 LPREFIX, labelno);
34061 #endif
34062 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
34063 }
34064 else
34065 {
34066 #ifndef NO_PROFILE_COUNTERS
34067 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
34068 LPREFIX, labelno);
34069 #endif
34070 fprintf (file, "\tcall\t%s\n", mcount_name);
34071 }
34072 }
34073
34074 /* We don't have exact information about the insn sizes, but we may assume
34075 quite safely that we are informed about all 1 byte insns and memory
34076 address sizes. This is enough to eliminate unnecessary padding in
34077 99% of cases. */
34078
34079 static int
34080 min_insn_size (rtx insn)
34081 {
34082 int l = 0, len;
34083
34084 if (!INSN_P (insn) || !active_insn_p (insn))
34085 return 0;
34086
34087 /* Discard alignments we've emitted, and jump table data. */
34088 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
34089 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
34090 return 0;
34091 if (JUMP_TABLE_DATA_P (insn))
34092 return 0;
34093
34094 /* Important case - calls are always 5 bytes.
34095 It is common to have many calls in a row. */
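/* (A direct near call is opcode 0xE8 followed by a 32-bit displacement,
hence 5 bytes.) */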
34096 if (CALL_P (insn)
34097 && symbolic_reference_mentioned_p (PATTERN (insn))
34098 && !SIBLING_CALL_P (insn))
34099 return 5;
34100 len = get_attr_length (insn);
34101 if (len <= 1)
34102 return 1;
34103
34104 /* For normal instructions we rely on get_attr_length being exact,
34105 with a few exceptions. */
34106 if (!JUMP_P (insn))
34107 {
34108 enum attr_type type = get_attr_type (insn);
34109
34110 switch (type)
34111 {
34112 case TYPE_MULTI:
34113 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
34114 || asm_noperands (PATTERN (insn)) >= 0)
34115 return 0;
34116 break;
34117 case TYPE_OTHER:
34118 case TYPE_FCMP:
34119 break;
34120 default:
34121 /* Otherwise trust get_attr_length. */
34122 return len;
34123 }
34124
34125 l = get_attr_length_address (insn);
34126 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
34127 l = 4;
34128 }
34129 if (l)
34130 return 1+l;
34131 else
34132 return 2;
34133 }
34134
34135 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34136
34137 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
34138 16 byte window. */
34139
34140 static void
34141 ix86_avoid_jump_mispredicts (void)
34142 {
34143 rtx insn, start = get_insns ();
34144 int nbytes = 0, njumps = 0;
34145 int isjump = 0;
34146
34147 /* Look for all minimal intervals of instructions containing 4 jumps.
34148 The intervals are bounded by START and INSN. NBYTES is the total
34149 size of the instructions in the interval, including INSN and not
34150 including START. When NBYTES is smaller than 16, it is possible
34151 that the ends of START and INSN land in the same 16 byte window.
34152
34153 The smallest offset in the window at which INSN can start is the case
34154 where START ends at offset 0. The offset of INSN is then
34155 NBYTES - sizeof (INSN). We add a p2align to the 16 byte window with
34156 maxskip 15 - NBYTES + sizeof (INSN). */
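/* A worked instance with illustrative numbers: if the instructions from
NEXT_INSN (START) up to and including INSN occupy NBYTES == 12 and INSN
itself is 2 bytes, the padding emitted below is 15 - 12 + 2 == 5 bytes,
enough to push INSN out of any 16 byte window it could share with the
earlier jumps. */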
34157 for (insn = start; insn; insn = NEXT_INSN (insn))
34158 {
34159 int min_size;
34160
34161 if (LABEL_P (insn))
34162 {
34163 int align = label_to_alignment (insn);
34164 int max_skip = label_to_max_skip (insn);
34165
34166 if (max_skip > 15)
34167 max_skip = 15;
34168 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
34169 already in the current 16 byte page, because otherwise
34170 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
34171 bytes to reach 16 byte boundary. */
34172 if (align <= 0
34173 || (align <= 3 && max_skip != (1 << align) - 1))
34174 max_skip = 0;
34175 if (dump_file)
34176 fprintf (dump_file, "Label %i with max_skip %i\n",
34177 INSN_UID (insn), max_skip);
34178 if (max_skip)
34179 {
34180 while (nbytes + max_skip >= 16)
34181 {
34182 start = NEXT_INSN (start);
34183 if ((JUMP_P (start)
34184 && GET_CODE (PATTERN (start)) != ADDR_VEC
34185 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34186 || CALL_P (start))
34187 njumps--, isjump = 1;
34188 else
34189 isjump = 0;
34190 nbytes -= min_insn_size (start);
34191 }
34192 }
34193 continue;
34194 }
34195
34196 min_size = min_insn_size (insn);
34197 nbytes += min_size;
34198 if (dump_file)
34199 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
34200 INSN_UID (insn), min_size);
34201 if ((JUMP_P (insn)
34202 && GET_CODE (PATTERN (insn)) != ADDR_VEC
34203 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
34204 || CALL_P (insn))
34205 njumps++;
34206 else
34207 continue;
34208
34209 while (njumps > 3)
34210 {
34211 start = NEXT_INSN (start);
34212 if ((JUMP_P (start)
34213 && GET_CODE (PATTERN (start)) != ADDR_VEC
34214 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34215 || CALL_P (start))
34216 njumps--, isjump = 1;
34217 else
34218 isjump = 0;
34219 nbytes -= min_insn_size (start);
34220 }
34221 gcc_assert (njumps >= 0);
34222 if (dump_file)
34223 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
34224 INSN_UID (start), INSN_UID (insn), nbytes);
34225
34226 if (njumps == 3 && isjump && nbytes < 16)
34227 {
34228 int padsize = 15 - nbytes + min_insn_size (insn);
34229
34230 if (dump_file)
34231 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
34232 INSN_UID (insn), padsize);
34233 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
34234 }
34235 }
34236 }
34237 #endif
34238
34239 /* The AMD Athlon works faster
34240 when RET is not the destination of a conditional jump or directly preceded
34241 by another jump instruction. We avoid the penalty by inserting a NOP just
34242 before the RET instruction in such cases. */
34243 static void
34244 ix86_pad_returns (void)
34245 {
34246 edge e;
34247 edge_iterator ei;
34248
34249 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34250 {
34251 basic_block bb = e->src;
34252 rtx ret = BB_END (bb);
34253 rtx prev;
34254 bool replace = false;
34255
34256 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
34257 || optimize_bb_for_size_p (bb))
34258 continue;
34259 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
34260 if (active_insn_p (prev) || LABEL_P (prev))
34261 break;
34262 if (prev && LABEL_P (prev))
34263 {
34264 edge e;
34265 edge_iterator ei;
34266
34267 FOR_EACH_EDGE (e, ei, bb->preds)
34268 if (EDGE_FREQUENCY (e) && e->src->index >= 0
34269 && !(e->flags & EDGE_FALLTHRU))
34270 replace = true;
34271 }
34272 if (!replace)
34273 {
34274 prev = prev_active_insn (ret);
34275 if (prev
34276 && ((JUMP_P (prev) && any_condjump_p (prev))
34277 || CALL_P (prev)))
34278 replace = true;
34279 /* Empty functions get a branch mispredict even when
34280 the jump destination is not visible to us. */
34281 if (!prev && !optimize_function_for_size_p (cfun))
34282 replace = true;
34283 }
34284 if (replace)
34285 {
34286 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
34287 delete_insn (ret);
34288 }
34289 }
34290 }
34291
34292 /* Count the minimum number of instructions in BB. Return 4 if the
34293 number of instructions >= 4. */
34294
34295 static int
34296 ix86_count_insn_bb (basic_block bb)
34297 {
34298 rtx insn;
34299 int insn_count = 0;
34300
34301 /* Count number of instructions in this block. Return 4 if the number
34302 of instructions >= 4. */
34303 FOR_BB_INSNS (bb, insn)
34304 {
34305 /* This only happens in exit blocks. */
34306 if (JUMP_P (insn)
34307 && ANY_RETURN_P (PATTERN (insn)))
34308 break;
34309
34310 if (NONDEBUG_INSN_P (insn)
34311 && GET_CODE (PATTERN (insn)) != USE
34312 && GET_CODE (PATTERN (insn)) != CLOBBER)
34313 {
34314 insn_count++;
34315 if (insn_count >= 4)
34316 return insn_count;
34317 }
34318 }
34319
34320 return insn_count;
34321 }
34322
34323
34324 /* Count the minimum number of instructions in code path in BB.
34325 Return 4 if the number of instructions >= 4. */
34326
34327 static int
34328 ix86_count_insn (basic_block bb)
34329 {
34330 edge e;
34331 edge_iterator ei;
34332 int min_prev_count;
34333
34334 /* Only bother counting instructions along paths with no
34335 more than 2 basic blocks between entry and exit. Given
34336 that BB has an edge to exit, determine if a predecessor
34337 of BB has an edge from entry. If so, compute the number
34338 of instructions in the predecessor block. If there
34339 happen to be multiple such blocks, compute the minimum. */
34340 min_prev_count = 4;
34341 FOR_EACH_EDGE (e, ei, bb->preds)
34342 {
34343 edge prev_e;
34344 edge_iterator prev_ei;
34345
34346 if (e->src == ENTRY_BLOCK_PTR)
34347 {
34348 min_prev_count = 0;
34349 break;
34350 }
34351 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
34352 {
34353 if (prev_e->src == ENTRY_BLOCK_PTR)
34354 {
34355 int count = ix86_count_insn_bb (e->src);
34356 if (count < min_prev_count)
34357 min_prev_count = count;
34358 break;
34359 }
34360 }
34361 }
34362
34363 if (min_prev_count < 4)
34364 min_prev_count += ix86_count_insn_bb (bb);
34365
34366 return min_prev_count;
34367 }
34368
34369 /* Pad short function to 4 instructions. */
34370
34371 static void
34372 ix86_pad_short_function (void)
34373 {
34374 edge e;
34375 edge_iterator ei;
34376
34377 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34378 {
34379 rtx ret = BB_END (e->src);
34380 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
34381 {
34382 int insn_count = ix86_count_insn (e->src);
34383
34384 /* Pad short function. */
34385 if (insn_count < 4)
34386 {
34387 rtx insn = ret;
34388
34389 /* Find epilogue. */
34390 while (insn
34391 && (!NOTE_P (insn)
34392 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
34393 insn = PREV_INSN (insn);
34394
34395 if (!insn)
34396 insn = ret;
34397
34398 /* Two NOPs count as one instruction. */
34399 insn_count = 2 * (4 - insn_count);
34400 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
34401 }
34402 }
34403 }
34404 }
34405
34406 /* Implement machine specific optimizations. We implement padding of returns
34407 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
34408 static void
34409 ix86_reorg (void)
34410 {
34411 /* We are freeing block_for_insn in the toplev to keep compatibility
34412 with old MDEP_REORGS that are not CFG based. Recompute it now. */
34413 compute_bb_for_insn ();
34414
34415 /* Run the vzeroupper optimization if needed. */
34416 if (TARGET_VZEROUPPER)
34417 move_or_delete_vzeroupper ();
34418
34419 if (optimize && optimize_function_for_speed_p (cfun))
34420 {
34421 if (TARGET_PAD_SHORT_FUNCTION)
34422 ix86_pad_short_function ();
34423 else if (TARGET_PAD_RETURNS)
34424 ix86_pad_returns ();
34425 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34426 if (TARGET_FOUR_JUMP_LIMIT)
34427 ix86_avoid_jump_mispredicts ();
34428 #endif
34429 }
34430 }
34431
34432 /* Return nonzero when a QImode register that must be represented via a REX
34433 prefix is used. */
34434 bool
34435 x86_extended_QIreg_mentioned_p (rtx insn)
34436 {
34437 int i;
34438 extract_insn_cached (insn);
34439 for (i = 0; i < recog_data.n_operands; i++)
34440 if (GENERAL_REG_P (recog_data.operand[i])
34441 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
34442 return true;
34443 return false;
34444 }
34445
34446 /* Return nonzero when P points to a register encoded via a REX prefix.
34447 Called via for_each_rtx. */
34448 static int
34449 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
34450 {
34451 unsigned int regno;
34452 if (!REG_P (*p))
34453 return 0;
34454 regno = REGNO (*p);
34455 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
34456 }
34457
34458 /* Return true when INSN mentions register that must be encoded using REX
34459 prefix. */
34460 bool
34461 x86_extended_reg_mentioned_p (rtx insn)
34462 {
34463 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
34464 extended_reg_mentioned_1, NULL);
34465 }
34466
34467 /* If profitable, negate (without causing overflow) the integer constant
34468 of mode MODE at location LOC. Return true in this case. */
34469 bool
34470 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
34471 {
34472 HOST_WIDE_INT val;
34473
34474 if (!CONST_INT_P (*loc))
34475 return false;
34476
34477 switch (mode)
34478 {
34479 case DImode:
34480 /* DImode x86_64 constants must fit in 32 bits. */
34481 gcc_assert (x86_64_immediate_operand (*loc, mode));
34482
34483 mode = SImode;
34484 break;
34485
34486 case SImode:
34487 case HImode:
34488 case QImode:
34489 break;
34490
34491 default:
34492 gcc_unreachable ();
34493 }
34494
34495 /* Avoid overflows. */
34496 if (mode_signbit_p (mode, *loc))
34497 return false;
34498
34499 val = INTVAL (*loc);
34500
34501 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
34502 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
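/* E.g. `addl $128, %eax' needs a 4 byte immediate, while the equivalent
`subl $-128, %eax' fits in a sign-extended 8 bit immediate (an
illustrative encoding note). */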
34503 if ((val < 0 && val != -128)
34504 || val == 128)
34505 {
34506 *loc = GEN_INT (-val);
34507 return true;
34508 }
34509
34510 return false;
34511 }
34512
34513 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
34514 optabs would emit if we didn't have TFmode patterns. */
34515
34516 void
34517 x86_emit_floatuns (rtx operands[2])
34518 {
34519 rtx neglab, donelab, i0, i1, f0, in, out;
34520 enum machine_mode mode, inmode;
34521
34522 inmode = GET_MODE (operands[1]);
34523 gcc_assert (inmode == SImode || inmode == DImode);
34524
34525 out = operands[0];
34526 in = force_reg (inmode, operands[1]);
34527 mode = GET_MODE (out);
34528 neglab = gen_label_rtx ();
34529 donelab = gen_label_rtx ();
34530 f0 = gen_reg_rtx (mode);
34531
34532 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
34533
34534 expand_float (out, in, 0);
34535
34536 emit_jump_insn (gen_jump (donelab));
34537 emit_barrier ();
34538
34539 emit_label (neglab);
34540
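/* The value has its top bit set, so it cannot be converted directly as a
signed number. Halve it while preserving the rounding bit,
(in >> 1) | (in & 1), convert that, and double the result below. */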
34541 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
34542 1, OPTAB_DIRECT);
34543 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
34544 1, OPTAB_DIRECT);
34545 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
34546
34547 expand_float (f0, i0, 0);
34548
34549 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
34550
34551 emit_label (donelab);
34552 }
34553 \f
34554 /* AVX2 does support 32-byte integer vector operations,
34555 thus the longest vector we are faced with is V32QImode. */
34556 #define MAX_VECT_LEN 32
34557
34558 struct expand_vec_perm_d
34559 {
34560 rtx target, op0, op1;
34561 unsigned char perm[MAX_VECT_LEN];
34562 enum machine_mode vmode;
34563 unsigned char nelt;
34564 bool one_operand_p;
34565 bool testing_p;
34566 };
34567
34568 static bool canonicalize_perm (struct expand_vec_perm_d *d);
34569 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
34570 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
34571
34572 /* Get a vector mode of the same size as the original but with elements
34573 twice as wide. This is only guaranteed to apply to integral vectors. */
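/* For example, V8QImode (8 x 8 bits) maps to V4HImode (4 x 16 bits); both
are 8 bytes wide. */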
34574
34575 static inline enum machine_mode
34576 get_mode_wider_vector (enum machine_mode o)
34577 {
34578 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
34579 enum machine_mode n = GET_MODE_WIDER_MODE (o);
34580 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
34581 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
34582 return n;
34583 }
34584
34585 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34586 with all elements equal to VAR. Return true if successful. */
34587
34588 static bool
34589 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
34590 rtx target, rtx val)
34591 {
34592 bool ok;
34593
34594 switch (mode)
34595 {
34596 case V2SImode:
34597 case V2SFmode:
34598 if (!mmx_ok)
34599 return false;
34600 /* FALLTHRU */
34601
34602 case V4DFmode:
34603 case V4DImode:
34604 case V8SFmode:
34605 case V8SImode:
34606 case V2DFmode:
34607 case V2DImode:
34608 case V4SFmode:
34609 case V4SImode:
34610 {
34611 rtx insn, dup;
34612
34613 /* First attempt to recognize VAL as-is. */
34614 dup = gen_rtx_VEC_DUPLICATE (mode, val);
34615 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
34616 if (recog_memoized (insn) < 0)
34617 {
34618 rtx seq;
34619 /* If that fails, force VAL into a register. */
34620
34621 start_sequence ();
34622 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
34623 seq = get_insns ();
34624 end_sequence ();
34625 if (seq)
34626 emit_insn_before (seq, insn);
34627
34628 ok = recog_memoized (insn) >= 0;
34629 gcc_assert (ok);
34630 }
34631 }
34632 return true;
34633
34634 case V4HImode:
34635 if (!mmx_ok)
34636 return false;
34637 if (TARGET_SSE || TARGET_3DNOW_A)
34638 {
34639 rtx x;
34640
34641 val = gen_lowpart (SImode, val);
34642 x = gen_rtx_TRUNCATE (HImode, val);
34643 x = gen_rtx_VEC_DUPLICATE (mode, x);
34644 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34645 return true;
34646 }
34647 goto widen;
34648
34649 case V8QImode:
34650 if (!mmx_ok)
34651 return false;
34652 goto widen;
34653
34654 case V8HImode:
34655 if (TARGET_SSE2)
34656 {
34657 struct expand_vec_perm_d dperm;
34658 rtx tmp1, tmp2;
34659
34660 permute:
34661 memset (&dperm, 0, sizeof (dperm));
34662 dperm.target = target;
34663 dperm.vmode = mode;
34664 dperm.nelt = GET_MODE_NUNITS (mode);
34665 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
34666 dperm.one_operand_p = true;
34667
34668 /* Extend to SImode using a paradoxical SUBREG. */
34669 tmp1 = gen_reg_rtx (SImode);
34670 emit_move_insn (tmp1, gen_lowpart (SImode, val));
34671
34672 /* Insert the SImode value as low element of a V4SImode vector. */
34673 tmp2 = gen_lowpart (V4SImode, dperm.op0);
34674 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
34675
34676 ok = (expand_vec_perm_1 (&dperm)
34677 || expand_vec_perm_broadcast_1 (&dperm));
34678 gcc_assert (ok);
34679 return ok;
34680 }
34681 goto widen;
34682
34683 case V16QImode:
34684 if (TARGET_SSE2)
34685 goto permute;
34686 goto widen;
34687
34688 widen:
34689 /* Replicate the value once into the next wider mode and recurse. */
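/* For instance, for V8QImode the QImode value is widened to HImode as
val | (val << 8) and the code recurses with V4HImode (an illustrative
trace of the code below). */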
34690 {
34691 enum machine_mode smode, wsmode, wvmode;
34692 rtx x;
34693
34694 smode = GET_MODE_INNER (mode);
34695 wvmode = get_mode_wider_vector (mode);
34696 wsmode = GET_MODE_INNER (wvmode);
34697
34698 val = convert_modes (wsmode, smode, val, true);
34699 x = expand_simple_binop (wsmode, ASHIFT, val,
34700 GEN_INT (GET_MODE_BITSIZE (smode)),
34701 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34702 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
34703
34704 x = gen_lowpart (wvmode, target);
34705 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
34706 gcc_assert (ok);
34707 return ok;
34708 }
34709
34710 case V16HImode:
34711 case V32QImode:
34712 {
34713 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
34714 rtx x = gen_reg_rtx (hvmode);
34715
34716 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
34717 gcc_assert (ok);
34718
34719 x = gen_rtx_VEC_CONCAT (mode, x, x);
34720 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34721 }
34722 return true;
34723
34724 default:
34725 return false;
34726 }
34727 }
34728
34729 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34730 whose ONE_VAR element is VAR, and other elements are zero. Return true
34731 if successful. */
34732
34733 static bool
34734 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
34735 rtx target, rtx var, int one_var)
34736 {
34737 enum machine_mode vsimode;
34738 rtx new_target;
34739 rtx x, tmp;
34740 bool use_vector_set = false;
34741
34742 switch (mode)
34743 {
34744 case V2DImode:
34745 /* For SSE4.1, we normally use vector set. But if the second
34746 element is zero and inter-unit moves are OK, we use movq
34747 instead. */
34748 use_vector_set = (TARGET_64BIT
34749 && TARGET_SSE4_1
34750 && !(TARGET_INTER_UNIT_MOVES
34751 && one_var == 0));
34752 break;
34753 case V16QImode:
34754 case V4SImode:
34755 case V4SFmode:
34756 use_vector_set = TARGET_SSE4_1;
34757 break;
34758 case V8HImode:
34759 use_vector_set = TARGET_SSE2;
34760 break;
34761 case V4HImode:
34762 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
34763 break;
34764 case V32QImode:
34765 case V16HImode:
34766 case V8SImode:
34767 case V8SFmode:
34768 case V4DFmode:
34769 use_vector_set = TARGET_AVX;
34770 break;
34771 case V4DImode:
34772 /* Use ix86_expand_vector_set in 64bit mode only. */
34773 use_vector_set = TARGET_AVX && TARGET_64BIT;
34774 break;
34775 default:
34776 break;
34777 }
34778
34779 if (use_vector_set)
34780 {
34781 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
34782 var = force_reg (GET_MODE_INNER (mode), var);
34783 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34784 return true;
34785 }
34786
34787 switch (mode)
34788 {
34789 case V2SFmode:
34790 case V2SImode:
34791 if (!mmx_ok)
34792 return false;
34793 /* FALLTHRU */
34794
34795 case V2DFmode:
34796 case V2DImode:
34797 if (one_var != 0)
34798 return false;
34799 var = force_reg (GET_MODE_INNER (mode), var);
34800 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
34801 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34802 return true;
34803
34804 case V4SFmode:
34805 case V4SImode:
34806 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
34807 new_target = gen_reg_rtx (mode);
34808 else
34809 new_target = target;
34810 var = force_reg (GET_MODE_INNER (mode), var);
34811 x = gen_rtx_VEC_DUPLICATE (mode, var);
34812 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
34813 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
34814 if (one_var != 0)
34815 {
34816 /* We need to shuffle the value to the correct position, so
34817 create a new pseudo to store the intermediate result. */
34818
34819 /* With SSE2, we can use the integer shuffle insns. */
34820 if (mode != V4SFmode && TARGET_SSE2)
34821 {
34822 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
34823 const1_rtx,
34824 GEN_INT (one_var == 1 ? 0 : 1),
34825 GEN_INT (one_var == 2 ? 0 : 1),
34826 GEN_INT (one_var == 3 ? 0 : 1)));
34827 if (target != new_target)
34828 emit_move_insn (target, new_target);
34829 return true;
34830 }
34831
34832 /* Otherwise convert the intermediate result to V4SFmode and
34833 use the SSE1 shuffle instructions. */
34834 if (mode != V4SFmode)
34835 {
34836 tmp = gen_reg_rtx (V4SFmode);
34837 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
34838 }
34839 else
34840 tmp = new_target;
34841
34842 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
34843 const1_rtx,
34844 GEN_INT (one_var == 1 ? 0 : 1),
34845 GEN_INT (one_var == 2 ? 0+4 : 1+4),
34846 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
34847
34848 if (mode != V4SFmode)
34849 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
34850 else if (tmp != target)
34851 emit_move_insn (target, tmp);
34852 }
34853 else if (target != new_target)
34854 emit_move_insn (target, new_target);
34855 return true;
34856
34857 case V8HImode:
34858 case V16QImode:
34859 vsimode = V4SImode;
34860 goto widen;
34861 case V4HImode:
34862 case V8QImode:
34863 if (!mmx_ok)
34864 return false;
34865 vsimode = V2SImode;
34866 goto widen;
34867 widen:
34868 if (one_var != 0)
34869 return false;
34870
34871 /* Zero extend the variable element to SImode and recurse. */
34872 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
34873
34874 x = gen_reg_rtx (vsimode);
34875 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
34876 var, one_var))
34877 gcc_unreachable ();
34878
34879 emit_move_insn (target, gen_lowpart (mode, x));
34880 return true;
34881
34882 default:
34883 return false;
34884 }
34885 }
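
/* Illustrative sketch, not part of GCC: for V4SImode / V4SFmode the
   code above first builds { var, 0, 0, 0 } with VEC_DUPLICATE +
   VEC_MERGE against a zero vector and, when ONE_VAR is not 0, moves
   the value into place with a shuffle that selects lane 0 for
   position ONE_VAR and the (zero) lane 1 everywhere else.  A scalar
   model of the integer shuffle path:  */
#if 0
#include <stdint.h>

static void
one_nonzero_v4si (uint32_t dst[4], uint32_t var, int one_var)
{
  uint32_t t[4] = { var, 0, 0, 0 };   /* after duplicate + merge */
  int i;

  if (one_var == 0)
    {
      /* No shuffle needed; the merge already put VAR in lane 0.  */
      for (i = 0; i < 4; i++)
	dst[i] = t[i];
      return;
    }

  /* The pshufd emitted above: lane ONE_VAR takes source lane 0 (VAR),
     every other lane takes source lane 1 (a zero).  */
  for (i = 0; i < 4; i++)
    dst[i] = t[i == one_var ? 0 : 1];
}
#endif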
34886
34887 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34888 consisting of the values in VALS. It is known that all elements
34889 except ONE_VAR are constants. Return true if successful. */
34890
34891 static bool
34892 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
34893 rtx target, rtx vals, int one_var)
34894 {
34895 rtx var = XVECEXP (vals, 0, one_var);
34896 enum machine_mode wmode;
34897 rtx const_vec, x;
34898
34899 const_vec = copy_rtx (vals);
34900 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
34901 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
34902
34903 switch (mode)
34904 {
34905 case V2DFmode:
34906 case V2DImode:
34907 case V2SFmode:
34908 case V2SImode:
34909 /* For the two element vectors, it's just as easy to use
34910 the general case. */
34911 return false;
34912
34913 case V4DImode:
34914 /* Use ix86_expand_vector_set in 64bit mode only. */
34915 if (!TARGET_64BIT)
34916 return false;
34917 case V4DFmode:
34918 case V8SFmode:
34919 case V8SImode:
34920 case V16HImode:
34921 case V32QImode:
34922 case V4SFmode:
34923 case V4SImode:
34924 case V8HImode:
34925 case V4HImode:
34926 break;
34927
34928 case V16QImode:
34929 if (TARGET_SSE4_1)
34930 break;
34931 wmode = V8HImode;
34932 goto widen;
34933 case V8QImode:
34934 wmode = V4HImode;
34935 goto widen;
34936 widen:
34937 /* There's no way to set one QImode entry easily. Combine
34938 the variable value with its adjacent constant value, and
34939 promote to an HImode set. */
34940 x = XVECEXP (vals, 0, one_var ^ 1);
34941 if (one_var & 1)
34942 {
34943 var = convert_modes (HImode, QImode, var, true);
34944 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
34945 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34946 x = GEN_INT (INTVAL (x) & 0xff);
34947 }
34948 else
34949 {
34950 var = convert_modes (HImode, QImode, var, true);
34951 x = gen_int_mode (INTVAL (x) << 8, HImode);
34952 }
34953 if (x != const0_rtx)
34954 var = expand_simple_binop (HImode, IOR, var, x, var,
34955 1, OPTAB_LIB_WIDEN);
34956
34957 x = gen_reg_rtx (wmode);
34958 emit_move_insn (x, gen_lowpart (wmode, const_vec));
34959 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
34960
34961 emit_move_insn (target, gen_lowpart (mode, x));
34962 return true;
34963
34964 default:
34965 return false;
34966 }
34967
34968 emit_move_insn (target, const_vec);
34969 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34970 return true;
34971 }
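
/* Illustrative sketch, not part of GCC: the QImode "widen" path above
   cannot set a single byte directly, so it fuses the variable byte
   with its adjacent constant byte into one HImode value and then sets
   HImode element ONE_VAR >> 1 of the wider vector.  A scalar model,
   following the little-endian layout assumed by the code (byte 2k is
   the low half of HImode element k):  */
#if 0
#include <stdint.h>

static uint16_t
combine_byte_pair (uint8_t var, uint8_t adjacent_const, int one_var)
{
  if (one_var & 1)
    /* Variable byte is the high half of the HImode element.  */
    return (uint16_t) (((uint16_t) var << 8) | adjacent_const);
  else
    /* Variable byte is the low half of the HImode element.  */
    return (uint16_t) (((uint16_t) adjacent_const << 8) | var);
}
#endif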
34972
34973 /* A subroutine of ix86_expand_vector_init_general. Use vector
34974 concatenate to handle the most general case: all values variable,
34975 and none identical. */
34976
34977 static void
34978 ix86_expand_vector_init_concat (enum machine_mode mode,
34979 rtx target, rtx *ops, int n)
34980 {
34981 enum machine_mode cmode, hmode = VOIDmode;
34982 rtx first[8], second[4];
34983 rtvec v;
34984 int i, j;
34985
34986 switch (n)
34987 {
34988 case 2:
34989 switch (mode)
34990 {
34991 case V8SImode:
34992 cmode = V4SImode;
34993 break;
34994 case V8SFmode:
34995 cmode = V4SFmode;
34996 break;
34997 case V4DImode:
34998 cmode = V2DImode;
34999 break;
35000 case V4DFmode:
35001 cmode = V2DFmode;
35002 break;
35003 case V4SImode:
35004 cmode = V2SImode;
35005 break;
35006 case V4SFmode:
35007 cmode = V2SFmode;
35008 break;
35009 case V2DImode:
35010 cmode = DImode;
35011 break;
35012 case V2SImode:
35013 cmode = SImode;
35014 break;
35015 case V2DFmode:
35016 cmode = DFmode;
35017 break;
35018 case V2SFmode:
35019 cmode = SFmode;
35020 break;
35021 default:
35022 gcc_unreachable ();
35023 }
35024
35025 if (!register_operand (ops[1], cmode))
35026 ops[1] = force_reg (cmode, ops[1]);
35027 if (!register_operand (ops[0], cmode))
35028 ops[0] = force_reg (cmode, ops[0]);
35029 emit_insn (gen_rtx_SET (VOIDmode, target,
35030 gen_rtx_VEC_CONCAT (mode, ops[0],
35031 ops[1])));
35032 break;
35033
35034 case 4:
35035 switch (mode)
35036 {
35037 case V4DImode:
35038 cmode = V2DImode;
35039 break;
35040 case V4DFmode:
35041 cmode = V2DFmode;
35042 break;
35043 case V4SImode:
35044 cmode = V2SImode;
35045 break;
35046 case V4SFmode:
35047 cmode = V2SFmode;
35048 break;
35049 default:
35050 gcc_unreachable ();
35051 }
35052 goto half;
35053
35054 case 8:
35055 switch (mode)
35056 {
35057 case V8SImode:
35058 cmode = V2SImode;
35059 hmode = V4SImode;
35060 break;
35061 case V8SFmode:
35062 cmode = V2SFmode;
35063 hmode = V4SFmode;
35064 break;
35065 default:
35066 gcc_unreachable ();
35067 }
35068 goto half;
35069
35070 half:
35071 /* FIXME: We process inputs backward to help RA. PR 36222. */
35072 i = n - 1;
35073 j = (n >> 1) - 1;
35074 for (; i > 0; i -= 2, j--)
35075 {
35076 first[j] = gen_reg_rtx (cmode);
35077 v = gen_rtvec (2, ops[i - 1], ops[i]);
35078 ix86_expand_vector_init (false, first[j],
35079 gen_rtx_PARALLEL (cmode, v));
35080 }
35081
35082 n >>= 1;
35083 if (n > 2)
35084 {
35085 gcc_assert (hmode != VOIDmode);
35086 for (i = j = 0; i < n; i += 2, j++)
35087 {
35088 second[j] = gen_reg_rtx (hmode);
35089 ix86_expand_vector_init_concat (hmode, second [j],
35090 &first [i], 2);
35091 }
35092 n >>= 1;
35093 ix86_expand_vector_init_concat (mode, target, second, n);
35094 }
35095 else
35096 ix86_expand_vector_init_concat (mode, target, first, n);
35097 break;
35098
35099 default:
35100 gcc_unreachable ();
35101 }
35102 }
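
/* Illustrative example, not part of GCC: for N == 8 (V8SFmode or
   V8SImode) the recursion above builds the vector bottom-up, pairing
   the scalars backwards (see the FIXME) and then concatenating:

     first[3] = { ops[6], ops[7] }    first[2] = { ops[4], ops[5] }
     first[1] = { ops[2], ops[3] }    first[0] = { ops[0], ops[1] }
     second[0] = concat (first[0], first[1])
     second[1] = concat (first[2], first[3])
     target    = concat (second[0], second[1])  */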
35103
35104 /* A subroutine of ix86_expand_vector_init_general. Use vector
35105 interleave to handle the most general case: all values variable,
35106 and none identical. */
35107
35108 static void
35109 ix86_expand_vector_init_interleave (enum machine_mode mode,
35110 rtx target, rtx *ops, int n)
35111 {
35112 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
35113 int i, j;
35114 rtx op0, op1;
35115 rtx (*gen_load_even) (rtx, rtx, rtx);
35116 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
35117 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
35118
35119 switch (mode)
35120 {
35121 case V8HImode:
35122 gen_load_even = gen_vec_setv8hi;
35123 gen_interleave_first_low = gen_vec_interleave_lowv4si;
35124 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35125 inner_mode = HImode;
35126 first_imode = V4SImode;
35127 second_imode = V2DImode;
35128 third_imode = VOIDmode;
35129 break;
35130 case V16QImode:
35131 gen_load_even = gen_vec_setv16qi;
35132 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
35133 gen_interleave_second_low = gen_vec_interleave_lowv4si;
35134 inner_mode = QImode;
35135 first_imode = V8HImode;
35136 second_imode = V4SImode;
35137 third_imode = V2DImode;
35138 break;
35139 default:
35140 gcc_unreachable ();
35141 }
35142
35143 for (i = 0; i < n; i++)
35144 {
35145 /* Extend the odd element to SImode using a paradoxical SUBREG. */
35146 op0 = gen_reg_rtx (SImode);
35147 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
35148
35149 /* Insert the SImode value as low element of V4SImode vector. */
35150 op1 = gen_reg_rtx (V4SImode);
35151 op0 = gen_rtx_VEC_MERGE (V4SImode,
35152 gen_rtx_VEC_DUPLICATE (V4SImode,
35153 op0),
35154 CONST0_RTX (V4SImode),
35155 const1_rtx);
35156 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
35157
35158 /* Cast the V4SImode vector back to a vector in the original mode. */
35159 op0 = gen_reg_rtx (mode);
35160 emit_move_insn (op0, gen_lowpart (mode, op1));
35161
35162 /* Load even elements into the second position. */
35163 emit_insn (gen_load_even (op0,
35164 force_reg (inner_mode,
35165 ops [i + i + 1]),
35166 const1_rtx));
35167
35168 /* Cast vector to FIRST_IMODE vector. */
35169 ops[i] = gen_reg_rtx (first_imode);
35170 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
35171 }
35172
35173 /* Interleave low FIRST_IMODE vectors. */
35174 for (i = j = 0; i < n; i += 2, j++)
35175 {
35176 op0 = gen_reg_rtx (first_imode);
35177 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
35178
35179 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
35180 ops[j] = gen_reg_rtx (second_imode);
35181 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
35182 }
35183
35184 /* Interleave low SECOND_IMODE vectors. */
35185 switch (second_imode)
35186 {
35187 case V4SImode:
35188 for (i = j = 0; i < n / 2; i += 2, j++)
35189 {
35190 op0 = gen_reg_rtx (second_imode);
35191 emit_insn (gen_interleave_second_low (op0, ops[i],
35192 ops[i + 1]));
35193
35194 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
35195 vector. */
35196 ops[j] = gen_reg_rtx (third_imode);
35197 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
35198 }
35199 second_imode = V2DImode;
35200 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35201 /* FALLTHRU */
35202
35203 case V2DImode:
35204 op0 = gen_reg_rtx (second_imode);
35205 emit_insn (gen_interleave_second_low (op0, ops[0],
35206 ops[1]));
35207
35208 /* Cast the SECOND_IMODE vector back to a vector in the original
35209 mode. */
35210 emit_insn (gen_rtx_SET (VOIDmode, target,
35211 gen_lowpart (mode, op0)));
35212 break;
35213
35214 default:
35215 gcc_unreachable ();
35216 }
35217 }
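
/* Illustrative example, not part of GCC: for V16QImode the loop above
   first packs the 16 scalars two at a time (ops[2i] via the SImode
   low-element insert, ops[2i+1] via GEN_LOAD_EVEN into lane 1),
   leaving 8 partially filled vectors.  The interleave-low steps then
   merge them:

     8 vectors, viewed as V8HImode  -- punpcklwd  --> 4 V4SImode views
     4 vectors, viewed as V4SImode  -- punpckldq  --> 2 V2DImode views
     2 vectors, viewed as V2DImode  -- punpcklqdq --> final V16QImode

   For V8HImode only the last two steps are needed (punpckldq, then
   punpcklqdq).  */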
35218
35219 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
35220 all values variable, and none identical. */
35221
35222 static void
35223 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
35224 rtx target, rtx vals)
35225 {
35226 rtx ops[32], op0, op1;
35227 enum machine_mode half_mode = VOIDmode;
35228 int n, i;
35229
35230 switch (mode)
35231 {
35232 case V2SFmode:
35233 case V2SImode:
35234 if (!mmx_ok && !TARGET_SSE)
35235 break;
35236 /* FALLTHRU */
35237
35238 case V8SFmode:
35239 case V8SImode:
35240 case V4DFmode:
35241 case V4DImode:
35242 case V4SFmode:
35243 case V4SImode:
35244 case V2DFmode:
35245 case V2DImode:
35246 n = GET_MODE_NUNITS (mode);
35247 for (i = 0; i < n; i++)
35248 ops[i] = XVECEXP (vals, 0, i);
35249 ix86_expand_vector_init_concat (mode, target, ops, n);
35250 return;
35251
35252 case V32QImode:
35253 half_mode = V16QImode;
35254 goto half;
35255
35256 case V16HImode:
35257 half_mode = V8HImode;
35258 goto half;
35259
35260 half:
35261 n = GET_MODE_NUNITS (mode);
35262 for (i = 0; i < n; i++)
35263 ops[i] = XVECEXP (vals, 0, i);
35264 op0 = gen_reg_rtx (half_mode);
35265 op1 = gen_reg_rtx (half_mode);
35266 ix86_expand_vector_init_interleave (half_mode, op0, ops,
35267 n >> 2);
35268 ix86_expand_vector_init_interleave (half_mode, op1,
35269 &ops [n >> 1], n >> 2);
35270 emit_insn (gen_rtx_SET (VOIDmode, target,
35271 gen_rtx_VEC_CONCAT (mode, op0, op1)));
35272 return;
35273
35274 case V16QImode:
35275 if (!TARGET_SSE4_1)
35276 break;
35277 /* FALLTHRU */
35278
35279 case V8HImode:
35280 if (!TARGET_SSE2)
35281 break;
35282
35283 /* Don't use ix86_expand_vector_init_interleave if we can't
35284 move from GPR to SSE register directly. */
35285 if (!TARGET_INTER_UNIT_MOVES)
35286 break;
35287
35288 n = GET_MODE_NUNITS (mode);
35289 for (i = 0; i < n; i++)
35290 ops[i] = XVECEXP (vals, 0, i);
35291 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
35292 return;
35293
35294 case V4HImode:
35295 case V8QImode:
35296 break;
35297
35298 default:
35299 gcc_unreachable ();
35300 }
35301
35302 {
35303 int i, j, n_elts, n_words, n_elt_per_word;
35304 enum machine_mode inner_mode;
35305 rtx words[4], shift;
35306
35307 inner_mode = GET_MODE_INNER (mode);
35308 n_elts = GET_MODE_NUNITS (mode);
35309 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
35310 n_elt_per_word = n_elts / n_words;
35311 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
35312
35313 for (i = 0; i < n_words; ++i)
35314 {
35315 rtx word = NULL_RTX;
35316
35317 for (j = 0; j < n_elt_per_word; ++j)
35318 {
35319 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
35320 elt = convert_modes (word_mode, inner_mode, elt, true);
35321
35322 if (j == 0)
35323 word = elt;
35324 else
35325 {
35326 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
35327 word, 1, OPTAB_LIB_WIDEN);
35328 word = expand_simple_binop (word_mode, IOR, word, elt,
35329 word, 1, OPTAB_LIB_WIDEN);
35330 }
35331 }
35332
35333 words[i] = word;
35334 }
35335
35336 if (n_words == 1)
35337 emit_move_insn (target, gen_lowpart (mode, words[0]));
35338 else if (n_words == 2)
35339 {
35340 rtx tmp = gen_reg_rtx (mode);
35341 emit_clobber (tmp);
35342 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
35343 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
35344 emit_move_insn (target, tmp);
35345 }
35346 else if (n_words == 4)
35347 {
35348 rtx tmp = gen_reg_rtx (V4SImode);
35349 gcc_assert (word_mode == SImode);
35350 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
35351 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
35352 emit_move_insn (target, gen_lowpart (mode, tmp));
35353 }
35354 else
35355 gcc_unreachable ();
35356 }
35357 }
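
/* Illustrative sketch, not part of GCC: the word-building fallback at
   the end of the function packs N_ELT_PER_WORD elements into each
   word_mode value with shifts and IORs, starting from the highest
   lane so that lane 0 ends up in the least significant bits.  A
   scalar model, assuming SImode words and 16-bit elements (V4HImode
   on a 32-bit target):  */
#if 0
#include <stdint.h>

static uint32_t
pack_word (const uint16_t elt[2])
{
  uint32_t word = elt[1];         /* highest lane of this word first */
  word = (word << 16) | elt[0];   /* ASHIFT by the element width, IOR */
  return word;                    /* lane 0 is in the low half */
}
#endif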
35358
35359 /* Initialize vector TARGET via VALS. Suppress the use of MMX
35360 instructions unless MMX_OK is true. */
35361
35362 void
35363 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
35364 {
35365 enum machine_mode mode = GET_MODE (target);
35366 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35367 int n_elts = GET_MODE_NUNITS (mode);
35368 int n_var = 0, one_var = -1;
35369 bool all_same = true, all_const_zero = true;
35370 int i;
35371 rtx x;
35372
35373 for (i = 0; i < n_elts; ++i)
35374 {
35375 x = XVECEXP (vals, 0, i);
35376 if (!(CONST_INT_P (x)
35377 || GET_CODE (x) == CONST_DOUBLE
35378 || GET_CODE (x) == CONST_FIXED))
35379 n_var++, one_var = i;
35380 else if (x != CONST0_RTX (inner_mode))
35381 all_const_zero = false;
35382 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
35383 all_same = false;
35384 }
35385
35386 /* Constants are best loaded from the constant pool. */
35387 if (n_var == 0)
35388 {
35389 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
35390 return;
35391 }
35392
35393 /* If all values are identical, broadcast the value. */
35394 if (all_same
35395 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
35396 XVECEXP (vals, 0, 0)))
35397 return;
35398
35399 /* Values where only one field is non-constant are best loaded from
35400 the pool and overwritten via move later. */
35401 if (n_var == 1)
35402 {
35403 if (all_const_zero
35404 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
35405 XVECEXP (vals, 0, one_var),
35406 one_var))
35407 return;
35408
35409 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
35410 return;
35411 }
35412
35413 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
35414 }
35415
35416 void
35417 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
35418 {
35419 enum machine_mode mode = GET_MODE (target);
35420 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35421 enum machine_mode half_mode;
35422 bool use_vec_merge = false;
35423 rtx tmp;
35424 static rtx (*gen_extract[6][2]) (rtx, rtx)
35425 = {
35426 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
35427 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
35428 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
35429 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
35430 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
35431 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
35432 };
35433 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
35434 = {
35435 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
35436 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
35437 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
35438 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
35439 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
35440 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
35441 };
35442 int i, j, n;
35443
35444 switch (mode)
35445 {
35446 case V2SFmode:
35447 case V2SImode:
35448 if (mmx_ok)
35449 {
35450 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35451 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
35452 if (elt == 0)
35453 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35454 else
35455 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35456 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35457 return;
35458 }
35459 break;
35460
35461 case V2DImode:
35462 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
35463 if (use_vec_merge)
35464 break;
35465
35466 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35467 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
35468 if (elt == 0)
35469 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35470 else
35471 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35472 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35473 return;
35474
35475 case V2DFmode:
35476 {
35477 rtx op0, op1;
35478
35479 /* For the two element vectors, we implement a VEC_CONCAT with
35480 the extraction of the other element. */
35481
35482 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
35483 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
35484
35485 if (elt == 0)
35486 op0 = val, op1 = tmp;
35487 else
35488 op0 = tmp, op1 = val;
35489
35490 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
35491 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35492 }
35493 return;
35494
35495 case V4SFmode:
35496 use_vec_merge = TARGET_SSE4_1;
35497 if (use_vec_merge)
35498 break;
35499
35500 switch (elt)
35501 {
35502 case 0:
35503 use_vec_merge = true;
35504 break;
35505
35506 case 1:
35507 /* tmp = target = A B C D */
35508 tmp = copy_to_reg (target);
35509 /* target = A A B B */
35510 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
35511 /* target = X A B B */
35512 ix86_expand_vector_set (false, target, val, 0);
35513 /* target = A X C D */
35514 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35515 const1_rtx, const0_rtx,
35516 GEN_INT (2+4), GEN_INT (3+4)));
35517 return;
35518
35519 case 2:
35520 /* tmp = target = A B C D */
35521 tmp = copy_to_reg (target);
35522 /* tmp = X B C D */
35523 ix86_expand_vector_set (false, tmp, val, 0);
35524 /* target = A B X D */
35525 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35526 const0_rtx, const1_rtx,
35527 GEN_INT (0+4), GEN_INT (3+4)));
35528 return;
35529
35530 case 3:
35531 /* tmp = target = A B C D */
35532 tmp = copy_to_reg (target);
35533 /* tmp = X B C D */
35534 ix86_expand_vector_set (false, tmp, val, 0);
35535 /* target = A B C X */
35536 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35537 const0_rtx, const1_rtx,
35538 GEN_INT (2+4), GEN_INT (0+4)));
35539 return;
35540
35541 default:
35542 gcc_unreachable ();
35543 }
35544 break;
35545
35546 case V4SImode:
35547 use_vec_merge = TARGET_SSE4_1;
35548 if (use_vec_merge)
35549 break;
35550
35551 /* Element 0 handled by vec_merge below. */
35552 if (elt == 0)
35553 {
35554 use_vec_merge = true;
35555 break;
35556 }
35557
35558 if (TARGET_SSE2)
35559 {
35560 /* With SSE2, use integer shuffles to swap element 0 and ELT,
35561 store into element 0, then shuffle them back. */
35562
35563 rtx order[4];
35564
35565 order[0] = GEN_INT (elt);
35566 order[1] = const1_rtx;
35567 order[2] = const2_rtx;
35568 order[3] = GEN_INT (3);
35569 order[elt] = const0_rtx;
35570
35571 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35572 order[1], order[2], order[3]));
35573
35574 ix86_expand_vector_set (false, target, val, 0);
35575
35576 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35577 order[1], order[2], order[3]));
35578 }
35579 else
35580 {
35581 /* For SSE1, we have to reuse the V4SF code. */
35582 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
35583 gen_lowpart (SFmode, val), elt);
35584 }
35585 return;
35586
35587 case V8HImode:
35588 use_vec_merge = TARGET_SSE2;
35589 break;
35590 case V4HImode:
35591 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35592 break;
35593
35594 case V16QImode:
35595 use_vec_merge = TARGET_SSE4_1;
35596 break;
35597
35598 case V8QImode:
35599 break;
35600
35601 case V32QImode:
35602 half_mode = V16QImode;
35603 j = 0;
35604 n = 16;
35605 goto half;
35606
35607 case V16HImode:
35608 half_mode = V8HImode;
35609 j = 1;
35610 n = 8;
35611 goto half;
35612
35613 case V8SImode:
35614 half_mode = V4SImode;
35615 j = 2;
35616 n = 4;
35617 goto half;
35618
35619 case V4DImode:
35620 half_mode = V2DImode;
35621 j = 3;
35622 n = 2;
35623 goto half;
35624
35625 case V8SFmode:
35626 half_mode = V4SFmode;
35627 j = 4;
35628 n = 4;
35629 goto half;
35630
35631 case V4DFmode:
35632 half_mode = V2DFmode;
35633 j = 5;
35634 n = 2;
35635 goto half;
35636
35637 half:
35638 /* Compute offset. */
35639 i = elt / n;
35640 elt %= n;
35641
35642 gcc_assert (i <= 1);
35643
35644 /* Extract the half. */
35645 tmp = gen_reg_rtx (half_mode);
35646 emit_insn (gen_extract[j][i] (tmp, target));
35647
35648 /* Put val in tmp at elt. */
35649 ix86_expand_vector_set (false, tmp, val, elt);
35650
35651 /* Put it back. */
35652 emit_insn (gen_insert[j][i] (target, target, tmp));
35653 return;
35654
35655 default:
35656 break;
35657 }
35658
35659 if (use_vec_merge)
35660 {
35661 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
35662 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
35663 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35664 }
35665 else
35666 {
35667 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35668
35669 emit_move_insn (mem, target);
35670
35671 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35672 emit_move_insn (tmp, val);
35673
35674 emit_move_insn (target, mem);
35675 }
35676 }
35677
35678 void
35679 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
35680 {
35681 enum machine_mode mode = GET_MODE (vec);
35682 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35683 bool use_vec_extr = false;
35684 rtx tmp;
35685
35686 switch (mode)
35687 {
35688 case V2SImode:
35689 case V2SFmode:
35690 if (!mmx_ok)
35691 break;
35692 /* FALLTHRU */
35693
35694 case V2DFmode:
35695 case V2DImode:
35696 use_vec_extr = true;
35697 break;
35698
35699 case V4SFmode:
35700 use_vec_extr = TARGET_SSE4_1;
35701 if (use_vec_extr)
35702 break;
35703
35704 switch (elt)
35705 {
35706 case 0:
35707 tmp = vec;
35708 break;
35709
35710 case 1:
35711 case 3:
35712 tmp = gen_reg_rtx (mode);
35713 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
35714 GEN_INT (elt), GEN_INT (elt),
35715 GEN_INT (elt+4), GEN_INT (elt+4)));
35716 break;
35717
35718 case 2:
35719 tmp = gen_reg_rtx (mode);
35720 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
35721 break;
35722
35723 default:
35724 gcc_unreachable ();
35725 }
35726 vec = tmp;
35727 use_vec_extr = true;
35728 elt = 0;
35729 break;
35730
35731 case V4SImode:
35732 use_vec_extr = TARGET_SSE4_1;
35733 if (use_vec_extr)
35734 break;
35735
35736 if (TARGET_SSE2)
35737 {
35738 switch (elt)
35739 {
35740 case 0:
35741 tmp = vec;
35742 break;
35743
35744 case 1:
35745 case 3:
35746 tmp = gen_reg_rtx (mode);
35747 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
35748 GEN_INT (elt), GEN_INT (elt),
35749 GEN_INT (elt), GEN_INT (elt)));
35750 break;
35751
35752 case 2:
35753 tmp = gen_reg_rtx (mode);
35754 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
35755 break;
35756
35757 default:
35758 gcc_unreachable ();
35759 }
35760 vec = tmp;
35761 use_vec_extr = true;
35762 elt = 0;
35763 }
35764 else
35765 {
35766 /* For SSE1, we have to reuse the V4SF code. */
35767 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
35768 gen_lowpart (V4SFmode, vec), elt);
35769 return;
35770 }
35771 break;
35772
35773 case V8HImode:
35774 use_vec_extr = TARGET_SSE2;
35775 break;
35776 case V4HImode:
35777 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35778 break;
35779
35780 case V16QImode:
35781 use_vec_extr = TARGET_SSE4_1;
35782 break;
35783
35784 case V8SFmode:
35785 if (TARGET_AVX)
35786 {
35787 tmp = gen_reg_rtx (V4SFmode);
35788 if (elt < 4)
35789 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
35790 else
35791 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
35792 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35793 return;
35794 }
35795 break;
35796
35797 case V4DFmode:
35798 if (TARGET_AVX)
35799 {
35800 tmp = gen_reg_rtx (V2DFmode);
35801 if (elt < 2)
35802 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
35803 else
35804 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
35805 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35806 return;
35807 }
35808 break;
35809
35810 case V32QImode:
35811 if (TARGET_AVX)
35812 {
35813 tmp = gen_reg_rtx (V16QImode);
35814 if (elt < 16)
35815 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
35816 else
35817 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
35818 ix86_expand_vector_extract (false, target, tmp, elt & 15);
35819 return;
35820 }
35821 break;
35822
35823 case V16HImode:
35824 if (TARGET_AVX)
35825 {
35826 tmp = gen_reg_rtx (V8HImode);
35827 if (elt < 8)
35828 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
35829 else
35830 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
35831 ix86_expand_vector_extract (false, target, tmp, elt & 7);
35832 return;
35833 }
35834 break;
35835
35836 case V8SImode:
35837 if (TARGET_AVX)
35838 {
35839 tmp = gen_reg_rtx (V4SImode);
35840 if (elt < 4)
35841 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
35842 else
35843 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
35844 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35845 return;
35846 }
35847 break;
35848
35849 case V4DImode:
35850 if (TARGET_AVX)
35851 {
35852 tmp = gen_reg_rtx (V2DImode);
35853 if (elt < 2)
35854 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
35855 else
35856 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
35857 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35858 return;
35859 }
35860 break;
35861
35862 case V8QImode:
35863 /* ??? Could extract the appropriate HImode element and shift. */
35864 default:
35865 break;
35866 }
35867
35868 if (use_vec_extr)
35869 {
35870 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
35871 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
35872
35873 /* Let the rtl optimizers know about the zero extension performed. */
35874 if (inner_mode == QImode || inner_mode == HImode)
35875 {
35876 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
35877 target = gen_lowpart (SImode, target);
35878 }
35879
35880 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35881 }
35882 else
35883 {
35884 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35885
35886 emit_move_insn (mem, vec);
35887
35888 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35889 emit_move_insn (target, tmp);
35890 }
35891 }
35892
35893 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
35894 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
35895 The upper bits of DEST are undefined, though they shouldn't cause
35896 exceptions (some bits from src or all zeros are ok). */
35897
35898 static void
35899 emit_reduc_half (rtx dest, rtx src, int i)
35900 {
35901 rtx tem;
35902 switch (GET_MODE (src))
35903 {
35904 case V4SFmode:
35905 if (i == 128)
35906 tem = gen_sse_movhlps (dest, src, src);
35907 else
35908 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
35909 GEN_INT (1 + 4), GEN_INT (1 + 4));
35910 break;
35911 case V2DFmode:
35912 tem = gen_vec_interleave_highv2df (dest, src, src);
35913 break;
35914 case V16QImode:
35915 case V8HImode:
35916 case V4SImode:
35917 case V2DImode:
35918 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
35919 gen_lowpart (V1TImode, src),
35920 GEN_INT (i / 2));
35921 break;
35922 case V8SFmode:
35923 if (i == 256)
35924 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
35925 else
35926 tem = gen_avx_shufps256 (dest, src, src,
35927 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
35928 break;
35929 case V4DFmode:
35930 if (i == 256)
35931 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
35932 else
35933 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
35934 break;
35935 case V32QImode:
35936 case V16HImode:
35937 case V8SImode:
35938 case V4DImode:
35939 if (i == 256)
35940 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
35941 gen_lowpart (V4DImode, src),
35942 gen_lowpart (V4DImode, src),
35943 const1_rtx);
35944 else
35945 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
35946 gen_lowpart (V2TImode, src),
35947 GEN_INT (i / 2));
35948 break;
35949 default:
35950 gcc_unreachable ();
35951 }
35952 emit_insn (tem);
35953 }
35954
35955 /* Expand a vector reduction. FN is the binary pattern to reduce;
35956 DEST is the destination; IN is the input vector. */
35957
35958 void
35959 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
35960 {
35961 rtx half, dst, vec = in;
35962 enum machine_mode mode = GET_MODE (in);
35963 int i;
35964
35965 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
35966 if (TARGET_SSE4_1
35967 && mode == V8HImode
35968 && fn == gen_uminv8hi3)
35969 {
35970 emit_insn (gen_sse4_1_phminposuw (dest, in));
35971 return;
35972 }
35973
35974 for (i = GET_MODE_BITSIZE (mode);
35975 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
35976 i >>= 1)
35977 {
35978 half = gen_reg_rtx (mode);
35979 emit_reduc_half (half, vec, i);
35980 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
35981 dst = dest;
35982 else
35983 dst = gen_reg_rtx (mode);
35984 emit_insn (fn (dst, half, vec));
35985 vec = dst;
35986 }
35987 }
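
/* Illustrative example, not part of GCC: the loop above halves the
   reduction distance each iteration.  For a V4SFmode reduction with
   FN = smax, for instance:

     i = 128:  half = upper 64 bits of IN moved down (movhlps);
	       tmp  = smax (half, in)
     i =  64:  half = lane 1 of TMP copied to lane 0 (shufps);
	       dest = smax (half, tmp)

   so lane 0 of DEST holds the reduction of all four input lanes; the
   remaining lanes are unspecified.  */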
35988 \f
35989 /* Target hook for scalar_mode_supported_p. */
35990 static bool
35991 ix86_scalar_mode_supported_p (enum machine_mode mode)
35992 {
35993 if (DECIMAL_FLOAT_MODE_P (mode))
35994 return default_decimal_float_supported_p ();
35995 else if (mode == TFmode)
35996 return true;
35997 else
35998 return default_scalar_mode_supported_p (mode);
35999 }
36000
36001 /* Implements target hook vector_mode_supported_p. */
36002 static bool
36003 ix86_vector_mode_supported_p (enum machine_mode mode)
36004 {
36005 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36006 return true;
36007 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36008 return true;
36009 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36010 return true;
36011 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36012 return true;
36013 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36014 return true;
36015 return false;
36016 }
36017
36018 /* Target hook for c_mode_for_suffix. */
36019 static enum machine_mode
36020 ix86_c_mode_for_suffix (char suffix)
36021 {
36022 if (suffix == 'q')
36023 return TFmode;
36024 if (suffix == 'w')
36025 return XFmode;
36026
36027 return VOIDmode;
36028 }
36029
36030 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36031
36032 We do this in the new i386 backend to maintain source compatibility
36033 with the old cc0-based compiler. */
36034
36035 static tree
36036 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36037 tree inputs ATTRIBUTE_UNUSED,
36038 tree clobbers)
36039 {
36040 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36041 clobbers);
36042 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36043 clobbers);
36044 return clobbers;
36045 }
36046
36047 /* Implements the target hook targetm.asm.encode_section_info. */
36048
36049 static void ATTRIBUTE_UNUSED
36050 ix86_encode_section_info (tree decl, rtx rtl, int first)
36051 {
36052 default_encode_section_info (decl, rtl, first);
36053
36054 if (TREE_CODE (decl) == VAR_DECL
36055 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
36056 && ix86_in_large_data_p (decl))
36057 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
36058 }
36059
36060 /* Worker function for REVERSE_CONDITION. */
36061
36062 enum rtx_code
36063 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
36064 {
36065 return (mode != CCFPmode && mode != CCFPUmode
36066 ? reverse_condition (code)
36067 : reverse_condition_maybe_unordered (code));
36068 }
36069
36070 /* Output code to perform an x87 FP register move, from OPERANDS[1]
36071 to OPERANDS[0]. */
36072
36073 const char *
36074 output_387_reg_move (rtx insn, rtx *operands)
36075 {
36076 if (REG_P (operands[0]))
36077 {
36078 if (REG_P (operands[1])
36079 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36080 {
36081 if (REGNO (operands[0]) == FIRST_STACK_REG)
36082 return output_387_ffreep (operands, 0);
36083 return "fstp\t%y0";
36084 }
36085 if (STACK_TOP_P (operands[0]))
36086 return "fld%Z1\t%y1";
36087 return "fst\t%y0";
36088 }
36089 else if (MEM_P (operands[0]))
36090 {
36091 gcc_assert (REG_P (operands[1]));
36092 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36093 return "fstp%Z0\t%y0";
36094 else
36095 {
36096 /* There is no non-popping store to memory for XFmode.
36097 So if we need one, follow the store with a load. */
36098 if (GET_MODE (operands[0]) == XFmode)
36099 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
36100 else
36101 return "fst%Z0\t%y0";
36102 }
36103 }
36104 else
36105 gcc_unreachable();
36106 }
36107
36108 /* Output code to perform a conditional jump to LABEL, if C2 flag in
36109 FP status register is set. */
36110
36111 void
36112 ix86_emit_fp_unordered_jump (rtx label)
36113 {
36114 rtx reg = gen_reg_rtx (HImode);
36115 rtx temp;
36116
36117 emit_insn (gen_x86_fnstsw_1 (reg));
36118
36119 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
36120 {
36121 emit_insn (gen_x86_sahf_1 (reg));
36122
36123 temp = gen_rtx_REG (CCmode, FLAGS_REG);
36124 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
36125 }
36126 else
36127 {
36128 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
36129
36130 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
36131 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
36132 }
36133
36134 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
36135 gen_rtx_LABEL_REF (VOIDmode, label),
36136 pc_rtx);
36137 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
36138
36139 emit_jump_insn (temp);
36140 predict_jump (REG_BR_PROB_BASE * 10 / 100);
36141 }
36142
36143 /* Output code to perform a log1p XFmode calculation. */
36144
36145 void ix86_emit_i387_log1p (rtx op0, rtx op1)
36146 {
36147 rtx label1 = gen_label_rtx ();
36148 rtx label2 = gen_label_rtx ();
36149
36150 rtx tmp = gen_reg_rtx (XFmode);
36151 rtx tmp2 = gen_reg_rtx (XFmode);
36152 rtx test;
36153
36154 emit_insn (gen_absxf2 (tmp, op1));
36155 test = gen_rtx_GE (VOIDmode, tmp,
36156 CONST_DOUBLE_FROM_REAL_VALUE (
36157 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
36158 XFmode));
36159 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
36160
36161 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36162 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
36163 emit_jump (label2);
36164
36165 emit_label (label1);
36166 emit_move_insn (tmp, CONST1_RTX (XFmode));
36167 emit_insn (gen_addxf3 (tmp, op1, tmp));
36168 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36169 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
36170
36171 emit_label (label2);
36172 }
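
/* Illustrative note, not part of GCC: fyl2xp1 computes
   ST1 * log2 (ST0 + 1) but is only accurate for
   |ST0| < 1 - sqrt(2)/2 ~= 0.29289..., which is exactly the constant
   tested above.  With the fldln2 constant loaded as the multiplier,
   the two branches compute

     |op1| <  0.2928...:  op0 = ln(2) * log2 (op1 + 1.0)   (fyl2xp1)
     |op1| >= 0.2928...:  op0 = ln(2) * log2 (1.0 + op1)   (fadd; fyl2x)

   and ln(2) * log2 (1 + x) == ln (1 + x) == log1p (x).  Using fyl2xp1
   on the small-magnitude branch avoids the cancellation that forming
   1.0 + op1 explicitly would cause for tiny op1.  */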
36173
36174 /* Emit code for round calculation. */
36175 void ix86_emit_i387_round (rtx op0, rtx op1)
36176 {
36177 enum machine_mode inmode = GET_MODE (op1);
36178 enum machine_mode outmode = GET_MODE (op0);
36179 rtx e1, e2, res, tmp, tmp1, half;
36180 rtx scratch = gen_reg_rtx (HImode);
36181 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
36182 rtx jump_label = gen_label_rtx ();
36183 rtx insn;
36184 rtx (*gen_abs) (rtx, rtx);
36185 rtx (*gen_neg) (rtx, rtx);
36186
36187 switch (inmode)
36188 {
36189 case SFmode:
36190 gen_abs = gen_abssf2;
36191 break;
36192 case DFmode:
36193 gen_abs = gen_absdf2;
36194 break;
36195 case XFmode:
36196 gen_abs = gen_absxf2;
36197 break;
36198 default:
36199 gcc_unreachable ();
36200 }
36201
36202 switch (outmode)
36203 {
36204 case SFmode:
36205 gen_neg = gen_negsf2;
36206 break;
36207 case DFmode:
36208 gen_neg = gen_negdf2;
36209 break;
36210 case XFmode:
36211 gen_neg = gen_negxf2;
36212 break;
36213 case HImode:
36214 gen_neg = gen_neghi2;
36215 break;
36216 case SImode:
36217 gen_neg = gen_negsi2;
36218 break;
36219 case DImode:
36220 gen_neg = gen_negdi2;
36221 break;
36222 default:
36223 gcc_unreachable ();
36224 }
36225
36226 e1 = gen_reg_rtx (inmode);
36227 e2 = gen_reg_rtx (inmode);
36228 res = gen_reg_rtx (outmode);
36229
36230 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
36231
36232 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
36233
36234 /* scratch = fxam(op1) */
36235 emit_insn (gen_rtx_SET (VOIDmode, scratch,
36236 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
36237 UNSPEC_FXAM)));
36238 /* e1 = fabs(op1) */
36239 emit_insn (gen_abs (e1, op1));
36240
36241 /* e2 = e1 + 0.5 */
36242 half = force_reg (inmode, half);
36243 emit_insn (gen_rtx_SET (VOIDmode, e2,
36244 gen_rtx_PLUS (inmode, e1, half)));
36245
36246 /* res = floor(e2) */
36247 if (inmode != XFmode)
36248 {
36249 tmp1 = gen_reg_rtx (XFmode);
36250
36251 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
36252 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
36253 }
36254 else
36255 tmp1 = e2;
36256
36257 switch (outmode)
36258 {
36259 case SFmode:
36260 case DFmode:
36261 {
36262 rtx tmp0 = gen_reg_rtx (XFmode);
36263
36264 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
36265
36266 emit_insn (gen_rtx_SET (VOIDmode, res,
36267 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
36268 UNSPEC_TRUNC_NOOP)));
36269 }
36270 break;
36271 case XFmode:
36272 emit_insn (gen_frndintxf2_floor (res, tmp1));
36273 break;
36274 case HImode:
36275 emit_insn (gen_lfloorxfhi2 (res, tmp1));
36276 break;
36277 case SImode:
36278 emit_insn (gen_lfloorxfsi2 (res, tmp1));
36279 break;
36280 case DImode:
36281 emit_insn (gen_lfloorxfdi2 (res, tmp1));
36282 break;
36283 default:
36284 gcc_unreachable ();
36285 }
36286
36287 /* flags = signbit(a) */
36288 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
36289
36290 /* if (flags) then res = -res */
36291 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
36292 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
36293 gen_rtx_LABEL_REF (VOIDmode, jump_label),
36294 pc_rtx);
36295 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36296 predict_jump (REG_BR_PROB_BASE * 50 / 100);
36297 JUMP_LABEL (insn) = jump_label;
36298
36299 emit_insn (gen_neg (res, res));
36300
36301 emit_label (jump_label);
36302 LABEL_NUSES (jump_label) = 1;
36303
36304 emit_move_insn (op0, res);
36305 }
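
/* Illustrative example, not part of GCC: for op1 = -2.5 the sequence
   above computes fabs -> 2.5, adds 0.5 -> 3.0, takes the floor -> 3,
   and then negates because fxam reported a set sign bit, giving
   round (-2.5) = -3; halfway cases are rounded away from zero, as the
   formula in the comment above requires.  */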
36306
36307 /* Output code to perform a Newton-Raphson approximation of a single precision
36308 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
36309
36310 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
36311 {
36312 rtx x0, x1, e0, e1;
36313
36314 x0 = gen_reg_rtx (mode);
36315 e0 = gen_reg_rtx (mode);
36316 e1 = gen_reg_rtx (mode);
36317 x1 = gen_reg_rtx (mode);
36318
36319 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
36320
36321 b = force_reg (mode, b);
36322
36323 /* x0 = rcp(b) estimate */
36324 emit_insn (gen_rtx_SET (VOIDmode, x0,
36325 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
36326 UNSPEC_RCP)));
36327 /* e0 = x0 * b */
36328 emit_insn (gen_rtx_SET (VOIDmode, e0,
36329 gen_rtx_MULT (mode, x0, b)));
36330
36331 /* e0 = x0 * e0 */
36332 emit_insn (gen_rtx_SET (VOIDmode, e0,
36333 gen_rtx_MULT (mode, x0, e0)));
36334
36335 /* e1 = x0 + x0 */
36336 emit_insn (gen_rtx_SET (VOIDmode, e1,
36337 gen_rtx_PLUS (mode, x0, x0)));
36338
36339 /* x1 = e1 - e0 */
36340 emit_insn (gen_rtx_SET (VOIDmode, x1,
36341 gen_rtx_MINUS (mode, e1, e0)));
36342
36343 /* res = a * x1 */
36344 emit_insn (gen_rtx_SET (VOIDmode, res,
36345 gen_rtx_MULT (mode, a, x1)));
36346 }
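
/* Illustrative derivation, not part of GCC: with x0 ~ 1/b the
   sequence above is one Newton-Raphson refinement of the reciprocal,

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0) = e1 - e0,

   followed by res = a * x1.  One such step roughly doubles the number
   of correct bits, so the ~12-bit rcpss estimate becomes close to
   full single precision.  */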
36347
36348 /* Output code to perform a Newton-Raphson approximation of a
36349 single precision floating point [reciprocal] square root. */
36350
36351 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
36352 bool recip)
36353 {
36354 rtx x0, e0, e1, e2, e3, mthree, mhalf;
36355 REAL_VALUE_TYPE r;
36356
36357 x0 = gen_reg_rtx (mode);
36358 e0 = gen_reg_rtx (mode);
36359 e1 = gen_reg_rtx (mode);
36360 e2 = gen_reg_rtx (mode);
36361 e3 = gen_reg_rtx (mode);
36362
36363 real_from_integer (&r, VOIDmode, -3, -1, 0);
36364 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36365
36366 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
36367 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36368
36369 if (VECTOR_MODE_P (mode))
36370 {
36371 mthree = ix86_build_const_vector (mode, true, mthree);
36372 mhalf = ix86_build_const_vector (mode, true, mhalf);
36373 }
36374
36375 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
36376 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
36377
36378 a = force_reg (mode, a);
36379
36380 /* x0 = rsqrt(a) estimate */
36381 emit_insn (gen_rtx_SET (VOIDmode, x0,
36382 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
36383 UNSPEC_RSQRT)));
36384
36385 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
36386 if (!recip)
36387 {
36388 rtx zero, mask;
36389
36390 zero = gen_reg_rtx (mode);
36391 mask = gen_reg_rtx (mode);
36392
36393 zero = force_reg (mode, CONST0_RTX(mode));
36394 emit_insn (gen_rtx_SET (VOIDmode, mask,
36395 gen_rtx_NE (mode, zero, a)));
36396
36397 emit_insn (gen_rtx_SET (VOIDmode, x0,
36398 gen_rtx_AND (mode, x0, mask)));
36399 }
36400
36401 /* e0 = x0 * a */
36402 emit_insn (gen_rtx_SET (VOIDmode, e0,
36403 gen_rtx_MULT (mode, x0, a)));
36404 /* e1 = e0 * x0 */
36405 emit_insn (gen_rtx_SET (VOIDmode, e1,
36406 gen_rtx_MULT (mode, e0, x0)));
36407
36408 /* e2 = e1 - 3. */
36409 mthree = force_reg (mode, mthree);
36410 emit_insn (gen_rtx_SET (VOIDmode, e2,
36411 gen_rtx_PLUS (mode, e1, mthree)));
36412
36413 mhalf = force_reg (mode, mhalf);
36414 if (recip)
36415 /* e3 = -.5 * x0 */
36416 emit_insn (gen_rtx_SET (VOIDmode, e3,
36417 gen_rtx_MULT (mode, x0, mhalf)));
36418 else
36419 /* e3 = -.5 * e0 */
36420 emit_insn (gen_rtx_SET (VOIDmode, e3,
36421 gen_rtx_MULT (mode, e0, mhalf)));
36422 /* ret = e2 * e3 */
36423 emit_insn (gen_rtx_SET (VOIDmode, res,
36424 gen_rtx_MULT (mode, e2, e3)));
36425 }
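
/* Illustrative derivation, not part of GCC: with x0 ~ 1/sqrt(a) the
   Newton-Raphson refinement for rsqrt is

     x1 = 0.5 * x0 * (3 - a * x0 * x0)
	= -0.5 * x0 * (a * x0 * x0 - 3)

   which is e3 * e2 with e0 = a * x0, e1 = e0 * x0, e2 = e1 - 3 and
   e3 = -0.5 * x0, exactly as emitted above for the RECIP case.
   Multiplying by a turns it into the sqrt estimate used otherwise:

     sqrt(a) ~ a * x1 = -0.5 * (a * x0) * (a * x0 * x0 - 3) = e3 * e2

   with e3 = -0.5 * e0 instead.  */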
36426
36427 #ifdef TARGET_SOLARIS
36428 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
36429
36430 static void
36431 i386_solaris_elf_named_section (const char *name, unsigned int flags,
36432 tree decl)
36433 {
36434 /* With Binutils 2.15, the "@unwind" marker must be specified on
36435 every occurrence of the ".eh_frame" section, not just the first
36436 one. */
36437 if (TARGET_64BIT
36438 && strcmp (name, ".eh_frame") == 0)
36439 {
36440 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
36441 flags & SECTION_WRITE ? "aw" : "a");
36442 return;
36443 }
36444
36445 #ifndef USE_GAS
36446 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
36447 {
36448 solaris_elf_asm_comdat_section (name, flags, decl);
36449 return;
36450 }
36451 #endif
36452
36453 default_elf_asm_named_section (name, flags, decl);
36454 }
36455 #endif /* TARGET_SOLARIS */
36456
36457 /* Return the mangling of TYPE if it is an extended fundamental type. */
36458
36459 static const char *
36460 ix86_mangle_type (const_tree type)
36461 {
36462 type = TYPE_MAIN_VARIANT (type);
36463
36464 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
36465 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
36466 return NULL;
36467
36468 switch (TYPE_MODE (type))
36469 {
36470 case TFmode:
36471 /* __float128 is "g". */
36472 return "g";
36473 case XFmode:
36474 /* "long double" or __float80 is "e". */
36475 return "e";
36476 default:
36477 return NULL;
36478 }
36479 }
36480
36481 /* For 32-bit code we can save PIC register setup by using
36482 __stack_chk_fail_local hidden function instead of calling
36483 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
36484 register, so it is better to call __stack_chk_fail directly. */
36485
36486 static tree ATTRIBUTE_UNUSED
36487 ix86_stack_protect_fail (void)
36488 {
36489 return TARGET_64BIT
36490 ? default_external_stack_protect_fail ()
36491 : default_hidden_stack_protect_fail ();
36492 }
36493
36494 /* Select a format to encode pointers in exception handling data. CODE
36495 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
36496 true if the symbol may be affected by dynamic relocations.
36497
36498 ??? All x86 object file formats are capable of representing this.
36499 After all, the relocation needed is the same as for the call insn.
36500 Whether or not a particular assembler allows us to enter such, I
36501 guess we'll have to see. */
36502 int
36503 asm_preferred_eh_data_format (int code, int global)
36504 {
36505 if (flag_pic)
36506 {
36507 int type = DW_EH_PE_sdata8;
36508 if (!TARGET_64BIT
36509 || ix86_cmodel == CM_SMALL_PIC
36510 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
36511 type = DW_EH_PE_sdata4;
36512 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
36513 }
36514 if (ix86_cmodel == CM_SMALL
36515 || (ix86_cmodel == CM_MEDIUM && code))
36516 return DW_EH_PE_udata4;
36517 return DW_EH_PE_absptr;
36518 }
36519 \f
36520 /* Expand copysign from SIGN to the positive value ABS_VALUE
36521 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
36522 the sign-bit. */
36523 static void
36524 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
36525 {
36526 enum machine_mode mode = GET_MODE (sign);
36527 rtx sgn = gen_reg_rtx (mode);
36528 if (mask == NULL_RTX)
36529 {
36530 enum machine_mode vmode;
36531
36532 if (mode == SFmode)
36533 vmode = V4SFmode;
36534 else if (mode == DFmode)
36535 vmode = V2DFmode;
36536 else
36537 vmode = mode;
36538
36539 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
36540 if (!VECTOR_MODE_P (mode))
36541 {
36542 /* We need to generate a scalar mode mask in this case. */
36543 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36544 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36545 mask = gen_reg_rtx (mode);
36546 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36547 }
36548 }
36549 else
36550 mask = gen_rtx_NOT (mode, mask);
36551 emit_insn (gen_rtx_SET (VOIDmode, sgn,
36552 gen_rtx_AND (mode, mask, sign)));
36553 emit_insn (gen_rtx_SET (VOIDmode, result,
36554 gen_rtx_IOR (mode, abs_value, sgn)));
36555 }
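
/* Illustrative sketch, not part of GCC: the sequence above is the
   usual bitwise copysign applied to a non-negative ABS_VALUE,

     result = abs_value | (sign & SIGNBIT)

   where SIGNBIT is either the complement of the caller-supplied MASK
   or a freshly built sign-bit constant.  A scalar C model for SFmode:  */
#if 0
#include <stdint.h>
#include <string.h>

static float
copysign_to_positive (float abs_value, float sign)
{
  uint32_t a, s;
  memcpy (&a, &abs_value, sizeof a);
  memcpy (&s, &sign, sizeof s);
  a |= s & 0x80000000u;            /* OR in the sign bit of SIGN */
  memcpy (&abs_value, &a, sizeof a);
  return abs_value;
}
#endif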
36556
36557 /* Expand fabs (OP0) and return a new rtx that holds the result. The
36558 mask for masking out the sign-bit is stored in *SMASK, if that is
36559 non-null. */
36560 static rtx
36561 ix86_expand_sse_fabs (rtx op0, rtx *smask)
36562 {
36563 enum machine_mode vmode, mode = GET_MODE (op0);
36564 rtx xa, mask;
36565
36566 xa = gen_reg_rtx (mode);
36567 if (mode == SFmode)
36568 vmode = V4SFmode;
36569 else if (mode == DFmode)
36570 vmode = V2DFmode;
36571 else
36572 vmode = mode;
36573 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
36574 if (!VECTOR_MODE_P (mode))
36575 {
36576 /* We need to generate a scalar mode mask in this case. */
36577 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36578 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36579 mask = gen_reg_rtx (mode);
36580 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36581 }
36582 emit_insn (gen_rtx_SET (VOIDmode, xa,
36583 gen_rtx_AND (mode, op0, mask)));
36584
36585 if (smask)
36586 *smask = mask;
36587
36588 return xa;
36589 }
36590
36591 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
36592 swapping the operands if SWAP_OPERANDS is true. The expanded
36593 code is a forward jump to a newly created label in case the
36594 comparison is true. The generated label rtx is returned. */
36595 static rtx
36596 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
36597 bool swap_operands)
36598 {
36599 rtx label, tmp;
36600
36601 if (swap_operands)
36602 {
36603 tmp = op0;
36604 op0 = op1;
36605 op1 = tmp;
36606 }
36607
36608 label = gen_label_rtx ();
36609 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
36610 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36611 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
36612 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
36613 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
36614 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
36615 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36616 JUMP_LABEL (tmp) = label;
36617
36618 return label;
36619 }
36620
36621 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
36622 using comparison code CODE. Operands are swapped for the comparison if
36623 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
36624 static rtx
36625 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
36626 bool swap_operands)
36627 {
36628 rtx (*insn)(rtx, rtx, rtx, rtx);
36629 enum machine_mode mode = GET_MODE (op0);
36630 rtx mask = gen_reg_rtx (mode);
36631
36632 if (swap_operands)
36633 {
36634 rtx tmp = op0;
36635 op0 = op1;
36636 op1 = tmp;
36637 }
36638
36639 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
36640
36641 emit_insn (insn (mask, op0, op1,
36642 gen_rtx_fmt_ee (code, mode, op0, op1)));
36643 return mask;
36644 }
36645
36646 /* Generate and return a rtx of mode MODE for 2**n where n is the number
36647 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
36648 static rtx
36649 ix86_gen_TWO52 (enum machine_mode mode)
36650 {
36651 REAL_VALUE_TYPE TWO52r;
36652 rtx TWO52;
36653
36654 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
36655 TWO52 = const_double_from_real_value (TWO52r, mode);
36656 TWO52 = force_reg (mode, TWO52);
36657
36658 return TWO52;
36659 }
36660
36661 /* Expand SSE sequence for computing lround from OP1 storing
36662 into OP0. */
36663 void
36664 ix86_expand_lround (rtx op0, rtx op1)
36665 {
36666 /* C code for the stuff we're doing below:
36667 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
36668 return (long)tmp;
36669 */
36670 enum machine_mode mode = GET_MODE (op1);
36671 const struct real_format *fmt;
36672 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36673 rtx adj;
36674
36675 /* load nextafter (0.5, 0.0) */
36676 fmt = REAL_MODE_FORMAT (mode);
36677 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36678 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36679
36680 /* adj = copysign (0.5, op1) */
36681 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
36682 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
36683
36684 /* adj = op1 + adj */
36685 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
36686
36687 /* op0 = (imode)adj */
36688 expand_fix (op0, adj, 0);
36689 }
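
/* Illustrative note, not part of GCC: the addend is
   nextafter (0.5, 0.0) rather than 0.5 so that an input just below a
   halfway point (e.g. the largest float smaller than 0.5) is not
   pushed over it by the rounding of the addition; with a plain 0.5
   addend that input would sum to 1.0 and lround would return 1
   instead of 0.  Halfway cases themselves still end up rounded away
   from zero, which is what lround requires.  */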
36690
36691 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
36692 into OPERAND0. */
36693 void
36694 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
36695 {
36696 /* C code for the stuff we're doing below (for do_floor):
36697 xi = (long)op1;
36698 xi -= (double)xi > op1 ? 1 : 0;
36699 return xi;
36700 */
36701 enum machine_mode fmode = GET_MODE (op1);
36702 enum machine_mode imode = GET_MODE (op0);
36703 rtx ireg, freg, label, tmp;
36704
36705 /* reg = (long)op1 */
36706 ireg = gen_reg_rtx (imode);
36707 expand_fix (ireg, op1, 0);
36708
36709 /* freg = (double)reg */
36710 freg = gen_reg_rtx (fmode);
36711 expand_float (freg, ireg, 0);
36712
36713 /* ireg = (freg > op1) ? ireg - 1 : ireg */
36714 label = ix86_expand_sse_compare_and_jump (UNLE,
36715 freg, op1, !do_floor);
36716 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
36717 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
36718 emit_move_insn (ireg, tmp);
36719
36720 emit_label (label);
36721 LABEL_NUSES (label) = 1;
36722
36723 emit_move_insn (op0, ireg);
36724 }
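
/* Illustrative example, not part of GCC: for DO_FLOOR and op1 = -1.5
   the sequence above truncates towards zero to xi = -1, converts it
   back to -1.0, and since -1.0 > -1.5 the compensation subtracts 1,
   giving lfloor (-1.5) = -2.  For the ceiling variant the comparison
   operands are swapped and 1 is added instead.  */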
36725
36726 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
36727 result in OPERAND0. */
36728 void
36729 ix86_expand_rint (rtx operand0, rtx operand1)
36730 {
36731 /* C code for the stuff we're doing below:
36732 xa = fabs (operand1);
36733 if (!isless (xa, 2**52))
36734 return operand1;
36735 xa = xa + 2**52 - 2**52;
36736 return copysign (xa, operand1);
36737 */
36738 enum machine_mode mode = GET_MODE (operand0);
36739 rtx res, xa, label, TWO52, mask;
36740
36741 res = gen_reg_rtx (mode);
36742 emit_move_insn (res, operand1);
36743
36744 /* xa = abs (operand1) */
36745 xa = ix86_expand_sse_fabs (res, &mask);
36746
36747 /* if (!isless (xa, TWO52)) goto label; */
36748 TWO52 = ix86_gen_TWO52 (mode);
36749 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36750
36751 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36752 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36753
36754 ix86_sse_copysign_to_positive (res, xa, res, mask);
36755
36756 emit_label (label);
36757 LABEL_NUSES (label) = 1;
36758
36759 emit_move_insn (operand0, res);
36760 }
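
/* Illustrative note, not part of GCC: adding and then subtracting
   2**52 (2**23 for SFmode) rounds to an integer in the current
   rounding mode: for |xa| < 2**52 the sum xa + 2**52 has no fraction
   bits left in the significand, so the addition itself performs the
   rounding and the subtraction recovers the integral value exactly.
   E.g. xa = 3.7 gives 3.7 + 2**52 -> 4 + 2**52, and subtracting 2**52
   leaves 4.0.  Inputs with |x| >= 2**52 are already integral and take
   the early exit above.  */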
36761
36762 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36763 into OPERAND0. */
36764 void
36765 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
36766 {
36767 /* C code for the stuff we expand below.
36768 double xa = fabs (x), x2;
36769 if (!isless (xa, TWO52))
36770 return x;
36771 xa = xa + TWO52 - TWO52;
36772 x2 = copysign (xa, x);
36773 Compensate. Floor:
36774 if (x2 > x)
36775 x2 -= 1;
36776 Compensate. Ceil:
36777 if (x2 < x)
36778 x2 -= -1;
36779 return x2;
36780 */
36781 enum machine_mode mode = GET_MODE (operand0);
36782 rtx xa, TWO52, tmp, label, one, res, mask;
36783
36784 TWO52 = ix86_gen_TWO52 (mode);
36785
36786 /* Temporary for holding the result, initialized to the input
36787 operand to ease control flow. */
36788 res = gen_reg_rtx (mode);
36789 emit_move_insn (res, operand1);
36790
36791 /* xa = abs (operand1) */
36792 xa = ix86_expand_sse_fabs (res, &mask);
36793
36794 /* if (!isless (xa, TWO52)) goto label; */
36795 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36796
36797 /* xa = xa + TWO52 - TWO52; */
36798 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36799 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36800
36801 /* xa = copysign (xa, operand1) */
36802 ix86_sse_copysign_to_positive (xa, xa, res, mask);
36803
36804 /* generate 1.0 or -1.0 */
36805 one = force_reg (mode,
36806 const_double_from_real_value (do_floor
36807 ? dconst1 : dconstm1, mode));
36808
36809 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36810 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36811 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36812 gen_rtx_AND (mode, one, tmp)));
36813 /* We always need to subtract here to preserve signed zero. */
36814 tmp = expand_simple_binop (mode, MINUS,
36815 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36816 emit_move_insn (res, tmp);
36817
36818 emit_label (label);
36819 LABEL_NUSES (label) = 1;
36820
36821 emit_move_insn (operand0, res);
36822 }
36823
36824 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36825 into OPERAND0. */
36826 void
36827 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
36828 {
36829 /* C code for the stuff we expand below.
36830 double xa = fabs (x), x2;
36831 if (!isless (xa, TWO52))
36832 return x;
36833 x2 = (double)(long)x;
36834 Compensate. Floor:
36835 if (x2 > x)
36836 x2 -= 1;
36837 Compensate. Ceil:
36838 if (x2 < x)
36839 x2 += 1;
36840 if (HONOR_SIGNED_ZEROS (mode))
36841 return copysign (x2, x);
36842 return x2;
36843 */
36844 enum machine_mode mode = GET_MODE (operand0);
36845 rtx xa, xi, TWO52, tmp, label, one, res, mask;
36846
36847 TWO52 = ix86_gen_TWO52 (mode);
36848
36849 /* Temporary for holding the result, initialized to the input
36850 operand to ease control flow. */
36851 res = gen_reg_rtx (mode);
36852 emit_move_insn (res, operand1);
36853
36854 /* xa = abs (operand1) */
36855 xa = ix86_expand_sse_fabs (res, &mask);
36856
36857 /* if (!isless (xa, TWO52)) goto label; */
36858 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36859
36860 /* xa = (double)(long)x */
36861 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36862 expand_fix (xi, res, 0);
36863 expand_float (xa, xi, 0);
36864
36865 /* generate 1.0 */
36866 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36867
36868 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36869 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36870 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36871 gen_rtx_AND (mode, one, tmp)));
36872 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
36873 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36874 emit_move_insn (res, tmp);
36875
36876 if (HONOR_SIGNED_ZEROS (mode))
36877 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36878
36879 emit_label (label);
36880 LABEL_NUSES (label) = 1;
36881
36882 emit_move_insn (operand0, res);
36883 }
36884
36885 /* Expand SSE sequence for computing round from OPERAND1 storing
36886    into OPERAND0.  A sequence that works without relying on DImode truncation
36887    via cvttsd2siq, which is only available on 64-bit targets.  */
36888 void
36889 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
36890 {
36891 /* C code for the stuff we expand below.
36892 double xa = fabs (x), xa2, x2;
36893 if (!isless (xa, TWO52))
36894 return x;
36895 	Using the absolute value and copying the sign back afterwards keeps
36896 	-0.0 -> -0.0 correct.
36897 xa2 = xa + TWO52 - TWO52;
36898 Compensate.
36899 dxa = xa2 - xa;
36900 if (dxa <= -0.5)
36901 xa2 += 1;
36902 else if (dxa > 0.5)
36903 xa2 -= 1;
36904 x2 = copysign (xa2, x);
36905 return x2;
36906 */
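  /* A worked example of the dxa compensation, assuming the default
     round-to-nearest-even mode:
	x = 2.5:   xa = 2.5, xa2 = 2.5 + 2**52 - 2**52 = 2.0,
		   dxa = -0.5, dxa <= -0.5, so xa2 becomes 3.0 and the
		   copysign yields 3.0 (halfway cases away from zero).
	x = -1.5:  xa = 1.5, xa2 = 2.0, dxa = 0.5, neither dxa <= -0.5
		   nor dxa > 0.5 holds, xa2 stays 2.0 and the copysign
		   yields -2.0.
     This is only a sketch of the intent; the actual tests are done with
     the unordered UNGT/UNGE masks generated below.  */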
36907 enum machine_mode mode = GET_MODE (operand0);
36908 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
36909
36910 TWO52 = ix86_gen_TWO52 (mode);
36911
36912 /* Temporary for holding the result, initialized to the input
36913 operand to ease control flow. */
36914 res = gen_reg_rtx (mode);
36915 emit_move_insn (res, operand1);
36916
36917 /* xa = abs (operand1) */
36918 xa = ix86_expand_sse_fabs (res, &mask);
36919
36920 /* if (!isless (xa, TWO52)) goto label; */
36921 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36922
36923 /* xa2 = xa + TWO52 - TWO52; */
36924 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36925 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
36926
36927 /* dxa = xa2 - xa; */
36928 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
36929
36930 /* generate 0.5, 1.0 and -0.5 */
36931 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
36932 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
36933 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
36934 0, OPTAB_DIRECT);
36935
36936 /* Compensate. */
36937 tmp = gen_reg_rtx (mode);
36938 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
36939 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
36940 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36941 gen_rtx_AND (mode, one, tmp)));
36942 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36943 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
36944 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
36945 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36946 gen_rtx_AND (mode, one, tmp)));
36947 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36948
36949 /* res = copysign (xa2, operand1) */
36950 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
36951
36952 emit_label (label);
36953 LABEL_NUSES (label) = 1;
36954
36955 emit_move_insn (operand0, res);
36956 }
36957
36958 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36959 into OPERAND0. */
36960 void
36961 ix86_expand_trunc (rtx operand0, rtx operand1)
36962 {
36963 /* C code for SSE variant we expand below.
36964 double xa = fabs (x), x2;
36965 if (!isless (xa, TWO52))
36966 return x;
36967 x2 = (double)(long)x;
36968 if (HONOR_SIGNED_ZEROS (mode))
36969 return copysign (x2, x);
36970 return x2;
36971 */
36972 enum machine_mode mode = GET_MODE (operand0);
36973 rtx xa, xi, TWO52, label, res, mask;
36974
36975 TWO52 = ix86_gen_TWO52 (mode);
36976
36977 /* Temporary for holding the result, initialized to the input
36978 operand to ease control flow. */
36979 res = gen_reg_rtx (mode);
36980 emit_move_insn (res, operand1);
36981
36982 /* xa = abs (operand1) */
36983 xa = ix86_expand_sse_fabs (res, &mask);
36984
36985 /* if (!isless (xa, TWO52)) goto label; */
36986 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36987
36988 /* x = (double)(long)x */
36989 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36990 expand_fix (xi, res, 0);
36991 expand_float (res, xi, 0);
36992
36993 if (HONOR_SIGNED_ZEROS (mode))
36994 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36995
36996 emit_label (label);
36997 LABEL_NUSES (label) = 1;
36998
36999 emit_move_insn (operand0, res);
37000 }
37001
37002 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37003 into OPERAND0. */
37004 void
37005 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37006 {
37007 enum machine_mode mode = GET_MODE (operand0);
37008 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37009
37010 /* C code for SSE variant we expand below.
37011 double xa = fabs (x), x2;
37012 if (!isless (xa, TWO52))
37013 return x;
37014 xa2 = xa + TWO52 - TWO52;
37015 Compensate:
37016 if (xa2 > xa)
37017 xa2 -= 1.0;
37018 x2 = copysign (xa2, x);
37019 return x2;
37020 */
37021
37022 TWO52 = ix86_gen_TWO52 (mode);
37023
37024 /* Temporary for holding the result, initialized to the input
37025 operand to ease control flow. */
37026 res = gen_reg_rtx (mode);
37027 emit_move_insn (res, operand1);
37028
37029 /* xa = abs (operand1) */
37030 xa = ix86_expand_sse_fabs (res, &smask);
37031
37032 /* if (!isless (xa, TWO52)) goto label; */
37033 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37034
37035 /* res = xa + TWO52 - TWO52; */
37036 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37037 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37038 emit_move_insn (res, tmp);
37039
37040 /* generate 1.0 */
37041 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37042
37043 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37044 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37045 emit_insn (gen_rtx_SET (VOIDmode, mask,
37046 gen_rtx_AND (mode, mask, one)));
37047 tmp = expand_simple_binop (mode, MINUS,
37048 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
37049 emit_move_insn (res, tmp);
37050
37051 /* res = copysign (res, operand1) */
37052 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
37053
37054 emit_label (label);
37055 LABEL_NUSES (label) = 1;
37056
37057 emit_move_insn (operand0, res);
37058 }
37059
37060 /* Expand SSE sequence for computing round from OPERAND1 storing
37061 into OPERAND0. */
37062 void
37063 ix86_expand_round (rtx operand0, rtx operand1)
37064 {
37065 /* C code for the stuff we're doing below:
37066 double xa = fabs (x);
37067 if (!isless (xa, TWO52))
37068 return x;
37069 xa = (double)(long)(xa + nextafter (0.5, 0.0));
37070 return copysign (xa, x);
37071 */
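  /* Why nextafter (0.5, 0.0) rather than 0.5: for the largest DFmode
     value strictly below 0.5 (x = 0.5 - 2**-54), x + 0.5 rounds up to
     1.0, so truncating would give round (x) == 1 instead of 0.  Adding
     the predecessor of 0.5 keeps the sum below 1.0 and the truncation
     below then yields the expected 0.  This is an explanatory aside
     only; pred_half is computed exactly as 0.5 - 2**(-p-1) via
     REAL_ARITHMETIC below.  */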
37072 enum machine_mode mode = GET_MODE (operand0);
37073 rtx res, TWO52, xa, label, xi, half, mask;
37074 const struct real_format *fmt;
37075 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37076
37077 /* Temporary for holding the result, initialized to the input
37078 operand to ease control flow. */
37079 res = gen_reg_rtx (mode);
37080 emit_move_insn (res, operand1);
37081
37082 TWO52 = ix86_gen_TWO52 (mode);
37083 xa = ix86_expand_sse_fabs (res, &mask);
37084 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37085
37086 /* load nextafter (0.5, 0.0) */
37087 fmt = REAL_MODE_FORMAT (mode);
37088 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37089 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37090
37091 /* xa = xa + 0.5 */
37092 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
37093 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
37094
37095 /* xa = (double)(int64_t)xa */
37096 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37097 expand_fix (xi, xa, 0);
37098 expand_float (xa, xi, 0);
37099
37100 /* res = copysign (xa, operand1) */
37101 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
37102
37103 emit_label (label);
37104 LABEL_NUSES (label) = 1;
37105
37106 emit_move_insn (operand0, res);
37107 }
37108
37109 /* Expand SSE sequence for computing round
37110 from OP1 storing into OP0 using sse4 round insn. */
37111 void
37112 ix86_expand_round_sse4 (rtx op0, rtx op1)
37113 {
37114 enum machine_mode mode = GET_MODE (op0);
37115 rtx e1, e2, res, half;
37116 const struct real_format *fmt;
37117 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37118 rtx (*gen_copysign) (rtx, rtx, rtx);
37119 rtx (*gen_round) (rtx, rtx, rtx);
37120
37121 switch (mode)
37122 {
37123 case SFmode:
37124 gen_copysign = gen_copysignsf3;
37125 gen_round = gen_sse4_1_roundsf2;
37126 break;
37127 case DFmode:
37128 gen_copysign = gen_copysigndf3;
37129 gen_round = gen_sse4_1_rounddf2;
37130 break;
37131 default:
37132 gcc_unreachable ();
37133 }
37134
37135 /* round (a) = trunc (a + copysign (0.5, a)) */
37136
37137 /* load nextafter (0.5, 0.0) */
37138 fmt = REAL_MODE_FORMAT (mode);
37139 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37140 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37141 half = const_double_from_real_value (pred_half, mode);
37142
37143 /* e1 = copysign (0.5, op1) */
37144 e1 = gen_reg_rtx (mode);
37145 emit_insn (gen_copysign (e1, half, op1));
37146
37147 /* e2 = op1 + e1 */
37148 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
37149
37150 /* res = trunc (e2) */
37151 res = gen_reg_rtx (mode);
37152 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
37153
37154 emit_move_insn (op0, res);
37155 }
37156 \f
37157
37158 /* Table of valid machine attributes. */
37159 static const struct attribute_spec ix86_attribute_table[] =
37160 {
37161 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
37162 affects_type_identity } */
37163 /* Stdcall attribute says callee is responsible for popping arguments
37164 if they are not variable. */
37165 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37166 true },
37167 /* Fastcall attribute says callee is responsible for popping arguments
37168 if they are not variable. */
37169 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37170 true },
37171 /* Thiscall attribute says callee is responsible for popping arguments
37172 if they are not variable. */
37173 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37174 true },
37175 /* Cdecl attribute says the callee is a normal C declaration */
37176 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37177 true },
37178 /* Regparm attribute specifies how many integer arguments are to be
37179 passed in registers. */
37180 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
37181 true },
37182 /* Sseregparm attribute says we are using x86_64 calling conventions
37183 for FP arguments. */
37184 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37185 true },
37186 /* The transactional memory builtins are implicitly regparm or fastcall
37187 depending on the ABI. Override the generic do-nothing attribute that
37188 these builtins were declared with. */
37189 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
37190 true },
37191 /* force_align_arg_pointer says this function realigns the stack at entry. */
37192 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
37193 false, true, true, ix86_handle_cconv_attribute, false },
37194 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37195 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
37196 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
37197 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
37198 false },
37199 #endif
37200 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37201 false },
37202 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37203 false },
37204 #ifdef SUBTARGET_ATTRIBUTE_TABLE
37205 SUBTARGET_ATTRIBUTE_TABLE,
37206 #endif
37207 /* ms_abi and sysv_abi calling convention function attributes. */
37208 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37209 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37210 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
37211 false },
37212 { "callee_pop_aggregate_return", 1, 1, false, true, true,
37213 ix86_handle_callee_pop_aggregate_return, true },
37214 /* End element. */
37215 { NULL, 0, 0, false, false, false, NULL, false }
37216 };
37217
37218 /* Implement targetm.vectorize.builtin_vectorization_cost. */
37219 static int
37220 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
37221 tree vectype,
37222 int misalign ATTRIBUTE_UNUSED)
37223 {
37224 unsigned elements;
37225
37226 switch (type_of_cost)
37227 {
37228 case scalar_stmt:
37229 return ix86_cost->scalar_stmt_cost;
37230
37231 case scalar_load:
37232 return ix86_cost->scalar_load_cost;
37233
37234 case scalar_store:
37235 return ix86_cost->scalar_store_cost;
37236
37237 case vector_stmt:
37238 return ix86_cost->vec_stmt_cost;
37239
37240 case vector_load:
37241 return ix86_cost->vec_align_load_cost;
37242
37243 case vector_store:
37244 return ix86_cost->vec_store_cost;
37245
37246 case vec_to_scalar:
37247 return ix86_cost->vec_to_scalar_cost;
37248
37249 case scalar_to_vec:
37250 return ix86_cost->scalar_to_vec_cost;
37251
37252 case unaligned_load:
37253 case unaligned_store:
37254 return ix86_cost->vec_unalign_load_cost;
37255
37256 case cond_branch_taken:
37257 return ix86_cost->cond_taken_branch_cost;
37258
37259 case cond_branch_not_taken:
37260 return ix86_cost->cond_not_taken_branch_cost;
37261
37262 case vec_perm:
37263 case vec_promote_demote:
37264 return ix86_cost->vec_stmt_cost;
37265
37266 case vec_construct:
37267 elements = TYPE_VECTOR_SUBPARTS (vectype);
37268 return elements / 2 + 1;
37269
37270 default:
37271 gcc_unreachable ();
37272 }
37273 }
37274
37275 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
37276 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
37277 insn every time. */
37278
37279 static GTY(()) rtx vselect_insn;
37280
37281 /* Initialize vselect_insn. */
37282
37283 static void
37284 init_vselect_insn (void)
37285 {
37286 unsigned i;
37287 rtx x;
37288
37289 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
37290 for (i = 0; i < MAX_VECT_LEN; ++i)
37291 XVECEXP (x, 0, i) = const0_rtx;
37292 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
37293 const0_rtx), x);
37294 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
37295 start_sequence ();
37296 vselect_insn = emit_insn (x);
37297 end_sequence ();
37298 }
37299
37300 /* Construct (set target (vec_select op0 (parallel perm))) and
37301 return true if that's a valid instruction in the active ISA. */
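/* As an illustrative sketch (not a literal dump): with nelt == 4,
   perm == { 1, 0, 3, 2 } and op0 in V4SFmode, the recycled insn ends up
   holding a pattern roughly of the shape
     (set (reg:V4SF target)
	  (vec_select:V4SF (reg:V4SF op0)
			   (parallel [1 0 3 2])))
   and recog_memoized then decides whether the active ISA has an
   instruction (e.g. shufps) that matches it.  */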
37302
37303 static bool
37304 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
37305 unsigned nelt, bool testing_p)
37306 {
37307 unsigned int i;
37308 rtx x, save_vconcat;
37309 int icode;
37310
37311 if (vselect_insn == NULL_RTX)
37312 init_vselect_insn ();
37313
37314 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
37315 PUT_NUM_ELEM (XVEC (x, 0), nelt);
37316 for (i = 0; i < nelt; ++i)
37317 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
37318 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37319 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
37320 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
37321 SET_DEST (PATTERN (vselect_insn)) = target;
37322 icode = recog_memoized (vselect_insn);
37323
37324 if (icode >= 0 && !testing_p)
37325 emit_insn (copy_rtx (PATTERN (vselect_insn)));
37326
37327 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
37328 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
37329 INSN_CODE (vselect_insn) = -1;
37330
37331 return icode >= 0;
37332 }
37333
37334 /* Similar, but generate a vec_concat from op0 and op1 as well. */
37335
37336 static bool
37337 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
37338 const unsigned char *perm, unsigned nelt,
37339 bool testing_p)
37340 {
37341 enum machine_mode v2mode;
37342 rtx x;
37343 bool ok;
37344
37345 if (vselect_insn == NULL_RTX)
37346 init_vselect_insn ();
37347
37348 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
37349 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37350 PUT_MODE (x, v2mode);
37351 XEXP (x, 0) = op0;
37352 XEXP (x, 1) = op1;
37353 ok = expand_vselect (target, x, perm, nelt, testing_p);
37354 XEXP (x, 0) = const0_rtx;
37355 XEXP (x, 1) = const0_rtx;
37356 return ok;
37357 }
37358
37359 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37360 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
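/* A concrete sketch of the immediate path below: for V4SFmode with
   d->perm == { 0, 5, 2, 7 }, every element either stays in place (0, 2)
   or is taken from the same position of op1 (5 == 1 + nelt,
   7 == 3 + nelt), so the loop builds mask == 0b1010 and a single
   blendps with that immediate implements the permutation.  */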
37361
37362 static bool
37363 expand_vec_perm_blend (struct expand_vec_perm_d *d)
37364 {
37365 enum machine_mode vmode = d->vmode;
37366 unsigned i, mask, nelt = d->nelt;
37367 rtx target, op0, op1, x;
37368 rtx rperm[32], vperm;
37369
37370 if (d->one_operand_p)
37371 return false;
37372 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
37373 ;
37374 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
37375 ;
37376 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
37377 ;
37378 else
37379 return false;
37380
37381 /* This is a blend, not a permute. Elements must stay in their
37382 respective lanes. */
37383 for (i = 0; i < nelt; ++i)
37384 {
37385 unsigned e = d->perm[i];
37386 if (!(e == i || e == i + nelt))
37387 return false;
37388 }
37389
37390 if (d->testing_p)
37391 return true;
37392
37393 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
37394 decision should be extracted elsewhere, so that we only try that
37395 sequence once all budget==3 options have been tried. */
37396 target = d->target;
37397 op0 = d->op0;
37398 op1 = d->op1;
37399 mask = 0;
37400
37401 switch (vmode)
37402 {
37403 case V4DFmode:
37404 case V8SFmode:
37405 case V2DFmode:
37406 case V4SFmode:
37407 case V8HImode:
37408 case V8SImode:
37409 for (i = 0; i < nelt; ++i)
37410 mask |= (d->perm[i] >= nelt) << i;
37411 break;
37412
37413 case V2DImode:
37414 for (i = 0; i < 2; ++i)
37415 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
37416 vmode = V8HImode;
37417 goto do_subreg;
37418
37419 case V4SImode:
37420 for (i = 0; i < 4; ++i)
37421 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37422 vmode = V8HImode;
37423 goto do_subreg;
37424
37425 case V16QImode:
37426 /* See if bytes move in pairs so we can use pblendw with
37427 an immediate argument, rather than pblendvb with a vector
37428 argument. */
37429 for (i = 0; i < 16; i += 2)
37430 if (d->perm[i] + 1 != d->perm[i + 1])
37431 {
37432 use_pblendvb:
37433 for (i = 0; i < nelt; ++i)
37434 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
37435
37436 finish_pblendvb:
37437 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
37438 vperm = force_reg (vmode, vperm);
37439
37440 if (GET_MODE_SIZE (vmode) == 16)
37441 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
37442 else
37443 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
37444 return true;
37445 }
37446
37447 for (i = 0; i < 8; ++i)
37448 mask |= (d->perm[i * 2] >= 16) << i;
37449 vmode = V8HImode;
37450 /* FALLTHRU */
37451
37452 do_subreg:
37453 target = gen_lowpart (vmode, target);
37454 op0 = gen_lowpart (vmode, op0);
37455 op1 = gen_lowpart (vmode, op1);
37456 break;
37457
37458 case V32QImode:
37459 /* See if bytes move in pairs. If not, vpblendvb must be used. */
37460 for (i = 0; i < 32; i += 2)
37461 if (d->perm[i] + 1 != d->perm[i + 1])
37462 goto use_pblendvb;
37463 /* See if bytes move in quadruplets. If yes, vpblendd
37464 with immediate can be used. */
37465 for (i = 0; i < 32; i += 4)
37466 if (d->perm[i] + 2 != d->perm[i + 2])
37467 break;
37468 if (i < 32)
37469 {
37470 /* See if bytes move the same in both lanes. If yes,
37471 vpblendw with immediate can be used. */
37472 for (i = 0; i < 16; i += 2)
37473 if (d->perm[i] + 16 != d->perm[i + 16])
37474 goto use_pblendvb;
37475
37476 /* Use vpblendw. */
37477 for (i = 0; i < 16; ++i)
37478 mask |= (d->perm[i * 2] >= 32) << i;
37479 vmode = V16HImode;
37480 goto do_subreg;
37481 }
37482
37483 /* Use vpblendd. */
37484 for (i = 0; i < 8; ++i)
37485 mask |= (d->perm[i * 4] >= 32) << i;
37486 vmode = V8SImode;
37487 goto do_subreg;
37488
37489 case V16HImode:
37490 /* See if words move in pairs. If yes, vpblendd can be used. */
37491 for (i = 0; i < 16; i += 2)
37492 if (d->perm[i] + 1 != d->perm[i + 1])
37493 break;
37494 if (i < 16)
37495 {
37496 /* See if words move the same in both lanes. If not,
37497 vpblendvb must be used. */
37498 for (i = 0; i < 8; i++)
37499 if (d->perm[i] + 8 != d->perm[i + 8])
37500 {
37501 /* Use vpblendvb. */
37502 for (i = 0; i < 32; ++i)
37503 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
37504
37505 vmode = V32QImode;
37506 nelt = 32;
37507 target = gen_lowpart (vmode, target);
37508 op0 = gen_lowpart (vmode, op0);
37509 op1 = gen_lowpart (vmode, op1);
37510 goto finish_pblendvb;
37511 }
37512
37513 /* Use vpblendw. */
37514 for (i = 0; i < 16; ++i)
37515 mask |= (d->perm[i] >= 16) << i;
37516 break;
37517 }
37518
37519 /* Use vpblendd. */
37520 for (i = 0; i < 8; ++i)
37521 mask |= (d->perm[i * 2] >= 16) << i;
37522 vmode = V8SImode;
37523 goto do_subreg;
37524
37525 case V4DImode:
37526 /* Use vpblendd. */
37527 for (i = 0; i < 4; ++i)
37528 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37529 vmode = V8SImode;
37530 goto do_subreg;
37531
37532 default:
37533 gcc_unreachable ();
37534 }
37535
37536 /* This matches five different patterns with the different modes. */
37537 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
37538 x = gen_rtx_SET (VOIDmode, target, x);
37539 emit_insn (x);
37540
37541 return true;
37542 }
37543
37544 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37545 in terms of the variable form of vpermilps.
37546
37547 Note that we will have already failed the immediate input vpermilps,
37548 which requires that the high and low part shuffle be identical; the
37549 variable form doesn't require that. */
37550
37551 static bool
37552 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
37553 {
37554 rtx rperm[8], vperm;
37555 unsigned i;
37556
37557 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
37558 return false;
37559
37560 /* We can only permute within the 128-bit lane. */
37561 for (i = 0; i < 8; ++i)
37562 {
37563 unsigned e = d->perm[i];
37564 if (i < 4 ? e >= 4 : e < 4)
37565 return false;
37566 }
37567
37568 if (d->testing_p)
37569 return true;
37570
37571 for (i = 0; i < 8; ++i)
37572 {
37573 unsigned e = d->perm[i];
37574
37575 /* Within each 128-bit lane, the elements of op0 are numbered
37576 from 0 and the elements of op1 are numbered from 4. */
37577 if (e >= 8 + 4)
37578 e -= 8;
37579 else if (e >= 4)
37580 e -= 4;
37581
37582 rperm[i] = GEN_INT (e);
37583 }
37584
37585 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
37586 vperm = force_reg (V8SImode, vperm);
37587 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
37588
37589 return true;
37590 }
37591
37592 /* Return true if permutation D can be performed as VMODE permutation
37593 instead. */
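/* For example, a V16QImode permutation whose selectors come in aligned,
   consecutive groups of four bytes, such as
     { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 },
   moves whole 32-bit chunks and is therefore also expressible as the
   V4SImode permutation { 1, 0, 3, 2 }; that is the property the chunk
   loop below verifies.  */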
37594
37595 static bool
37596 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
37597 {
37598 unsigned int i, j, chunk;
37599
37600 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
37601 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
37602 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
37603 return false;
37604
37605 if (GET_MODE_NUNITS (vmode) >= d->nelt)
37606 return true;
37607
37608 chunk = d->nelt / GET_MODE_NUNITS (vmode);
37609 for (i = 0; i < d->nelt; i += chunk)
37610 if (d->perm[i] & (chunk - 1))
37611 return false;
37612 else
37613 for (j = 1; j < chunk; ++j)
37614 if (d->perm[i] + j != d->perm[i + j])
37615 return false;
37616
37617 return true;
37618 }
37619
37620 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37621 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
37622
37623 static bool
37624 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
37625 {
37626 unsigned i, nelt, eltsz, mask;
37627 unsigned char perm[32];
37628 enum machine_mode vmode = V16QImode;
37629 rtx rperm[32], vperm, target, op0, op1;
37630
37631 nelt = d->nelt;
37632
37633 if (!d->one_operand_p)
37634 {
37635 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
37636 {
37637 if (TARGET_AVX2
37638 && valid_perm_using_mode_p (V2TImode, d))
37639 {
37640 if (d->testing_p)
37641 return true;
37642
37643 /* Use vperm2i128 insn. The pattern uses
37644 V4DImode instead of V2TImode. */
37645 target = gen_lowpart (V4DImode, d->target);
37646 op0 = gen_lowpart (V4DImode, d->op0);
37647 op1 = gen_lowpart (V4DImode, d->op1);
37648 rperm[0]
37649 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
37650 			   | ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
37651 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
37652 return true;
37653 }
37654 return false;
37655 }
37656 }
37657 else
37658 {
37659 if (GET_MODE_SIZE (d->vmode) == 16)
37660 {
37661 if (!TARGET_SSSE3)
37662 return false;
37663 }
37664 else if (GET_MODE_SIZE (d->vmode) == 32)
37665 {
37666 if (!TARGET_AVX2)
37667 return false;
37668
37669 /* V4DImode should be already handled through
37670 expand_vselect by vpermq instruction. */
37671 gcc_assert (d->vmode != V4DImode);
37672
37673 vmode = V32QImode;
37674 if (d->vmode == V8SImode
37675 || d->vmode == V16HImode
37676 || d->vmode == V32QImode)
37677 {
37678 /* First see if vpermq can be used for
37679 V8SImode/V16HImode/V32QImode. */
37680 if (valid_perm_using_mode_p (V4DImode, d))
37681 {
37682 for (i = 0; i < 4; i++)
37683 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
37684 if (d->testing_p)
37685 return true;
37686 return expand_vselect (gen_lowpart (V4DImode, d->target),
37687 gen_lowpart (V4DImode, d->op0),
37688 perm, 4, false);
37689 }
37690
37691 /* Next see if vpermd can be used. */
37692 if (valid_perm_using_mode_p (V8SImode, d))
37693 vmode = V8SImode;
37694 }
37695 /* Or if vpermps can be used. */
37696 else if (d->vmode == V8SFmode)
37697 vmode = V8SImode;
37698
37699 if (vmode == V32QImode)
37700 {
37701 	      /* vpshufb only works within 128-bit lanes; it is not
37702 		 possible to shuffle bytes across lanes.  */
37703 for (i = 0; i < nelt; ++i)
37704 if ((d->perm[i] ^ i) & (nelt / 2))
37705 return false;
37706 }
37707 }
37708 else
37709 return false;
37710 }
37711
37712 if (d->testing_p)
37713 return true;
37714
37715 if (vmode == V8SImode)
37716 for (i = 0; i < 8; ++i)
37717 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
37718 else
37719 {
37720 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37721 if (!d->one_operand_p)
37722 mask = 2 * nelt - 1;
37723 else if (vmode == V16QImode)
37724 mask = nelt - 1;
37725 else
37726 mask = nelt / 2 - 1;
37727
37728 for (i = 0; i < nelt; ++i)
37729 {
37730 unsigned j, e = d->perm[i] & mask;
37731 for (j = 0; j < eltsz; ++j)
37732 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
37733 }
37734 }
37735
37736 vperm = gen_rtx_CONST_VECTOR (vmode,
37737 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
37738 vperm = force_reg (vmode, vperm);
37739
37740 target = gen_lowpart (vmode, d->target);
37741 op0 = gen_lowpart (vmode, d->op0);
37742 if (d->one_operand_p)
37743 {
37744 if (vmode == V16QImode)
37745 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
37746 else if (vmode == V32QImode)
37747 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
37748 else if (vmode == V8SFmode)
37749 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
37750 else
37751 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
37752 }
37753 else
37754 {
37755 op1 = gen_lowpart (vmode, d->op1);
37756 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
37757 }
37758
37759 return true;
37760 }
37761
37762 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
37763 in a single instruction. */
37764
37765 static bool
37766 expand_vec_perm_1 (struct expand_vec_perm_d *d)
37767 {
37768 unsigned i, nelt = d->nelt;
37769 unsigned char perm2[MAX_VECT_LEN];
37770
37771 /* Check plain VEC_SELECT first, because AVX has instructions that could
37772 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
37773 input where SEL+CONCAT may not. */
37774 if (d->one_operand_p)
37775 {
37776 int mask = nelt - 1;
37777 bool identity_perm = true;
37778 bool broadcast_perm = true;
37779
37780 for (i = 0; i < nelt; i++)
37781 {
37782 perm2[i] = d->perm[i] & mask;
37783 if (perm2[i] != i)
37784 identity_perm = false;
37785 if (perm2[i])
37786 broadcast_perm = false;
37787 }
37788
37789 if (identity_perm)
37790 {
37791 if (!d->testing_p)
37792 emit_move_insn (d->target, d->op0);
37793 return true;
37794 }
37795 else if (broadcast_perm && TARGET_AVX2)
37796 {
37797 /* Use vpbroadcast{b,w,d}. */
37798 rtx (*gen) (rtx, rtx) = NULL;
37799 switch (d->vmode)
37800 {
37801 case V32QImode:
37802 gen = gen_avx2_pbroadcastv32qi_1;
37803 break;
37804 case V16HImode:
37805 gen = gen_avx2_pbroadcastv16hi_1;
37806 break;
37807 case V8SImode:
37808 gen = gen_avx2_pbroadcastv8si_1;
37809 break;
37810 case V16QImode:
37811 gen = gen_avx2_pbroadcastv16qi;
37812 break;
37813 case V8HImode:
37814 gen = gen_avx2_pbroadcastv8hi;
37815 break;
37816 case V8SFmode:
37817 gen = gen_avx2_vec_dupv8sf_1;
37818 break;
37819 	    /* For other modes, prefer the other shuffles this function creates.  */
37820 default: break;
37821 }
37822 if (gen != NULL)
37823 {
37824 if (!d->testing_p)
37825 emit_insn (gen (d->target, d->op0));
37826 return true;
37827 }
37828 }
37829
37830 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
37831 return true;
37832
37833 /* There are plenty of patterns in sse.md that are written for
37834 SEL+CONCAT and are not replicated for a single op. Perhaps
37835 that should be changed, to avoid the nastiness here. */
37836
37837 /* Recognize interleave style patterns, which means incrementing
37838 every other permutation operand. */
37839 for (i = 0; i < nelt; i += 2)
37840 {
37841 perm2[i] = d->perm[i] & mask;
37842 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
37843 }
37844 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37845 d->testing_p))
37846 return true;
37847
37848 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
37849 if (nelt >= 4)
37850 {
37851 for (i = 0; i < nelt; i += 4)
37852 {
37853 perm2[i + 0] = d->perm[i + 0] & mask;
37854 perm2[i + 1] = d->perm[i + 1] & mask;
37855 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
37856 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
37857 }
37858
37859 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37860 d->testing_p))
37861 return true;
37862 }
37863 }
37864
37865 /* Finally, try the fully general two operand permute. */
37866 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
37867 d->testing_p))
37868 return true;
37869
37870 /* Recognize interleave style patterns with reversed operands. */
37871 if (!d->one_operand_p)
37872 {
37873 for (i = 0; i < nelt; ++i)
37874 {
37875 unsigned e = d->perm[i];
37876 if (e >= nelt)
37877 e -= nelt;
37878 else
37879 e += nelt;
37880 perm2[i] = e;
37881 }
37882
37883 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
37884 d->testing_p))
37885 return true;
37886 }
37887
37888 /* Try the SSE4.1 blend variable merge instructions. */
37889 if (expand_vec_perm_blend (d))
37890 return true;
37891
37892 /* Try one of the AVX vpermil variable permutations. */
37893 if (expand_vec_perm_vpermil (d))
37894 return true;
37895
37896 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
37897 vpshufb, vpermd, vpermps or vpermq variable permutation. */
37898 if (expand_vec_perm_pshufb (d))
37899 return true;
37900
37901 return false;
37902 }
37903
37904 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37905 in terms of a pair of pshuflw + pshufhw instructions. */
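/* A sketch of the split performed below: for a V8HImode permutation such
   as { 2, 0, 3, 1, 5, 7, 4, 6 }, where the low four selectors stay below
   4 and the high four stay at or above 4, the emitted pair is
     pshuflw with { 2, 0, 3, 1, 4, 5, 6, 7 }   (shuffle the low quadword)
     pshufhw with { 0, 1, 2, 3, 5, 7, 4, 6 }   (shuffle the high quadword)
   each going through expand_vselect as an ordinary vec_select.  */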
37906
37907 static bool
37908 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
37909 {
37910 unsigned char perm2[MAX_VECT_LEN];
37911 unsigned i;
37912 bool ok;
37913
37914 if (d->vmode != V8HImode || !d->one_operand_p)
37915 return false;
37916
37917 /* The two permutations only operate in 64-bit lanes. */
37918 for (i = 0; i < 4; ++i)
37919 if (d->perm[i] >= 4)
37920 return false;
37921 for (i = 4; i < 8; ++i)
37922 if (d->perm[i] < 4)
37923 return false;
37924
37925 if (d->testing_p)
37926 return true;
37927
37928 /* Emit the pshuflw. */
37929 memcpy (perm2, d->perm, 4);
37930 for (i = 4; i < 8; ++i)
37931 perm2[i] = i;
37932 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
37933 gcc_assert (ok);
37934
37935 /* Emit the pshufhw. */
37936 memcpy (perm2 + 4, d->perm + 4, 4);
37937 for (i = 0; i < 4; ++i)
37938 perm2[i] = i;
37939 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
37940 gcc_assert (ok);
37941
37942 return true;
37943 }
37944
37945 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37946 the permutation using the SSSE3 palignr instruction. This succeeds
37947 when all of the elements in PERM fit within one vector and we merely
37948 need to shift them down so that a single vector permutation has a
37949 chance to succeed. */
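/* For instance, with V4SImode operands and d->perm == { 3, 4, 5, 6 },
   min == 3, so the palignr over (op1:op0) shifted by 3*32 bits yields a
   vector holding elements 3,4,5,6 in order; the rebased permutation
   becomes { 0, 1, 2, 3 }, the in_order test below detects it, and no
   further shuffle is needed.  Had the rebased permutation not been the
   identity, the single-operand pshufb promised above would finish the
   job.  */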
37950
37951 static bool
37952 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
37953 {
37954 unsigned i, nelt = d->nelt;
37955 unsigned min, max;
37956 bool in_order, ok;
37957 rtx shift;
37958
37959 /* Even with AVX, palignr only operates on 128-bit vectors. */
37960 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37961 return false;
37962
37963 min = nelt, max = 0;
37964 for (i = 0; i < nelt; ++i)
37965 {
37966 unsigned e = d->perm[i];
37967 if (e < min)
37968 min = e;
37969 if (e > max)
37970 max = e;
37971 }
37972 if (min == 0 || max - min >= nelt)
37973 return false;
37974
37975 /* Given that we have SSSE3, we know we'll be able to implement the
37976 single operand permutation after the palignr with pshufb. */
37977 if (d->testing_p)
37978 return true;
37979
37980 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
37981 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
37982 gen_lowpart (TImode, d->op1),
37983 gen_lowpart (TImode, d->op0), shift));
37984
37985 d->op0 = d->op1 = d->target;
37986 d->one_operand_p = true;
37987
37988 in_order = true;
37989 for (i = 0; i < nelt; ++i)
37990 {
37991 unsigned e = d->perm[i] - min;
37992 if (e != i)
37993 in_order = false;
37994 d->perm[i] = e;
37995 }
37996
37997 /* Test for the degenerate case where the alignment by itself
37998 produces the desired permutation. */
37999 if (in_order)
38000 return true;
38001
38002 ok = expand_vec_perm_1 (d);
38003 gcc_assert (ok);
38004
38005 return ok;
38006 }
38007
38008 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38009
38010 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38011 a two vector permutation into a single vector permutation by using
38012 an interleave operation to merge the vectors. */
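/* A small V4SImode illustration of the two-step idea: for
   d->perm == { 1, 4, 0, 5 } every selected element lives in the low half
   of its input, so dremap becomes the interleave-low { 0, 4, 1, 5 }
   (punpckldq), remap[] records that elements 0, 1, 4, 5 land in slots
   0, 2, 1, 3 of the interleaved vector, and dfinal collapses to the
   single-operand shuffle { 2, 1, 0, 3 }, which pshufd handles.  This is
   only a sketch; the code below also covers the punpckh and shufps
   shaped cases and the 32-byte variants.  */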
38013
38014 static bool
38015 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38016 {
38017 struct expand_vec_perm_d dremap, dfinal;
38018 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38019 unsigned HOST_WIDE_INT contents;
38020 unsigned char remap[2 * MAX_VECT_LEN];
38021 rtx seq;
38022 bool ok, same_halves = false;
38023
38024 if (GET_MODE_SIZE (d->vmode) == 16)
38025 {
38026 if (d->one_operand_p)
38027 return false;
38028 }
38029 else if (GET_MODE_SIZE (d->vmode) == 32)
38030 {
38031 if (!TARGET_AVX)
38032 return false;
38033 /* For 32-byte modes allow even d->one_operand_p.
38034 The lack of cross-lane shuffling in some instructions
38035 might prevent a single insn shuffle. */
38036 dfinal = *d;
38037 dfinal.testing_p = true;
38038       /* If expand_vec_perm_interleave3 can expand this into
38039 	 a 3 insn sequence, give up and let it be expanded as
38040 	 a 3 insn sequence.  While that is one insn longer,
38041 	 it doesn't need a memory operand, and in the common
38042 	 case where the interleave low and interleave high
38043 	 permutations with the same operands are adjacent, the
38044 	 pair needs only 4 insns after CSE.  */
38045 if (expand_vec_perm_interleave3 (&dfinal))
38046 return false;
38047 }
38048 else
38049 return false;
38050
38051 /* Examine from whence the elements come. */
38052 contents = 0;
38053 for (i = 0; i < nelt; ++i)
38054 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
38055
38056 memset (remap, 0xff, sizeof (remap));
38057 dremap = *d;
38058
38059 if (GET_MODE_SIZE (d->vmode) == 16)
38060 {
38061 unsigned HOST_WIDE_INT h1, h2, h3, h4;
38062
38063 /* Split the two input vectors into 4 halves. */
38064 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
38065 h2 = h1 << nelt2;
38066 h3 = h2 << nelt2;
38067 h4 = h3 << nelt2;
38068
38069       /* If the elements all come from the low halves, use interleave low;
38070 	 similarly for interleave high.  If the elements are from mis-matched
38071 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
38072 if ((contents & (h1 | h3)) == contents)
38073 {
38074 /* punpckl* */
38075 for (i = 0; i < nelt2; ++i)
38076 {
38077 remap[i] = i * 2;
38078 remap[i + nelt] = i * 2 + 1;
38079 dremap.perm[i * 2] = i;
38080 dremap.perm[i * 2 + 1] = i + nelt;
38081 }
38082 if (!TARGET_SSE2 && d->vmode == V4SImode)
38083 dremap.vmode = V4SFmode;
38084 }
38085 else if ((contents & (h2 | h4)) == contents)
38086 {
38087 /* punpckh* */
38088 for (i = 0; i < nelt2; ++i)
38089 {
38090 remap[i + nelt2] = i * 2;
38091 remap[i + nelt + nelt2] = i * 2 + 1;
38092 dremap.perm[i * 2] = i + nelt2;
38093 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
38094 }
38095 if (!TARGET_SSE2 && d->vmode == V4SImode)
38096 dremap.vmode = V4SFmode;
38097 }
38098 else if ((contents & (h1 | h4)) == contents)
38099 {
38100 /* shufps */
38101 for (i = 0; i < nelt2; ++i)
38102 {
38103 remap[i] = i;
38104 remap[i + nelt + nelt2] = i + nelt2;
38105 dremap.perm[i] = i;
38106 dremap.perm[i + nelt2] = i + nelt + nelt2;
38107 }
38108 if (nelt != 4)
38109 {
38110 /* shufpd */
38111 dremap.vmode = V2DImode;
38112 dremap.nelt = 2;
38113 dremap.perm[0] = 0;
38114 dremap.perm[1] = 3;
38115 }
38116 }
38117 else if ((contents & (h2 | h3)) == contents)
38118 {
38119 /* shufps */
38120 for (i = 0; i < nelt2; ++i)
38121 {
38122 remap[i + nelt2] = i;
38123 remap[i + nelt] = i + nelt2;
38124 dremap.perm[i] = i + nelt2;
38125 dremap.perm[i + nelt2] = i + nelt;
38126 }
38127 if (nelt != 4)
38128 {
38129 /* shufpd */
38130 dremap.vmode = V2DImode;
38131 dremap.nelt = 2;
38132 dremap.perm[0] = 1;
38133 dremap.perm[1] = 2;
38134 }
38135 }
38136 else
38137 return false;
38138 }
38139 else
38140 {
38141 unsigned int nelt4 = nelt / 4, nzcnt = 0;
38142 unsigned HOST_WIDE_INT q[8];
38143 unsigned int nonzero_halves[4];
38144
38145 /* Split the two input vectors into 8 quarters. */
38146 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
38147 for (i = 1; i < 8; ++i)
38148 q[i] = q[0] << (nelt4 * i);
38149 for (i = 0; i < 4; ++i)
38150 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
38151 {
38152 nonzero_halves[nzcnt] = i;
38153 ++nzcnt;
38154 }
38155
38156 if (nzcnt == 1)
38157 {
38158 gcc_assert (d->one_operand_p);
38159 nonzero_halves[1] = nonzero_halves[0];
38160 same_halves = true;
38161 }
38162 else if (d->one_operand_p)
38163 {
38164 gcc_assert (nonzero_halves[0] == 0);
38165 gcc_assert (nonzero_halves[1] == 1);
38166 }
38167
38168 if (nzcnt <= 2)
38169 {
38170 if (d->perm[0] / nelt2 == nonzero_halves[1])
38171 {
38172 /* Attempt to increase the likelihood that dfinal
38173 shuffle will be intra-lane. */
38174 char tmph = nonzero_halves[0];
38175 nonzero_halves[0] = nonzero_halves[1];
38176 nonzero_halves[1] = tmph;
38177 }
38178
38179 /* vperm2f128 or vperm2i128. */
38180 for (i = 0; i < nelt2; ++i)
38181 {
38182 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
38183 remap[i + nonzero_halves[0] * nelt2] = i;
38184 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
38185 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
38186 }
38187
38188 if (d->vmode != V8SFmode
38189 && d->vmode != V4DFmode
38190 && d->vmode != V8SImode)
38191 {
38192 dremap.vmode = V8SImode;
38193 dremap.nelt = 8;
38194 for (i = 0; i < 4; ++i)
38195 {
38196 dremap.perm[i] = i + nonzero_halves[0] * 4;
38197 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
38198 }
38199 }
38200 }
38201 else if (d->one_operand_p)
38202 return false;
38203 else if (TARGET_AVX2
38204 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
38205 {
38206 /* vpunpckl* */
38207 for (i = 0; i < nelt4; ++i)
38208 {
38209 remap[i] = i * 2;
38210 remap[i + nelt] = i * 2 + 1;
38211 remap[i + nelt2] = i * 2 + nelt2;
38212 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
38213 dremap.perm[i * 2] = i;
38214 dremap.perm[i * 2 + 1] = i + nelt;
38215 dremap.perm[i * 2 + nelt2] = i + nelt2;
38216 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
38217 }
38218 }
38219 else if (TARGET_AVX2
38220 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
38221 {
38222 /* vpunpckh* */
38223 for (i = 0; i < nelt4; ++i)
38224 {
38225 remap[i + nelt4] = i * 2;
38226 remap[i + nelt + nelt4] = i * 2 + 1;
38227 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
38228 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
38229 dremap.perm[i * 2] = i + nelt4;
38230 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
38231 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
38232 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
38233 }
38234 }
38235 else
38236 return false;
38237 }
38238
38239 /* Use the remapping array set up above to move the elements from their
38240 swizzled locations into their final destinations. */
38241 dfinal = *d;
38242 for (i = 0; i < nelt; ++i)
38243 {
38244 unsigned e = remap[d->perm[i]];
38245 gcc_assert (e < nelt);
38246 /* If same_halves is true, both halves of the remapped vector are the
38247 same. Avoid cross-lane accesses if possible. */
38248 if (same_halves && i >= nelt2)
38249 {
38250 gcc_assert (e < nelt2);
38251 dfinal.perm[i] = e + nelt2;
38252 }
38253 else
38254 dfinal.perm[i] = e;
38255 }
38256 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
38257 dfinal.op1 = dfinal.op0;
38258 dfinal.one_operand_p = true;
38259 dremap.target = dfinal.op0;
38260
38261 /* Test if the final remap can be done with a single insn. For V4SFmode or
38262 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
38263 start_sequence ();
38264 ok = expand_vec_perm_1 (&dfinal);
38265 seq = get_insns ();
38266 end_sequence ();
38267
38268 if (!ok)
38269 return false;
38270
38271 if (d->testing_p)
38272 return true;
38273
38274 if (dremap.vmode != dfinal.vmode)
38275 {
38276 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
38277 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
38278 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
38279 }
38280
38281 ok = expand_vec_perm_1 (&dremap);
38282 gcc_assert (ok);
38283
38284 emit_insn (seq);
38285 return true;
38286 }
38287
38288 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38289 a single vector cross-lane permutation into vpermq followed
38290 by any of the single insn permutations. */
38291
38292 static bool
38293 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
38294 {
38295 struct expand_vec_perm_d dremap, dfinal;
38296 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
38297 unsigned contents[2];
38298 bool ok;
38299
38300 if (!(TARGET_AVX2
38301 && (d->vmode == V32QImode || d->vmode == V16HImode)
38302 && d->one_operand_p))
38303 return false;
38304
38305 contents[0] = 0;
38306 contents[1] = 0;
38307 for (i = 0; i < nelt2; ++i)
38308 {
38309 contents[0] |= 1u << (d->perm[i] / nelt4);
38310 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
38311 }
38312
38313 for (i = 0; i < 2; ++i)
38314 {
38315 unsigned int cnt = 0;
38316 for (j = 0; j < 4; ++j)
38317 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
38318 return false;
38319 }
38320
38321 if (d->testing_p)
38322 return true;
38323
38324 dremap = *d;
38325 dremap.vmode = V4DImode;
38326 dremap.nelt = 4;
38327 dremap.target = gen_reg_rtx (V4DImode);
38328 dremap.op0 = gen_lowpart (V4DImode, d->op0);
38329 dremap.op1 = dremap.op0;
38330 dremap.one_operand_p = true;
38331 for (i = 0; i < 2; ++i)
38332 {
38333 unsigned int cnt = 0;
38334 for (j = 0; j < 4; ++j)
38335 if ((contents[i] & (1u << j)) != 0)
38336 dremap.perm[2 * i + cnt++] = j;
38337 for (; cnt < 2; ++cnt)
38338 dremap.perm[2 * i + cnt] = 0;
38339 }
38340
38341 dfinal = *d;
38342 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
38343 dfinal.op1 = dfinal.op0;
38344 dfinal.one_operand_p = true;
38345 for (i = 0, j = 0; i < nelt; ++i)
38346 {
38347 if (i == nelt2)
38348 j = 2;
38349 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
38350 if ((d->perm[i] / nelt4) == dremap.perm[j])
38351 ;
38352 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
38353 dfinal.perm[i] |= nelt4;
38354 else
38355 gcc_unreachable ();
38356 }
38357
38358 ok = expand_vec_perm_1 (&dremap);
38359 gcc_assert (ok);
38360
38361 ok = expand_vec_perm_1 (&dfinal);
38362 gcc_assert (ok);
38363
38364 return true;
38365 }
38366
38367 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
38368 a vector permutation using two instructions, vperm2f128 resp.
38369 vperm2i128 followed by any single in-lane permutation. */
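/* To make the encoding concrete: perm runs over 0..31; its low two bits
   pick which lane of the concatenated (op0:op1) pair feeds the low
   result lane, the next two bits do the same for the high result lane,
   and bit 4 selects whether the follow-up in-lane shuffle reads d->op0
   or d->op1.  For instance perm == 9 (binary 01001) asks for lane 1 of
   op0 in the low half and lane 0 of op1 in the high half, giving the
   vperm2f128 immediate ((9 << 2) | 9) & 0x33 == 0x21.  This is only an
   illustration of the encoding; the loop below additionally filters out
   values that move nothing across lanes.  */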
38370
38371 static bool
38372 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
38373 {
38374 struct expand_vec_perm_d dfirst, dsecond;
38375 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
38376 bool ok;
38377
38378 if (!TARGET_AVX
38379 || GET_MODE_SIZE (d->vmode) != 32
38380 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
38381 return false;
38382
38383 dsecond = *d;
38384 dsecond.one_operand_p = false;
38385 dsecond.testing_p = true;
38386
38387 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
38388 immediate. For perm < 16 the second permutation uses
38389 d->op0 as first operand, for perm >= 16 it uses d->op1
38390 as first operand. The second operand is the result of
38391 vperm2[fi]128. */
38392 for (perm = 0; perm < 32; perm++)
38393 {
38394 /* Ignore permutations which do not move anything cross-lane. */
38395 if (perm < 16)
38396 {
38397 /* The second shuffle for e.g. V4DFmode has
38398 0123 and ABCD operands.
38399 Ignore AB23, as 23 is already in the second lane
38400 of the first operand. */
38401 if ((perm & 0xc) == (1 << 2)) continue;
38402 /* And 01CD, as 01 is in the first lane of the first
38403 operand. */
38404 if ((perm & 3) == 0) continue;
38405 /* And 4567, as then the vperm2[fi]128 doesn't change
38406 anything on the original 4567 second operand. */
38407 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
38408 }
38409 else
38410 {
38411 /* The second shuffle for e.g. V4DFmode has
38412 4567 and ABCD operands.
38413 Ignore AB67, as 67 is already in the second lane
38414 of the first operand. */
38415 if ((perm & 0xc) == (3 << 2)) continue;
38416 /* And 45CD, as 45 is in the first lane of the first
38417 operand. */
38418 if ((perm & 3) == 2) continue;
38419 /* And 0123, as then the vperm2[fi]128 doesn't change
38420 anything on the original 0123 first operand. */
38421 if ((perm & 0xf) == (1 << 2)) continue;
38422 }
38423
38424 for (i = 0; i < nelt; i++)
38425 {
38426 j = d->perm[i] / nelt2;
38427 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
38428 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
38429 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
38430 dsecond.perm[i] = d->perm[i] & (nelt - 1);
38431 else
38432 break;
38433 }
38434
38435 if (i == nelt)
38436 {
38437 start_sequence ();
38438 ok = expand_vec_perm_1 (&dsecond);
38439 end_sequence ();
38440 }
38441 else
38442 ok = false;
38443
38444 if (ok)
38445 {
38446 if (d->testing_p)
38447 return true;
38448
38449 /* Found a usable second shuffle. dfirst will be
38450 vperm2f128 on d->op0 and d->op1. */
38451 dsecond.testing_p = false;
38452 dfirst = *d;
38453 dfirst.target = gen_reg_rtx (d->vmode);
38454 for (i = 0; i < nelt; i++)
38455 dfirst.perm[i] = (i & (nelt2 - 1))
38456 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
38457
38458 ok = expand_vec_perm_1 (&dfirst);
38459 gcc_assert (ok);
38460
38461 /* And dsecond is some single insn shuffle, taking
38462 d->op0 and result of vperm2f128 (if perm < 16) or
38463 d->op1 and result of vperm2f128 (otherwise). */
38464 dsecond.op1 = dfirst.target;
38465 if (perm >= 16)
38466 dsecond.op0 = dfirst.op1;
38467
38468 ok = expand_vec_perm_1 (&dsecond);
38469 gcc_assert (ok);
38470
38471 return true;
38472 }
38473
38474 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
38475 if (d->one_operand_p)
38476 return false;
38477 }
38478
38479 return false;
38480 }
38481
38482 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38483 a two vector permutation using 2 intra-lane interleave insns
38484 and cross-lane shuffle for 32-byte vectors. */
38485
38486 static bool
38487 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
38488 {
38489 unsigned i, nelt;
38490 rtx (*gen) (rtx, rtx, rtx);
38491
38492 if (d->one_operand_p)
38493 return false;
38494 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
38495 ;
38496 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
38497 ;
38498 else
38499 return false;
38500
38501 nelt = d->nelt;
38502 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
38503 return false;
38504 for (i = 0; i < nelt; i += 2)
38505 if (d->perm[i] != d->perm[0] + i / 2
38506 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
38507 return false;
38508
38509 if (d->testing_p)
38510 return true;
38511
38512 switch (d->vmode)
38513 {
38514 case V32QImode:
38515 if (d->perm[0])
38516 gen = gen_vec_interleave_highv32qi;
38517 else
38518 gen = gen_vec_interleave_lowv32qi;
38519 break;
38520 case V16HImode:
38521 if (d->perm[0])
38522 gen = gen_vec_interleave_highv16hi;
38523 else
38524 gen = gen_vec_interleave_lowv16hi;
38525 break;
38526 case V8SImode:
38527 if (d->perm[0])
38528 gen = gen_vec_interleave_highv8si;
38529 else
38530 gen = gen_vec_interleave_lowv8si;
38531 break;
38532 case V4DImode:
38533 if (d->perm[0])
38534 gen = gen_vec_interleave_highv4di;
38535 else
38536 gen = gen_vec_interleave_lowv4di;
38537 break;
38538 case V8SFmode:
38539 if (d->perm[0])
38540 gen = gen_vec_interleave_highv8sf;
38541 else
38542 gen = gen_vec_interleave_lowv8sf;
38543 break;
38544 case V4DFmode:
38545 if (d->perm[0])
38546 gen = gen_vec_interleave_highv4df;
38547 else
38548 gen = gen_vec_interleave_lowv4df;
38549 break;
38550 default:
38551 gcc_unreachable ();
38552 }
38553
38554 emit_insn (gen (d->target, d->op0, d->op1));
38555 return true;
38556 }
38557
38558 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
38559 a single vector permutation using a single intra-lane vector
38560 permutation, vperm2f128 swapping the lanes and vblend* insn blending
38561 the non-swapped and swapped vectors together. */
38562
38563 static bool
38564 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
38565 {
38566 struct expand_vec_perm_d dfirst, dsecond;
38567 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
38568 rtx seq;
38569 bool ok;
38570 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
38571
38572 if (!TARGET_AVX
38573 || TARGET_AVX2
38574 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
38575 || !d->one_operand_p)
38576 return false;
38577
38578 dfirst = *d;
38579 for (i = 0; i < nelt; i++)
38580 dfirst.perm[i] = 0xff;
38581 for (i = 0, msk = 0; i < nelt; i++)
38582 {
38583 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
38584 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
38585 return false;
38586 dfirst.perm[j] = d->perm[i];
38587 if (j != i)
38588 msk |= (1 << i);
38589 }
38590 for (i = 0; i < nelt; i++)
38591 if (dfirst.perm[i] == 0xff)
38592 dfirst.perm[i] = i;
38593
38594 if (!d->testing_p)
38595 dfirst.target = gen_reg_rtx (dfirst.vmode);
38596
38597 start_sequence ();
38598 ok = expand_vec_perm_1 (&dfirst);
38599 seq = get_insns ();
38600 end_sequence ();
38601
38602 if (!ok)
38603 return false;
38604
38605 if (d->testing_p)
38606 return true;
38607
38608 emit_insn (seq);
38609
38610 dsecond = *d;
38611 dsecond.op0 = dfirst.target;
38612 dsecond.op1 = dfirst.target;
38613 dsecond.one_operand_p = true;
38614 dsecond.target = gen_reg_rtx (dsecond.vmode);
38615 for (i = 0; i < nelt; i++)
38616 dsecond.perm[i] = i ^ nelt2;
38617
38618 ok = expand_vec_perm_1 (&dsecond);
38619 gcc_assert (ok);
38620
38621 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
38622 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
38623 return true;
38624 }
38625
38626 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
38627 permutation using two vperm2f128, followed by a vshufpd insn blending
38628 the two vectors together. */
38629
38630 static bool
38631 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
38632 {
38633 struct expand_vec_perm_d dfirst, dsecond, dthird;
38634 bool ok;
38635
38636 if (!TARGET_AVX || (d->vmode != V4DFmode))
38637 return false;
38638
38639 if (d->testing_p)
38640 return true;
38641
38642 dfirst = *d;
38643 dsecond = *d;
38644 dthird = *d;
38645
38646 dfirst.perm[0] = (d->perm[0] & ~1);
38647 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
38648 dfirst.perm[2] = (d->perm[2] & ~1);
38649 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
38650 dsecond.perm[0] = (d->perm[1] & ~1);
38651 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
38652 dsecond.perm[2] = (d->perm[3] & ~1);
38653 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
38654 dthird.perm[0] = (d->perm[0] % 2);
38655 dthird.perm[1] = (d->perm[1] % 2) + 4;
38656 dthird.perm[2] = (d->perm[2] % 2) + 2;
38657 dthird.perm[3] = (d->perm[3] % 2) + 6;
38658
38659 dfirst.target = gen_reg_rtx (dfirst.vmode);
38660 dsecond.target = gen_reg_rtx (dsecond.vmode);
38661 dthird.op0 = dfirst.target;
38662 dthird.op1 = dsecond.target;
38663 dthird.one_operand_p = false;
38664
38665 canonicalize_perm (&dfirst);
38666 canonicalize_perm (&dsecond);
38667
38668 ok = expand_vec_perm_1 (&dfirst)
38669 && expand_vec_perm_1 (&dsecond)
38670 && expand_vec_perm_1 (&dthird);
38671
38672 gcc_assert (ok);
38673
38674 return true;
38675 }
38676
38677 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
38678 permutation with two pshufb insns and an ior. We should have already
38679 failed all two instruction sequences. */
38680
38681 static bool
38682 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
38683 {
38684 rtx rperm[2][16], vperm, l, h, op, m128;
38685 unsigned int i, nelt, eltsz;
38686
38687 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38688 return false;
38689 gcc_assert (!d->one_operand_p);
38690
38691 nelt = d->nelt;
38692 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38693
38694 /* Generate two permutation masks. If the required element is within
38695 the given vector, it is shuffled into the proper lane.  If the required
38696 element is in the other vector, force a zero into the lane by setting
38697 bit 7 in the permutation mask. */
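  /* As an illustration (example assumed by the editor): an extract-even
     permutation of two V16QImode operands (perm = { 0, 2, ..., 30 })
     produces
       mask for op0:  0, 2, ..., 14, -128, ..., -128
       mask for op1:  -128, ..., -128, 0, 2, ..., 14
     so the final ior of the two pshufb results holds the even bytes of
     op0 followed by the even bytes of op1.  */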
38698 m128 = GEN_INT (-128);
38699 for (i = 0; i < nelt; ++i)
38700 {
38701 unsigned j, e = d->perm[i];
38702 unsigned which = (e >= nelt);
38703 if (e >= nelt)
38704 e -= nelt;
38705
38706 for (j = 0; j < eltsz; ++j)
38707 {
38708 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
38709 rperm[1-which][i*eltsz + j] = m128;
38710 }
38711 }
38712
38713 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
38714 vperm = force_reg (V16QImode, vperm);
38715
38716 l = gen_reg_rtx (V16QImode);
38717 op = gen_lowpart (V16QImode, d->op0);
38718 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
38719
38720 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
38721 vperm = force_reg (V16QImode, vperm);
38722
38723 h = gen_reg_rtx (V16QImode);
38724 op = gen_lowpart (V16QImode, d->op1);
38725 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
38726
38727 op = gen_lowpart (V16QImode, d->target);
38728 emit_insn (gen_iorv16qi3 (op, l, h));
38729
38730 return true;
38731 }
38732
38733 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
38734 with two vpshufb insns, vpermq and vpor. We should have already failed
38735 all two or three instruction sequences. */
38736
38737 static bool
38738 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
38739 {
38740 rtx rperm[2][32], vperm, l, h, hp, op, m128;
38741 unsigned int i, nelt, eltsz;
38742
38743 if (!TARGET_AVX2
38744 || !d->one_operand_p
38745 || (d->vmode != V32QImode && d->vmode != V16HImode))
38746 return false;
38747
38748 if (d->testing_p)
38749 return true;
38750
38751 nelt = d->nelt;
38752 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38753
38754 /* Generate two permutation masks. If the required element is within
38755 the same lane, it is shuffled in.  If the required element is from
38756 the other lane, force a zero by setting bit 7 in the permutation mask.
38757 The other mask has non-negative elements where an element is requested
38758 from the other lane; such elements are also moved to the other lane,
38759 so that the result of vpshufb can have its two V2TImode halves
38760 swapped.  */
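  /* Example for illustration only: in a one-operand V32QImode permutation
     with d->perm[0] = 20, byte 20 lives in the high lane, so the loop
     below stores its in-lane index 4 at position 16 of the second mask
     (rperm[1]); the vpshufb using that mask places byte 20 in the high
     lane, and the vpermq lane swap then moves it down to output
     position 0.  */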
38761 m128 = GEN_INT (-128);
38762 for (i = 0; i < nelt; ++i)
38763 {
38764 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38765 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38766
38767 for (j = 0; j < eltsz; ++j)
38768 {
38769 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
38770 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
38771 }
38772 }
38773
38774 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38775 vperm = force_reg (V32QImode, vperm);
38776
38777 h = gen_reg_rtx (V32QImode);
38778 op = gen_lowpart (V32QImode, d->op0);
38779 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38780
38781 /* Swap the 128-bit lanes of h into hp. */
38782 hp = gen_reg_rtx (V4DImode);
38783 op = gen_lowpart (V4DImode, h);
38784 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
38785 const1_rtx));
38786
38787 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38788 vperm = force_reg (V32QImode, vperm);
38789
38790 l = gen_reg_rtx (V32QImode);
38791 op = gen_lowpart (V32QImode, d->op0);
38792 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38793
38794 op = gen_lowpart (V32QImode, d->target);
38795 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
38796
38797 return true;
38798 }
38799
38800 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
38801 and extract-odd permutations of two V32QImode or V16HImode operands
38802 with two vpshufb insns, vpor and vpermq. We should have already
38803 failed all two or three instruction sequences. */
38804
38805 static bool
38806 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
38807 {
38808 rtx rperm[2][32], vperm, l, h, ior, op, m128;
38809 unsigned int i, nelt, eltsz;
38810
38811 if (!TARGET_AVX2
38812 || d->one_operand_p
38813 || (d->vmode != V32QImode && d->vmode != V16HImode))
38814 return false;
38815
38816 for (i = 0; i < d->nelt; ++i)
38817 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
38818 return false;
38819
38820 if (d->testing_p)
38821 return true;
38822
38823 nelt = d->nelt;
38824 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38825
38826 /* Generate two permutation masks. In the first permutation mask
38827 the first quarter will contain indexes for the first half
38828 of the op0, the second quarter will contain bit 7 set, third quarter
38829 will contain indexes for the second half of the op0 and the
38830 last quarter bit 7 set. In the second permutation mask
38831 the first quarter will contain bit 7 set, the second quarter
38832 indexes for the first half of the op1, the third quarter bit 7 set
38833 and last quarter indexes for the second half of the op1.
38834 I.e. the first mask e.g. for V32QImode extract even will be:
38835 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
38836 (all values masked with 0xf except for -128) and second mask
38837 for extract even will be
38838 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
38839 m128 = GEN_INT (-128);
38840 for (i = 0; i < nelt; ++i)
38841 {
38842 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38843 unsigned which = d->perm[i] >= nelt;
38844 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
38845
38846 for (j = 0; j < eltsz; ++j)
38847 {
38848 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
38849 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
38850 }
38851 }
38852
38853 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38854 vperm = force_reg (V32QImode, vperm);
38855
38856 l = gen_reg_rtx (V32QImode);
38857 op = gen_lowpart (V32QImode, d->op0);
38858 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38859
38860 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38861 vperm = force_reg (V32QImode, vperm);
38862
38863 h = gen_reg_rtx (V32QImode);
38864 op = gen_lowpart (V32QImode, d->op1);
38865 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38866
38867 ior = gen_reg_rtx (V32QImode);
38868 emit_insn (gen_iorv32qi3 (ior, l, h));
38869
38870 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
38871 op = gen_lowpart (V4DImode, d->target);
38872 ior = gen_lowpart (V4DImode, ior);
38873 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
38874 const1_rtx, GEN_INT (3)));
38875
38876 return true;
38877 }
38878
38879 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
38880 and extract-odd permutations. */
38881
38882 static bool
38883 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
38884 {
38885 rtx t1, t2, t3;
38886
38887 switch (d->vmode)
38888 {
38889 case V4DFmode:
38890 t1 = gen_reg_rtx (V4DFmode);
38891 t2 = gen_reg_rtx (V4DFmode);
38892
38893 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38894 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
38895 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
38896
38897 /* Now an unpck[lh]pd will produce the result required. */
38898 if (odd)
38899 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
38900 else
38901 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
38902 emit_insn (t3);
38903 break;
38904
38905 case V8SFmode:
38906 {
38907 int mask = odd ? 0xdd : 0x88;
38908
38909 t1 = gen_reg_rtx (V8SFmode);
38910 t2 = gen_reg_rtx (V8SFmode);
38911 t3 = gen_reg_rtx (V8SFmode);
38912
38913 /* Shuffle within the 128-bit lanes to produce:
38914 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
38915 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
38916 GEN_INT (mask)));
38917
38918 /* Shuffle the lanes around to produce:
38919 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
38920 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
38921 GEN_INT (0x3)));
38922
38923 /* Shuffle within the 128-bit lanes to produce:
38924 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
38925 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
38926
38927 /* Shuffle within the 128-bit lanes to produce:
38928 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
38929 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
38930
38931 /* Shuffle the lanes around to produce:
38932 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
38933 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
38934 GEN_INT (0x20)));
38935 }
38936 break;
38937
38938 case V2DFmode:
38939 case V4SFmode:
38940 case V2DImode:
38941 case V4SImode:
38942 /* These are always directly implementable by expand_vec_perm_1. */
38943 gcc_unreachable ();
38944
38945 case V8HImode:
38946 if (TARGET_SSSE3)
38947 return expand_vec_perm_pshufb2 (d);
38948 else
38949 {
38950 /* We need 2*log2(N)-1 operations to achieve odd/even
38951 with interleave. */
38952 t1 = gen_reg_rtx (V8HImode);
38953 t2 = gen_reg_rtx (V8HImode);
38954 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
38955 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
38956 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
38957 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
38958 if (odd)
38959 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
38960 else
38961 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
38962 emit_insn (t3);
38963 }
38964 break;
38965
38966 case V16QImode:
38967 if (TARGET_SSSE3)
38968 return expand_vec_perm_pshufb2 (d);
38969 else
38970 {
38971 t1 = gen_reg_rtx (V16QImode);
38972 t2 = gen_reg_rtx (V16QImode);
38973 t3 = gen_reg_rtx (V16QImode);
38974 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
38975 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
38976 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
38977 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
38978 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
38979 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
38980 if (odd)
38981 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
38982 else
38983 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
38984 emit_insn (t3);
38985 }
38986 break;
38987
38988 case V16HImode:
38989 case V32QImode:
38990 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
38991
38992 case V4DImode:
38993 if (!TARGET_AVX2)
38994 {
38995 struct expand_vec_perm_d d_copy = *d;
38996 d_copy.vmode = V4DFmode;
38997 d_copy.target = gen_lowpart (V4DFmode, d->target);
38998 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
38999 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39000 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39001 }
39002
39003 t1 = gen_reg_rtx (V4DImode);
39004 t2 = gen_reg_rtx (V4DImode);
39005
39006 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39007 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39008 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39009
39010 /* Now a vpunpck[lh]qdq will produce the result required. */
39011 if (odd)
39012 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39013 else
39014 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39015 emit_insn (t3);
39016 break;
39017
39018 case V8SImode:
39019 if (!TARGET_AVX2)
39020 {
39021 struct expand_vec_perm_d d_copy = *d;
39022 d_copy.vmode = V8SFmode;
39023 d_copy.target = gen_lowpart (V8SFmode, d->target);
39024 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39025 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39026 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39027 }
39028
39029 t1 = gen_reg_rtx (V8SImode);
39030 t2 = gen_reg_rtx (V8SImode);
39031
39032 /* Shuffle the lanes around into
39033 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39034 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39035 gen_lowpart (V4DImode, d->op0),
39036 gen_lowpart (V4DImode, d->op1),
39037 GEN_INT (0x20)));
39038 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39039 gen_lowpart (V4DImode, d->op0),
39040 gen_lowpart (V4DImode, d->op1),
39041 GEN_INT (0x31)));
39042
39043 /* Swap the 2nd and 3rd position in each lane into
39044 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39045 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39046 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39047 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39048 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39049
39050 /* Now a vpunpck[lh]qdq will produce
39051 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
39052 if (odd)
39053 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
39054 gen_lowpart (V4DImode, t1),
39055 gen_lowpart (V4DImode, t2));
39056 else
39057 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
39058 gen_lowpart (V4DImode, t1),
39059 gen_lowpart (V4DImode, t2));
39060 emit_insn (t3);
39061 break;
39062
39063 default:
39064 gcc_unreachable ();
39065 }
39066
39067 return true;
39068 }
39069
39070 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39071 extract-even and extract-odd permutations. */
39072
39073 static bool
39074 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
39075 {
39076 unsigned i, odd, nelt = d->nelt;
39077
39078 odd = d->perm[0];
39079 if (odd != 0 && odd != 1)
39080 return false;
39081
39082 for (i = 1; i < nelt; ++i)
39083 if (d->perm[i] != 2 * i + odd)
39084 return false;
39085
39086 return expand_vec_perm_even_odd_1 (d, odd);
39087 }
39088
39089 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
39090 permutations. We assume that expand_vec_perm_1 has already failed. */
39091
39092 static bool
39093 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
39094 {
39095 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
39096 enum machine_mode vmode = d->vmode;
39097 unsigned char perm2[4];
39098 rtx op0 = d->op0;
39099 bool ok;
39100
39101 switch (vmode)
39102 {
39103 case V4DFmode:
39104 case V8SFmode:
39105 /* These are special-cased in sse.md so that we can optionally
39106 use the vbroadcast instruction. They expand to two insns
39107 if the input happens to be in a register. */
39108 gcc_unreachable ();
39109
39110 case V2DFmode:
39111 case V2DImode:
39112 case V4SFmode:
39113 case V4SImode:
39114 /* These are always implementable using standard shuffle patterns. */
39115 gcc_unreachable ();
39116
39117 case V8HImode:
39118 case V16QImode:
39119 /* These can be implemented via interleave. We save one insn by
39120 stopping once we have promoted to V4SImode and then use pshufd. */
39121 do
39122 {
39123 rtx dest;
39124 rtx (*gen) (rtx, rtx, rtx)
39125 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
39126 : gen_vec_interleave_lowv8hi;
39127
39128 if (elt >= nelt2)
39129 {
39130 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
39131 : gen_vec_interleave_highv8hi;
39132 elt -= nelt2;
39133 }
39134 nelt2 /= 2;
39135
39136 dest = gen_reg_rtx (vmode);
39137 emit_insn (gen (dest, op0, op0));
39138 vmode = get_mode_wider_vector (vmode);
39139 op0 = gen_lowpart (vmode, dest);
39140 }
39141 while (vmode != V4SImode);
39142
39143 memset (perm2, elt, 4);
39144 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
39145 d->testing_p);
39146 gcc_assert (ok);
39147 return true;
39148
39149 case V32QImode:
39150 case V16HImode:
39151 case V8SImode:
39152 case V4DImode:
39153 /* For AVX2 broadcasts of the first element vpbroadcast* or
39154 vpermq should be used by expand_vec_perm_1. */
39155 gcc_assert (!TARGET_AVX2 || d->perm[0]);
39156 return false;
39157
39158 default:
39159 gcc_unreachable ();
39160 }
39161 }
39162
39163 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39164 broadcast permutations. */
39165
39166 static bool
39167 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
39168 {
39169 unsigned i, elt, nelt = d->nelt;
39170
39171 if (!d->one_operand_p)
39172 return false;
39173
39174 elt = d->perm[0];
39175 for (i = 1; i < nelt; ++i)
39176 if (d->perm[i] != elt)
39177 return false;
39178
39179 return expand_vec_perm_broadcast_1 (d);
39180 }
39181
39182 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
39183 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
39184 all the shorter instruction sequences. */
39185
39186 static bool
39187 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
39188 {
39189 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
39190 unsigned int i, nelt, eltsz;
39191 bool used[4];
39192
39193 if (!TARGET_AVX2
39194 || d->one_operand_p
39195 || (d->vmode != V32QImode && d->vmode != V16HImode))
39196 return false;
39197
39198 if (d->testing_p)
39199 return true;
39200
39201 nelt = d->nelt;
39202 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39203
39204 /* Generate 4 permutation masks. If the required element is within
39205 the same lane, it is shuffled in.  If the required element is from
39206 the other lane, force a zero by setting bit 7 in the permutation mask.
39207 The other mask has non-negative elements where an element is requested
39208 from the other lane; such elements are also moved to the other lane,
39209 so that the result of vpshufb can have its two V2TImode halves
39210 swapped.  */
39211 m128 = GEN_INT (-128);
39212 for (i = 0; i < 32; ++i)
39213 {
39214 rperm[0][i] = m128;
39215 rperm[1][i] = m128;
39216 rperm[2][i] = m128;
39217 rperm[3][i] = m128;
39218 }
39219 used[0] = false;
39220 used[1] = false;
39221 used[2] = false;
39222 used[3] = false;
39223 for (i = 0; i < nelt; ++i)
39224 {
39225 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39226 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39227 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
39228
39229 for (j = 0; j < eltsz; ++j)
39230 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
39231 used[which] = true;
39232 }
39233
39234 for (i = 0; i < 2; ++i)
39235 {
39236 if (!used[2 * i + 1])
39237 {
39238 h[i] = NULL_RTX;
39239 continue;
39240 }
39241 vperm = gen_rtx_CONST_VECTOR (V32QImode,
39242 gen_rtvec_v (32, rperm[2 * i + 1]));
39243 vperm = force_reg (V32QImode, vperm);
39244 h[i] = gen_reg_rtx (V32QImode);
39245 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39246 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
39247 }
39248
39249 /* Swap the 128-bit lanes of h[X]. */
39250 for (i = 0; i < 2; ++i)
39251 {
39252 if (h[i] == NULL_RTX)
39253 continue;
39254 op = gen_reg_rtx (V4DImode);
39255 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
39256 const2_rtx, GEN_INT (3), const0_rtx,
39257 const1_rtx));
39258 h[i] = gen_lowpart (V32QImode, op);
39259 }
39260
39261 for (i = 0; i < 2; ++i)
39262 {
39263 if (!used[2 * i])
39264 {
39265 l[i] = NULL_RTX;
39266 continue;
39267 }
39268 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
39269 vperm = force_reg (V32QImode, vperm);
39270 l[i] = gen_reg_rtx (V32QImode);
39271 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39272 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
39273 }
39274
39275 for (i = 0; i < 2; ++i)
39276 {
39277 if (h[i] && l[i])
39278 {
39279 op = gen_reg_rtx (V32QImode);
39280 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
39281 l[i] = op;
39282 }
39283 else if (h[i])
39284 l[i] = h[i];
39285 }
39286
39287 gcc_assert (l[0] && l[1]);
39288 op = gen_lowpart (V32QImode, d->target);
39289 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
39290 return true;
39291 }
39292
39293 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
39294 With all of the interface bits taken care of, perform the expansion
39295 in D and return true on success. */
39296
39297 static bool
39298 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
39299 {
39300 /* Try a single instruction expansion. */
39301 if (expand_vec_perm_1 (d))
39302 return true;
39303
39304 /* Try sequences of two instructions. */
39305
39306 if (expand_vec_perm_pshuflw_pshufhw (d))
39307 return true;
39308
39309 if (expand_vec_perm_palignr (d))
39310 return true;
39311
39312 if (expand_vec_perm_interleave2 (d))
39313 return true;
39314
39315 if (expand_vec_perm_broadcast (d))
39316 return true;
39317
39318 if (expand_vec_perm_vpermq_perm_1 (d))
39319 return true;
39320
39321 if (expand_vec_perm_vperm2f128 (d))
39322 return true;
39323
39324 /* Try sequences of three instructions. */
39325
39326 if (expand_vec_perm_2vperm2f128_vshuf (d))
39327 return true;
39328
39329 if (expand_vec_perm_pshufb2 (d))
39330 return true;
39331
39332 if (expand_vec_perm_interleave3 (d))
39333 return true;
39334
39335 if (expand_vec_perm_vperm2f128_vblend (d))
39336 return true;
39337
39338 /* Try sequences of four instructions. */
39339
39340 if (expand_vec_perm_vpshufb2_vpermq (d))
39341 return true;
39342
39343 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
39344 return true;
39345
39346 /* ??? Look for narrow permutations whose element orderings would
39347 allow the promotion to a wider mode. */
39348
39349 /* ??? Look for sequences of interleave or a wider permute that place
39350 the data into the correct lanes for a half-vector shuffle like
39351 pshuf[lh]w or vpermilps. */
39352
39353 /* ??? Look for sequences of interleave that produce the desired results.
39354 The combinatorics of punpck[lh] get pretty ugly... */
39355
39356 if (expand_vec_perm_even_odd (d))
39357 return true;
39358
39359 /* Even longer sequences. */
39360 if (expand_vec_perm_vpshufb4_vpermq2 (d))
39361 return true;
39362
39363 return false;
39364 }
39365
39366 /* If a permutation only uses one operand, make it clear. Returns true
39367 if the permutation references both operands. */
39368
39369 static bool
39370 canonicalize_perm (struct expand_vec_perm_d *d)
39371 {
39372 int i, which, nelt = d->nelt;
39373
39374 for (i = which = 0; i < nelt; ++i)
39375 which |= (d->perm[i] < nelt ? 1 : 2);
39376
39377 d->one_operand_p = true;
39378 switch (which)
39379 {
39380 default:
39381 gcc_unreachable();
39382
39383 case 3:
39384 if (!rtx_equal_p (d->op0, d->op1))
39385 {
39386 d->one_operand_p = false;
39387 break;
39388 }
39389 /* The elements of PERM do not suggest that only the first operand
39390 is used, but both operands are identical. Allow easier matching
39391 of the permutation by folding the permutation into the single
39392 input vector. */
39393 /* FALLTHRU */
39394
39395 case 2:
39396 for (i = 0; i < nelt; ++i)
39397 d->perm[i] &= nelt - 1;
39398 d->op0 = d->op1;
39399 break;
39400
39401 case 1:
39402 d->op1 = d->op0;
39403 break;
39404 }
39405
39406 return (which == 3);
39407 }
39408
39409 bool
39410 ix86_expand_vec_perm_const (rtx operands[4])
39411 {
39412 struct expand_vec_perm_d d;
39413 unsigned char perm[MAX_VECT_LEN];
39414 int i, nelt;
39415 bool two_args;
39416 rtx sel;
39417
39418 d.target = operands[0];
39419 d.op0 = operands[1];
39420 d.op1 = operands[2];
39421 sel = operands[3];
39422
39423 d.vmode = GET_MODE (d.target);
39424 gcc_assert (VECTOR_MODE_P (d.vmode));
39425 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39426 d.testing_p = false;
39427
39428 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
39429 gcc_assert (XVECLEN (sel, 0) == nelt);
39430 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
39431
39432 for (i = 0; i < nelt; ++i)
39433 {
39434 rtx e = XVECEXP (sel, 0, i);
39435 int ei = INTVAL (e) & (2 * nelt - 1);
39436 d.perm[i] = ei;
39437 perm[i] = ei;
39438 }
39439
39440 two_args = canonicalize_perm (&d);
39441
39442 if (ix86_expand_vec_perm_const_1 (&d))
39443 return true;
39444
39445 /* If the selector says both arguments are needed, but the operands are the
39446 same, the above tried to expand with one_operand_p and flattened selector.
39447 If that didn't work, retry without one_operand_p; we succeeded with that
39448 during testing. */
39449 if (two_args && d.one_operand_p)
39450 {
39451 d.one_operand_p = false;
39452 memcpy (d.perm, perm, sizeof (perm));
39453 return ix86_expand_vec_perm_const_1 (&d);
39454 }
39455
39456 return false;
39457 }
39458
39459 /* Implement targetm.vectorize.vec_perm_const_ok. */
39460
39461 static bool
39462 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
39463 const unsigned char *sel)
39464 {
39465 struct expand_vec_perm_d d;
39466 unsigned int i, nelt, which;
39467 bool ret;
39468
39469 d.vmode = vmode;
39470 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39471 d.testing_p = true;
39472
39473 /* Given sufficient ISA support we can just return true here
39474 for selected vector modes. */
39475 if (GET_MODE_SIZE (d.vmode) == 16)
39476 {
39477 /* All implementable with a single vpperm insn. */
39478 if (TARGET_XOP)
39479 return true;
39480 /* All implementable with 2 pshufb + 1 ior. */
39481 if (TARGET_SSSE3)
39482 return true;
39483 /* All implementable with shufpd or unpck[lh]pd. */
39484 if (d.nelt == 2)
39485 return true;
39486 }
39487
39488 /* Extract the values from the vector CST into the permutation
39489 array in D. */
39490 memcpy (d.perm, sel, nelt);
39491 for (i = which = 0; i < nelt; ++i)
39492 {
39493 unsigned char e = d.perm[i];
39494 gcc_assert (e < 2 * nelt);
39495 which |= (e < nelt ? 1 : 2);
39496 }
39497
39498 /* For all elements from second vector, fold the elements to first. */
39499 if (which == 2)
39500 for (i = 0; i < nelt; ++i)
39501 d.perm[i] -= nelt;
39502
39503 /* Check whether the mask can be applied to the vector type. */
39504 d.one_operand_p = (which != 3);
39505
39506 /* Implementable with shufps or pshufd. */
39507 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
39508 return true;
39509
39510 /* Otherwise we have to go through the motions and see if we can
39511 figure out how to generate the requested permutation. */
39512 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
39513 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
39514 if (!d.one_operand_p)
39515 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
39516
39517 start_sequence ();
39518 ret = ix86_expand_vec_perm_const_1 (&d);
39519 end_sequence ();
39520
39521 return ret;
39522 }
39523
39524 void
39525 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
39526 {
39527 struct expand_vec_perm_d d;
39528 unsigned i, nelt;
39529
39530 d.target = targ;
39531 d.op0 = op0;
39532 d.op1 = op1;
39533 d.vmode = GET_MODE (targ);
39534 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39535 d.one_operand_p = false;
39536 d.testing_p = false;
39537
39538 for (i = 0; i < nelt; ++i)
39539 d.perm[i] = i * 2 + odd;
39540
39541 /* We'll either be able to implement the permutation directly... */
39542 if (expand_vec_perm_1 (&d))
39543 return;
39544
39545 /* ... or we use the special-case patterns. */
39546 expand_vec_perm_even_odd_1 (&d, odd);
39547 }
39548
39549 static void
39550 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
39551 {
39552 struct expand_vec_perm_d d;
39553 unsigned i, nelt, base;
39554 bool ok;
39555
39556 d.target = targ;
39557 d.op0 = op0;
39558 d.op1 = op1;
39559 d.vmode = GET_MODE (targ);
39560 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39561 d.one_operand_p = false;
39562 d.testing_p = false;
39563
39564 base = high_p ? nelt / 2 : 0;
39565 for (i = 0; i < nelt / 2; ++i)
39566 {
39567 d.perm[i * 2] = i + base;
39568 d.perm[i * 2 + 1] = i + base + nelt;
39569 }
39570
39571 /* Note that for AVX this isn't one instruction. */
39572 ok = ix86_expand_vec_perm_const_1 (&d);
39573 gcc_assert (ok);
39574 }
39575
39576
39577 /* Expand a vector operation CODE for a V*QImode in terms of the
39578 same operation on V*HImode. */
39579
39580 void
39581 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
39582 {
39583 enum machine_mode qimode = GET_MODE (dest);
39584 enum machine_mode himode;
39585 rtx (*gen_il) (rtx, rtx, rtx);
39586 rtx (*gen_ih) (rtx, rtx, rtx);
39587 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
39588 struct expand_vec_perm_d d;
39589 bool ok, full_interleave;
39590 bool uns_p = false;
39591 int i;
39592
39593 switch (qimode)
39594 {
39595 case V16QImode:
39596 himode = V8HImode;
39597 gen_il = gen_vec_interleave_lowv16qi;
39598 gen_ih = gen_vec_interleave_highv16qi;
39599 break;
39600 case V32QImode:
39601 himode = V16HImode;
39602 gen_il = gen_avx2_interleave_lowv32qi;
39603 gen_ih = gen_avx2_interleave_highv32qi;
39604 break;
39605 default:
39606 gcc_unreachable ();
39607 }
39608
39609 op2_l = op2_h = op2;
39610 switch (code)
39611 {
39612 case MULT:
39613 /* Unpack data such that we've got a source byte in each low byte of
39614 each word. We don't care what goes into the high byte of each word.
39615 Rather than trying to get zero in there, the most convenient thing is to let
39616 it be a copy of the low byte. */
39617 op2_l = gen_reg_rtx (qimode);
39618 op2_h = gen_reg_rtx (qimode);
39619 emit_insn (gen_il (op2_l, op2, op2));
39620 emit_insn (gen_ih (op2_h, op2, op2));
39621 /* FALLTHRU */
39622
39623 op1_l = gen_reg_rtx (qimode);
39624 op1_h = gen_reg_rtx (qimode);
39625 emit_insn (gen_il (op1_l, op1, op1));
39626 emit_insn (gen_ih (op1_h, op1, op1));
39627 full_interleave = qimode == V16QImode;
39628 break;
39629
39630 case ASHIFT:
39631 case LSHIFTRT:
39632 uns_p = true;
39633 /* FALLTHRU */
39634 case ASHIFTRT:
39635 op1_l = gen_reg_rtx (himode);
39636 op1_h = gen_reg_rtx (himode);
39637 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
39638 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
39639 full_interleave = true;
39640 break;
39641 default:
39642 gcc_unreachable ();
39643 }
39644
39645 /* Perform the operation. */
39646 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
39647 1, OPTAB_DIRECT);
39648 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
39649 1, OPTAB_DIRECT);
39650 gcc_assert (res_l && res_h);
39651
39652 /* Merge the data back into the right place. */
39653 d.target = dest;
39654 d.op0 = gen_lowpart (qimode, res_l);
39655 d.op1 = gen_lowpart (qimode, res_h);
39656 d.vmode = qimode;
39657 d.nelt = GET_MODE_NUNITS (qimode);
39658 d.one_operand_p = false;
39659 d.testing_p = false;
39660
39661 if (full_interleave)
39662 {
39663 /* For SSE2, we used a full interleave, so the desired
39664 results are in the even elements. */
39665 for (i = 0; i < 32; ++i)
39666 d.perm[i] = i * 2;
39667 }
39668 else
39669 {
39670 /* For AVX, the interleave used above was not cross-lane. So the
39671 extraction is of the even elements, but with the second and third quarters swapped.
39672 Happily, that is even one insn shorter than even extraction. */
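	  /* For V32QImode the selector built below is (illustration only)
	       { 0, 2, ..., 14, 32, 34, ..., 46, 16, 18, ..., 30, 48, ..., 62 },
	     i.e. the even bytes of res_l and res_h with the middle two
	     quarters exchanged.  */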
39673 for (i = 0; i < 32; ++i)
39674 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
39675 }
39676
39677 ok = ix86_expand_vec_perm_const_1 (&d);
39678 gcc_assert (ok);
39679
39680 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39681 gen_rtx_fmt_ee (code, qimode, op1, op2));
39682 }
39683
39684 void
39685 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
39686 bool uns_p, bool odd_p)
39687 {
39688 enum machine_mode mode = GET_MODE (op1);
39689 enum machine_mode wmode = GET_MODE (dest);
39690 rtx x;
39691
39692 /* We only play even/odd games with vectors of SImode. */
39693 gcc_assert (mode == V4SImode || mode == V8SImode);
39694
39695 /* If we're looking for the odd results, shift those members down to
39696 the even slots. For some cpus this is faster than a PSHUFD. */
39697 if (odd_p)
39698 {
39699 if (TARGET_XOP && mode == V4SImode)
39700 {
39701 x = force_reg (wmode, CONST0_RTX (wmode));
39702 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
39703 return;
39704 }
39705
39706 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
39707 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
39708 x, NULL, 1, OPTAB_DIRECT);
39709 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
39710 x, NULL, 1, OPTAB_DIRECT);
39711 op1 = gen_lowpart (mode, op1);
39712 op2 = gen_lowpart (mode, op2);
39713 }
39714
39715 if (mode == V8SImode)
39716 {
39717 if (uns_p)
39718 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
39719 else
39720 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
39721 }
39722 else if (uns_p)
39723 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
39724 else if (TARGET_SSE4_1)
39725 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
39726 else
39727 {
39728 rtx s1, s2, t0, t1, t2;
39729
39730 /* The easiest way to implement this without PMULDQ is to go through
39731 the motions as if we are performing a full 64-bit multiply, except
39732 that we need to do less shuffling of the elements. */
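      /* Editor's sketch of the arithmetic, for illustration: with A and B
	 read as unsigned 32-bit values, the signed product modulo 2^64 is
	   A * B - 2^32 * (B * [A < 0] + A * [B < 0])
	 where [x] is 1 if the condition holds and 0 otherwise.  S1 and S2
	 below are the 0 / -1 sign masks, so the two cross products, once
	 summed and shifted left by 32, contribute exactly that correction
	 term.  */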
39733
39734 /* Compute the sign-extension, aka highparts, of the two operands. */
39735 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39736 op1, pc_rtx, pc_rtx);
39737 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39738 op2, pc_rtx, pc_rtx);
39739
39740 /* Multiply LO(A) * HI(B), and vice-versa. */
39741 t1 = gen_reg_rtx (wmode);
39742 t2 = gen_reg_rtx (wmode);
39743 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
39744 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
39745
39746 /* Multiply LO(A) * LO(B). */
39747 t0 = gen_reg_rtx (wmode);
39748 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
39749
39750 /* Combine and shift the highparts into place. */
39751 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
39752 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
39753 1, OPTAB_DIRECT);
39754
39755 /* Combine high and low parts. */
39756 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
39757 return;
39758 }
39759 emit_insn (x);
39760 }
39761
39762 void
39763 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
39764 bool uns_p, bool high_p)
39765 {
39766 enum machine_mode wmode = GET_MODE (dest);
39767 enum machine_mode mode = GET_MODE (op1);
39768 rtx t1, t2, t3, t4, mask;
39769
39770 switch (mode)
39771 {
39772 case V4SImode:
39773 t1 = gen_reg_rtx (mode);
39774 t2 = gen_reg_rtx (mode);
39775 if (TARGET_XOP && !uns_p)
39776 {
39777 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
39778 shuffle the elements once so that all elements are in the right
39779 place for immediate use: { A C B D }. */
39780 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
39781 const1_rtx, GEN_INT (3)));
39782 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
39783 const1_rtx, GEN_INT (3)));
39784 }
39785 else
39786 {
39787 /* Put the elements into place for the multiply. */
39788 ix86_expand_vec_interleave (t1, op1, op1, high_p);
39789 ix86_expand_vec_interleave (t2, op2, op2, high_p);
39790 high_p = false;
39791 }
39792 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
39793 break;
39794
39795 case V8SImode:
39796 /* Shuffle the elements between the lanes. After this we
39797 have { A B E F | C D G H } for each operand. */
39798 t1 = gen_reg_rtx (V4DImode);
39799 t2 = gen_reg_rtx (V4DImode);
39800 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
39801 const0_rtx, const2_rtx,
39802 const1_rtx, GEN_INT (3)));
39803 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
39804 const0_rtx, const2_rtx,
39805 const1_rtx, GEN_INT (3)));
39806
39807 /* Shuffle the elements within the lanes. After this we
39808 have { A A B B | C C D D } or { E E F F | G G H H }. */
39809 t3 = gen_reg_rtx (V8SImode);
39810 t4 = gen_reg_rtx (V8SImode);
39811 mask = GEN_INT (high_p
39812 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
39813 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
39814 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
39815 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
39816
39817 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
39818 break;
39819
39820 case V8HImode:
39821 case V16HImode:
39822 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
39823 uns_p, OPTAB_DIRECT);
39824 t2 = expand_binop (mode,
39825 uns_p ? umul_highpart_optab : smul_highpart_optab,
39826 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
39827 gcc_assert (t1 && t2);
39828
39829 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
39830 break;
39831
39832 case V16QImode:
39833 case V32QImode:
39834 t1 = gen_reg_rtx (wmode);
39835 t2 = gen_reg_rtx (wmode);
39836 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
39837 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
39838
39839 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
39840 break;
39841
39842 default:
39843 gcc_unreachable ();
39844 }
39845 }
39846
39847 void
39848 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
39849 {
39850 rtx res_1, res_2;
39851
39852 res_1 = gen_reg_rtx (V4SImode);
39853 res_2 = gen_reg_rtx (V4SImode);
39854 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
39855 op1, op2, true, false);
39856 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
39857 op1, op2, true, true);
39858
39859 /* Move the results in element 2 down to element 1; we don't care
39860 what goes in elements 2 and 3. Then we can merge the parts
39861 back together with an interleave.
39862
39863 Note that two other sequences were tried:
39864 (1) Use interleaves at the start instead of psrldq, which allows
39865 us to use a single shufps to merge things back at the end.
39866 (2) Use shufps here to combine the two vectors, then pshufd to
39867 put the elements in the correct order.
39868 In both cases the cost of the reformatting stall was too high
39869 and the overall sequence slower. */
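  /* Data flow (editor's illustration): with op1 = { a0 a1 a2 a3 } and
     op2 = { b0 b1 b2 b3 }, res_1 holds the 64-bit products a0*b0 and a2*b2
     while res_2 holds a1*b1 and a3*b3.  The two pshufd insns below move
     the low halves of those products into elements 0 and 1, and the final
     interleave produces { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }.  */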
39870
39871 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
39872 const0_rtx, const0_rtx));
39873 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
39874 const0_rtx, const0_rtx));
39875 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
39876
39877 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
39878 }
39879
39880 void
39881 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
39882 {
39883 enum machine_mode mode = GET_MODE (op0);
39884 rtx t1, t2, t3, t4, t5, t6;
39885
39886 if (TARGET_XOP && mode == V2DImode)
39887 {
39888 /* op1: A,B,C,D, op2: E,F,G,H */
39889 op1 = gen_lowpart (V4SImode, op1);
39890 op2 = gen_lowpart (V4SImode, op2);
39891
39892 t1 = gen_reg_rtx (V4SImode);
39893 t2 = gen_reg_rtx (V4SImode);
39894 t3 = gen_reg_rtx (V2DImode);
39895 t4 = gen_reg_rtx (V2DImode);
39896
39897 /* t1: B,A,D,C */
39898 emit_insn (gen_sse2_pshufd_1 (t1, op1,
39899 GEN_INT (1),
39900 GEN_INT (0),
39901 GEN_INT (3),
39902 GEN_INT (2)));
39903
39904 /* t2: (B*E),(A*F),(D*G),(C*H) */
39905 emit_insn (gen_mulv4si3 (t2, t1, op2));
39906
39907 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
39908 emit_insn (gen_xop_phadddq (t3, t2));
39909
39910 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
39911 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
39912
39913 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
39914 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
39915 }
39916 else
39917 {
39918 enum machine_mode nmode;
39919 rtx (*umul) (rtx, rtx, rtx);
39920
39921 if (mode == V2DImode)
39922 {
39923 umul = gen_vec_widen_umult_even_v4si;
39924 nmode = V4SImode;
39925 }
39926 else if (mode == V4DImode)
39927 {
39928 umul = gen_vec_widen_umult_even_v8si;
39929 nmode = V8SImode;
39930 }
39931 else
39932 gcc_unreachable ();
39933
39934
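      /* Editor's note on the decomposition used below: writing each 64-bit
	 element as hi * 2^32 + lo, the product modulo 2^64 is
	   lo1 * lo2 + ((hi1 * lo2 + hi2 * lo1) << 32),
	 so one widening unsigned multiply handles the low halves and two
	 more, after shifting the inputs right by 32, supply the shifted
	 cross terms.  */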
39935 /* Multiply low parts. */
39936 t1 = gen_reg_rtx (mode);
39937 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
39938
39939 /* Shift input vectors right 32 bits so we can multiply high parts. */
39940 t6 = GEN_INT (32);
39941 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
39942 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
39943
39944 /* Multiply high parts by low parts. */
39945 t4 = gen_reg_rtx (mode);
39946 t5 = gen_reg_rtx (mode);
39947 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
39948 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
39949
39950 /* Combine and shift the highparts back. */
39951 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
39952 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
39953
39954 /* Combine high and low parts. */
39955 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
39956 }
39957
39958 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39959 gen_rtx_MULT (mode, op1, op2));
39960 }
39961
39962 /* Expand an insert into a vector register through pinsr insn.
39963 Return true if successful. */
39964
39965 bool
39966 ix86_expand_pinsr (rtx *operands)
39967 {
39968 rtx dst = operands[0];
39969 rtx src = operands[3];
39970
39971 unsigned int size = INTVAL (operands[1]);
39972 unsigned int pos = INTVAL (operands[2]);
39973
39974 if (GET_CODE (dst) == SUBREG)
39975 {
39976 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
39977 dst = SUBREG_REG (dst);
39978 }
39979
39980 if (GET_CODE (src) == SUBREG)
39981 src = SUBREG_REG (src);
39982
39983 switch (GET_MODE (dst))
39984 {
39985 case V16QImode:
39986 case V8HImode:
39987 case V4SImode:
39988 case V2DImode:
39989 {
39990 enum machine_mode srcmode, dstmode;
39991 rtx (*pinsr)(rtx, rtx, rtx, rtx);
39992
39993 srcmode = mode_for_size (size, MODE_INT, 0);
39994
39995 switch (srcmode)
39996 {
39997 case QImode:
39998 if (!TARGET_SSE4_1)
39999 return false;
40000 dstmode = V16QImode;
40001 pinsr = gen_sse4_1_pinsrb;
40002 break;
40003
40004 case HImode:
40005 if (!TARGET_SSE2)
40006 return false;
40007 dstmode = V8HImode;
40008 pinsr = gen_sse2_pinsrw;
40009 break;
40010
40011 case SImode:
40012 if (!TARGET_SSE4_1)
40013 return false;
40014 dstmode = V4SImode;
40015 pinsr = gen_sse4_1_pinsrd;
40016 break;
40017
40018 case DImode:
40019 gcc_assert (TARGET_64BIT);
40020 if (!TARGET_SSE4_1)
40021 return false;
40022 dstmode = V2DImode;
40023 pinsr = gen_sse4_1_pinsrq;
40024 break;
40025
40026 default:
40027 return false;
40028 }
40029
40030 dst = gen_lowpart (dstmode, dst);
40031 src = gen_lowpart (srcmode, src);
40032
40033 pos /= size;
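	/* For example (illustration only): inserting a QImode value at bit
	   offset 24 of a V16QImode destination gives pos = 24 / 8 = 3, so
	   the pinsrb below is emitted with the selector GEN_INT (1 << 3).  */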
40034
40035 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40036 return true;
40037 }
40038
40039 default:
40040 return false;
40041 }
40042 }
40043 \f
40044 /* This function returns the calling-ABI-specific va_list type node.
40045 It returns the FNDECL-specific va_list type. */
40046
40047 static tree
40048 ix86_fn_abi_va_list (tree fndecl)
40049 {
40050 if (!TARGET_64BIT)
40051 return va_list_type_node;
40052 gcc_assert (fndecl != NULL_TREE);
40053
40054 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
40055 return ms_va_list_type_node;
40056 else
40057 return sysv_va_list_type_node;
40058 }
40059
40060 /* Returns the canonical va_list type specified by TYPE. If there
40061 is no valid TYPE provided, it returns NULL_TREE. */
40062
40063 static tree
40064 ix86_canonical_va_list_type (tree type)
40065 {
40066 tree wtype, htype;
40067
40068 /* Resolve references and pointers to va_list type. */
40069 if (TREE_CODE (type) == MEM_REF)
40070 type = TREE_TYPE (type);
40071 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
40072 type = TREE_TYPE (type);
40073 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
40074 type = TREE_TYPE (type);
40075
40076 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
40077 {
40078 wtype = va_list_type_node;
40079 gcc_assert (wtype != NULL_TREE);
40080 htype = type;
40081 if (TREE_CODE (wtype) == ARRAY_TYPE)
40082 {
40083 /* If va_list is an array type, the argument may have decayed
40084 to a pointer type, e.g. by being passed to another function.
40085 In that case, unwrap both types so that we can compare the
40086 underlying records. */
40087 if (TREE_CODE (htype) == ARRAY_TYPE
40088 || POINTER_TYPE_P (htype))
40089 {
40090 wtype = TREE_TYPE (wtype);
40091 htype = TREE_TYPE (htype);
40092 }
40093 }
40094 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40095 return va_list_type_node;
40096 wtype = sysv_va_list_type_node;
40097 gcc_assert (wtype != NULL_TREE);
40098 htype = type;
40099 if (TREE_CODE (wtype) == ARRAY_TYPE)
40100 {
40101 /* If va_list is an array type, the argument may have decayed
40102 to a pointer type, e.g. by being passed to another function.
40103 In that case, unwrap both types so that we can compare the
40104 underlying records. */
40105 if (TREE_CODE (htype) == ARRAY_TYPE
40106 || POINTER_TYPE_P (htype))
40107 {
40108 wtype = TREE_TYPE (wtype);
40109 htype = TREE_TYPE (htype);
40110 }
40111 }
40112 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40113 return sysv_va_list_type_node;
40114 wtype = ms_va_list_type_node;
40115 gcc_assert (wtype != NULL_TREE);
40116 htype = type;
40117 if (TREE_CODE (wtype) == ARRAY_TYPE)
40118 {
40119 /* If va_list is an array type, the argument may have decayed
40120 to a pointer type, e.g. by being passed to another function.
40121 In that case, unwrap both types so that we can compare the
40122 underlying records. */
40123 if (TREE_CODE (htype) == ARRAY_TYPE
40124 || POINTER_TYPE_P (htype))
40125 {
40126 wtype = TREE_TYPE (wtype);
40127 htype = TREE_TYPE (htype);
40128 }
40129 }
40130 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40131 return ms_va_list_type_node;
40132 return NULL_TREE;
40133 }
40134 return std_canonical_va_list_type (type);
40135 }
40136
40137 /* Iterate through the target-specific builtin types for va_list.
40138 IDX denotes the iterator, *PTREE is set to the result type of
40139 the va_list builtin, and *PNAME to its internal name.
40140 Returns zero if there is no element for this index, otherwise
40141 IDX should be increased upon the next call.
40142 Note, do not iterate a base builtin's name like __builtin_va_list.
40143 Used from c_common_nodes_and_builtins. */
40144
40145 static int
40146 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
40147 {
40148 if (TARGET_64BIT)
40149 {
40150 switch (idx)
40151 {
40152 default:
40153 break;
40154
40155 case 0:
40156 *ptree = ms_va_list_type_node;
40157 *pname = "__builtin_ms_va_list";
40158 return 1;
40159
40160 case 1:
40161 *ptree = sysv_va_list_type_node;
40162 *pname = "__builtin_sysv_va_list";
40163 return 1;
40164 }
40165 }
40166
40167 return 0;
40168 }
40169
40170 #undef TARGET_SCHED_DISPATCH
40171 #define TARGET_SCHED_DISPATCH has_dispatch
40172 #undef TARGET_SCHED_DISPATCH_DO
40173 #define TARGET_SCHED_DISPATCH_DO do_dispatch
40174 #undef TARGET_SCHED_REASSOCIATION_WIDTH
40175 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
40176 #undef TARGET_SCHED_REORDER
40177 #define TARGET_SCHED_REORDER ix86_sched_reorder
40178 #undef TARGET_SCHED_ADJUST_PRIORITY
40179 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
40180 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
40181 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
40182
40183 /* The size of the dispatch window is the total number of bytes of
40184 object code allowed in a window. */
40185 #define DISPATCH_WINDOW_SIZE 16
40186
40187 /* Number of dispatch windows considered for scheduling. */
40188 #define MAX_DISPATCH_WINDOWS 3
40189
40190 /* Maximum number of instructions in a window. */
40191 #define MAX_INSN 4
40192
40193 /* Maximum number of immediate operands in a window. */
40194 #define MAX_IMM 4
40195
40196 /* Maximum number of immediate bits allowed in a window. */
40197 #define MAX_IMM_SIZE 128
40198
40199 /* Maximum number of 32 bit immediates allowed in a window. */
40200 #define MAX_IMM_32 4
40201
40202 /* Maximum number of 64 bit immediates allowed in a window. */
40203 #define MAX_IMM_64 2
40204
40205 /* Maximum total of loads or prefetches allowed in a window. */
40206 #define MAX_LOAD 2
40207
40208 /* Maximum total of stores allowed in a window. */
40209 #define MAX_STORE 1
40210
40211 #undef BIG
40212 #define BIG 100
40213
40214
40215 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
40216 enum dispatch_group {
40217 disp_no_group = 0,
40218 disp_load,
40219 disp_store,
40220 disp_load_store,
40221 disp_prefetch,
40222 disp_imm,
40223 disp_imm_32,
40224 disp_imm_64,
40225 disp_branch,
40226 disp_cmp,
40227 disp_jcc,
40228 disp_last
40229 };
40230
40231 /* Number of allowable groups in a dispatch window. It is an array
40232 indexed by dispatch_group enum. 100 is used as a big number,
40233 because the number of these kinds of operations does not have any
40234 effect in a dispatch window, but we need entries for them in
40235 the table. */
40236 static unsigned int num_allowable_groups[disp_last] = {
40237 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
40238 };
40239
40240 char group_name[disp_last + 1][16] = {
40241 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
40242 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
40243 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
40244 };
40245
40246 /* Instruction path. */
40247 enum insn_path {
40248 no_path = 0,
40249 path_single, /* Single micro op. */
40250 path_double, /* Double micro op. */
40251 path_multi, /* Instructions with more than 2 micro ops. */
40252 last_path
40253 };
40254
40255 /* sched_insn_info defines a window to the instructions scheduled in
40256 the basic block. It contains a pointer to the insn_info table and
40257 the instruction scheduled.
40258
40259 Windows are allocated for each basic block and are linked
40260 together. */
40261 typedef struct sched_insn_info_s {
40262 rtx insn;
40263 enum dispatch_group group;
40264 enum insn_path path;
40265 int byte_len;
40266 int imm_bytes;
40267 } sched_insn_info;
40268
40269 /* Linked list of dispatch windows. This is a two way list of
40270 dispatch windows of a basic block. It contains information about
40271 the number of uops in the window and the total number of
40272 instructions and of bytes in the object code for this dispatch
40273 window. */
40274 typedef struct dispatch_windows_s {
40275 int num_insn; /* Number of insn in the window. */
40276 int num_uops; /* Number of uops in the window. */
40277 int window_size; /* Number of bytes in the window. */
40278 int window_num; /* Window number, either 0 or 1. */
40279 int num_imm; /* Number of immediates in an insn. */
40280 int num_imm_32; /* Number of 32 bit immediates in an insn. */
40281 int num_imm_64; /* Number of 64 bit immediates in an insn. */
40282 int imm_size; /* Total immediates in the window. */
40283 int num_loads; /* Total memory loads in the window. */
40284 int num_stores; /* Total memory stores in the window. */
40285 int violation; /* Violation exists in window. */
40286 sched_insn_info *window; /* Pointer to the window. */
40287 struct dispatch_windows_s *next;
40288 struct dispatch_windows_s *prev;
40289 } dispatch_windows;
40290
40291 /* Immediate values used in an insn. */
40292 typedef struct imm_info_s
40293 {
40294 int imm;
40295 int imm32;
40296 int imm64;
40297 } imm_info;
40298
40299 static dispatch_windows *dispatch_window_list;
40300 static dispatch_windows *dispatch_window_list1;
40301
40302 /* Get dispatch group of insn. */
40303
40304 static enum dispatch_group
40305 get_mem_group (rtx insn)
40306 {
40307 enum attr_memory memory;
40308
40309 if (INSN_CODE (insn) < 0)
40310 return disp_no_group;
40311 memory = get_attr_memory (insn);
40312 if (memory == MEMORY_STORE)
40313 return disp_store;
40314
40315 if (memory == MEMORY_LOAD)
40316 return disp_load;
40317
40318 if (memory == MEMORY_BOTH)
40319 return disp_load_store;
40320
40321 return disp_no_group;
40322 }
40323
40324 /* Return true if insn is a compare instruction. */
40325
40326 static bool
40327 is_cmp (rtx insn)
40328 {
40329 enum attr_type type;
40330
40331 type = get_attr_type (insn);
40332 return (type == TYPE_TEST
40333 || type == TYPE_ICMP
40334 || type == TYPE_FCMP
40335 || GET_CODE (PATTERN (insn)) == COMPARE);
40336 }
40337
40338 /* Return true if a dispatch violation was encountered. */
40339
40340 static bool
40341 dispatch_violation (void)
40342 {
40343 if (dispatch_window_list->next)
40344 return dispatch_window_list->next->violation;
40345 return dispatch_window_list->violation;
40346 }
40347
40348 /* Return true if insn is a branch instruction. */
40349
40350 static bool
40351 is_branch (rtx insn)
40352 {
40353 return (CALL_P (insn) || JUMP_P (insn));
40354 }
40355
40356 /* Return true if insn is a prefetch instruction. */
40357
40358 static bool
40359 is_prefetch (rtx insn)
40360 {
40361 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
40362 }
40363
40364 /* This function initializes a dispatch window and the list container holding a
40365 pointer to the window. */
40366
40367 static void
40368 init_window (int window_num)
40369 {
40370 int i;
40371 dispatch_windows *new_list;
40372
40373 if (window_num == 0)
40374 new_list = dispatch_window_list;
40375 else
40376 new_list = dispatch_window_list1;
40377
40378 new_list->num_insn = 0;
40379 new_list->num_uops = 0;
40380 new_list->window_size = 0;
40381 new_list->next = NULL;
40382 new_list->prev = NULL;
40383 new_list->window_num = window_num;
40384 new_list->num_imm = 0;
40385 new_list->num_imm_32 = 0;
40386 new_list->num_imm_64 = 0;
40387 new_list->imm_size = 0;
40388 new_list->num_loads = 0;
40389 new_list->num_stores = 0;
40390 new_list->violation = false;
40391
40392 for (i = 0; i < MAX_INSN; i++)
40393 {
40394 new_list->window[i].insn = NULL;
40395 new_list->window[i].group = disp_no_group;
40396 new_list->window[i].path = no_path;
40397 new_list->window[i].byte_len = 0;
40398 new_list->window[i].imm_bytes = 0;
40399 }
40400 return;
40401 }
40402
40403 /* This function allocates and initializes a dispatch window and the
40404 list container holding a pointer to the window. */
40405
40406 static dispatch_windows *
40407 allocate_window (void)
40408 {
40409 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
40410 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
40411
40412 return new_list;
40413 }
40414
40415 /* This routine initializes the dispatch scheduling information. It
40416 initiates building dispatch scheduler tables and constructs the
40417 first dispatch window. */
40418
40419 static void
40420 init_dispatch_sched (void)
40421 {
40422 /* Allocate a dispatch list and a window. */
40423 dispatch_window_list = allocate_window ();
40424 dispatch_window_list1 = allocate_window ();
40425 init_window (0);
40426 init_window (1);
40427 }
40428
40429 /* This function returns true if a branch is detected.  The end of a basic
40430 block does not have to be a branch, but here we assume that only branches end a
40431 window. */
40432
40433 static bool
40434 is_end_basic_block (enum dispatch_group group)
40435 {
40436 return group == disp_branch;
40437 }
40438
40439 /* This function is called when the end of a window processing is reached. */
40440
40441 static void
40442 process_end_window (void)
40443 {
40444 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
40445 if (dispatch_window_list->next)
40446 {
40447 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
40448 gcc_assert (dispatch_window_list->window_size
40449 + dispatch_window_list1->window_size <= 48);
40450 init_window (1);
40451 }
40452 init_window (0);
40453 }
40454
40455 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
40456 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
40457 for 48 bytes of instructions. Note that these windows are not dispatch
40458 windows whose sizes are DISPATCH_WINDOW_SIZE. */
40459
40460 static dispatch_windows *
40461 allocate_next_window (int window_num)
40462 {
40463 if (window_num == 0)
40464 {
40465 if (dispatch_window_list->next)
40466 init_window (1);
40467 init_window (0);
40468 return dispatch_window_list;
40469 }
40470
40471 dispatch_window_list->next = dispatch_window_list1;
40472 dispatch_window_list1->prev = dispatch_window_list;
40473
40474 return dispatch_window_list1;
40475 }
40476
40477 /* Count the immediate operand found in *IN_RTX, updating IMM_VALUES. */
40478
40479 static int
40480 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
40481 {
40482 if (*in_rtx == 0)
40483 return 0;
40484
40485 switch ( GET_CODE (*in_rtx))
40486 {
40487 case CONST:
40488 case SYMBOL_REF:
40489 case CONST_INT:
40490 (imm_values->imm)++;
40491 if (x86_64_immediate_operand (*in_rtx, SImode))
40492 (imm_values->imm32)++;
40493 else
40494 (imm_values->imm64)++;
40495 break;
40496
40497 case CONST_DOUBLE:
40498 (imm_values->imm)++;
40499 (imm_values->imm64)++;
40500 break;
40501
40502 case CODE_LABEL:
40503 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
40504 {
40505 (imm_values->imm)++;
40506 (imm_values->imm32)++;
40507 }
40508 break;
40509
40510 default:
40511 break;
40512 }
40513
40514 return 0;
40515 }
40516
40517 /* Compute number of immediate operands of an instruction. */
40518
40519 static void
40520 find_constant (rtx in_rtx, imm_info *imm_values)
40521 {
40522 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
40523 (rtx_function) find_constant_1, (void *) imm_values);
40524 }
40525
40526 /* Return total size of immediate operands of an instruction along with number
40527 of corresponding immediate operands. It initializes its parameters to zero
40528 before calling FIND_CONSTANT.
40529 INSN is the input instruction. IMM is the total number of immediates.
40530 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
40531 bit immediates. */
40532
40533 static int
40534 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
40535 {
40536 imm_info imm_values = {0, 0, 0};
40537
40538 find_constant (insn, &imm_values);
40539 *imm = imm_values.imm;
40540 *imm32 = imm_values.imm32;
40541 *imm64 = imm_values.imm64;
40542 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
40543 }
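
/* For illustration: a hypothetical insn carrying one 32-bit and one 64-bit
   immediate would come back from find_constant with imm = 2, imm32 = 1 and
   imm64 = 1, so this routine would report 1*4 + 1*8 = 12 bytes of
   immediate data.  */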
40544
40545 /* This function indicates whether an instruction has an immediate
40546 operand. */
40547
40548 static bool
40549 has_immediate (rtx insn)
40550 {
40551 int num_imm_operand;
40552 int num_imm32_operand;
40553 int num_imm64_operand;
40554
40555 if (insn)
40556 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40557 &num_imm64_operand);
40558 return false;
40559 }
40560
40561 /* Return single or double path for instructions. */
40562
40563 static enum insn_path
40564 get_insn_path (rtx insn)
40565 {
40566 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
40567
40568 if ((int)path == 0)
40569 return path_single;
40570
40571 if ((int)path == 1)
40572 return path_double;
40573
40574 return path_multi;
40575 }
40576
40577 /* Return insn dispatch group. */
40578
40579 static enum dispatch_group
40580 get_insn_group (rtx insn)
40581 {
40582 enum dispatch_group group = get_mem_group (insn);
40583 if (group)
40584 return group;
40585
40586 if (is_branch (insn))
40587 return disp_branch;
40588
40589 if (is_cmp (insn))
40590 return disp_cmp;
40591
40592 if (has_immediate (insn))
40593 return disp_imm;
40594
40595 if (is_prefetch (insn))
40596 return disp_prefetch;
40597
40598 return disp_no_group;
40599 }
40600
40601 /* Count number of GROUP restricted instructions in a dispatch
40602 window WINDOW_LIST. */
40603
40604 static int
40605 count_num_restricted (rtx insn, dispatch_windows *window_list)
40606 {
40607 enum dispatch_group group = get_insn_group (insn);
40608 int imm_size;
40609 int num_imm_operand;
40610 int num_imm32_operand;
40611 int num_imm64_operand;
40612
40613 if (group == disp_no_group)
40614 return 0;
40615
40616 if (group == disp_imm)
40617 {
40618 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40619 &num_imm64_operand);
40620 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
40621 || num_imm_operand + window_list->num_imm > MAX_IMM
40622 || (num_imm32_operand > 0
40623 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
40624 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
40625 || (num_imm64_operand > 0
40626 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
40627 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
40628 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
40629 && num_imm64_operand > 0
40630 && ((window_list->num_imm_64 > 0
40631 && window_list->num_insn >= 2)
40632 || window_list->num_insn >= 3)))
40633 return BIG;
40634
40635 return 1;
40636 }
40637
40638 if ((group == disp_load_store
40639 && (window_list->num_loads >= MAX_LOAD
40640 || window_list->num_stores >= MAX_STORE))
40641 || ((group == disp_load
40642 || group == disp_prefetch)
40643 && window_list->num_loads >= MAX_LOAD)
40644 || (group == disp_store
40645 && window_list->num_stores >= MAX_STORE))
40646 return BIG;
40647
40648 return 1;
40649 }
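
/* Returning BIG rather than 1 is how a per-window limit violation is
   signalled; fits_dispatch_window below rejects an instruction whenever
   the returned count exceeds num_allowable_groups[] for its group.  */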
40650
40651 /* This function returns true if INSN satisfies the dispatch rules of the
40652 last window scheduled. */
40653
40654 static bool
40655 fits_dispatch_window (rtx insn)
40656 {
40657 dispatch_windows *window_list = dispatch_window_list;
40658 dispatch_windows *window_list_next = dispatch_window_list->next;
40659 unsigned int num_restrict;
40660 enum dispatch_group group = get_insn_group (insn);
40661 enum insn_path path = get_insn_path (insn);
40662 int sum;
40663
40664 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
40665 instructions should be given the lowest priority in the
40666 scheduling process in Haifa scheduler to make sure they will be
40667 scheduled in the same dispatch window as the reference to them. */
40668 if (group == disp_jcc || group == disp_cmp)
40669 return false;
40670
40671 /* Check nonrestricted. */
40672 if (group == disp_no_group || group == disp_branch)
40673 return true;
40674
40675 /* Get last dispatch window. */
40676 if (window_list_next)
40677 window_list = window_list_next;
40678
40679 if (window_list->window_num == 1)
40680 {
40681 sum = window_list->prev->window_size + window_list->window_size;
40682
40683 if (sum == 32
40684 || (min_insn_size (insn) + sum) >= 48)
40685 /* Window 1 is full. Go for next window. */
40686 return true;
40687 }
40688
40689 num_restrict = count_num_restricted (insn, window_list);
40690
40691 if (num_restrict > num_allowable_groups[group])
40692 return false;
40693
40694 /* See if it fits in the first window. */
40695 if (window_list->window_num == 0)
40696 {
40697 /* The first window should have only single and double path
40698 uops. */
40699 if (path == path_double
40700 && (window_list->num_uops + 2) > MAX_INSN)
40701 return false;
40702 else if (path != path_single)
40703 return false;
40704 }
40705 return true;
40706 }
40707
40708 /* Add an instruction INSN with NUM_UOPS micro-operations to the
40709 dispatch window WINDOW_LIST. */
40710
40711 static void
40712 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
40713 {
40714 int byte_len = min_insn_size (insn);
40715 int num_insn = window_list->num_insn;
40716 int imm_size;
40717 sched_insn_info *window = window_list->window;
40718 enum dispatch_group group = get_insn_group (insn);
40719 enum insn_path path = get_insn_path (insn);
40720 int num_imm_operand;
40721 int num_imm32_operand;
40722 int num_imm64_operand;
40723
40724 if (!window_list->violation && group != disp_cmp
40725 && !fits_dispatch_window (insn))
40726 window_list->violation = true;
40727
40728 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40729 &num_imm64_operand);
40730
40731 /* Initialize window with new instruction. */
40732 window[num_insn].insn = insn;
40733 window[num_insn].byte_len = byte_len;
40734 window[num_insn].group = group;
40735 window[num_insn].path = path;
40736 window[num_insn].imm_bytes = imm_size;
40737
40738 window_list->window_size += byte_len;
40739 window_list->num_insn = num_insn + 1;
40740 window_list->num_uops = window_list->num_uops + num_uops;
40741 window_list->imm_size += imm_size;
40742 window_list->num_imm += num_imm_operand;
40743 window_list->num_imm_32 += num_imm32_operand;
40744 window_list->num_imm_64 += num_imm64_operand;
40745
40746 if (group == disp_store)
40747 window_list->num_stores += 1;
40748 else if (group == disp_load
40749 || group == disp_prefetch)
40750 window_list->num_loads += 1;
40751 else if (group == disp_load_store)
40752 {
40753 window_list->num_stores += 1;
40754 window_list->num_loads += 1;
40755 }
40756 }
40757
40758 /* Adds a scheduled instruction, INSN, to the current dispatch window.
40759 If the total bytes of instructions or the number of instructions in
40760 the window would exceed the allowable limits, a new window is allocated. */
40761
40762 static void
40763 add_to_dispatch_window (rtx insn)
40764 {
40765 int byte_len;
40766 dispatch_windows *window_list;
40767 dispatch_windows *next_list;
40768 dispatch_windows *window0_list;
40769 enum insn_path path;
40770 enum dispatch_group insn_group;
40771 bool insn_fits;
40772 int num_insn;
40773 int num_uops;
40774 int window_num;
40775 int insn_num_uops;
40776 int sum;
40777
40778 if (INSN_CODE (insn) < 0)
40779 return;
40780
40781 byte_len = min_insn_size (insn);
40782 window_list = dispatch_window_list;
40783 next_list = window_list->next;
40784 path = get_insn_path (insn);
40785 insn_group = get_insn_group (insn);
40786
40787 /* Get the last dispatch window. */
40788 if (next_list)
40789 window_list = dispatch_window_list->next;
40790
40791 if (path == path_single)
40792 insn_num_uops = 1;
40793 else if (path == path_double)
40794 insn_num_uops = 2;
40795 else
40796 insn_num_uops = (int) path;
40797
40798 /* If current window is full, get a new window.
40799 Window number zero is full if MAX_INSN uops are scheduled in it.
40800 Window number one is full if window zero's bytes plus window
40801 one's bytes equal 32, if the bytes of the new instruction would
40802 bring the total to 48 or more, or if it already holds MAX_INSN
40803 instructions. */
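/* For illustration: if window zero holds 20 bytes and window one holds
   12 bytes (sum == 32), or if the incoming instruction's min_insn_size
   would push the combined size to 48 bytes or more, the window pair is
   flushed via process_end_window below.  */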
40804 num_insn = window_list->num_insn;
40805 num_uops = window_list->num_uops;
40806 window_num = window_list->window_num;
40807 insn_fits = fits_dispatch_window (insn);
40808
40809 if (num_insn >= MAX_INSN
40810 || num_uops + insn_num_uops > MAX_INSN
40811 || !(insn_fits))
40812 {
40813 window_num = ~window_num & 1;
40814 window_list = allocate_next_window (window_num);
40815 }
40816
40817 if (window_num == 0)
40818 {
40819 add_insn_window (insn, window_list, insn_num_uops);
40820 if (window_list->num_insn >= MAX_INSN
40821 && insn_group == disp_branch)
40822 {
40823 process_end_window ();
40824 return;
40825 }
40826 }
40827 else if (window_num == 1)
40828 {
40829 window0_list = window_list->prev;
40830 sum = window0_list->window_size + window_list->window_size;
40831 if (sum == 32
40832 || (byte_len + sum) >= 48)
40833 {
40834 process_end_window ();
40835 window_list = dispatch_window_list;
40836 }
40837
40838 add_insn_window (insn, window_list, insn_num_uops);
40839 }
40840 else
40841 gcc_unreachable ();
40842
40843 if (is_end_basic_block (insn_group))
40844 {
40845 /* End of basic block is reached; do end-of-basic-block processing. */
40846 process_end_window ();
40847 return;
40848 }
40849 }
40850
40851 /* Print the dispatch window, WINDOW_NUM, to FILE. */
40852
40853 DEBUG_FUNCTION static void
40854 debug_dispatch_window_file (FILE *file, int window_num)
40855 {
40856 dispatch_windows *list;
40857 int i;
40858
40859 if (window_num == 0)
40860 list = dispatch_window_list;
40861 else
40862 list = dispatch_window_list1;
40863
40864 fprintf (file, "Window #%d:\n", list->window_num);
40865 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
40866 list->num_insn, list->num_uops, list->window_size);
40867 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40868 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
40869
40870 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
40871 list->num_stores);
40872 fprintf (file, " insn info:\n");
40873
40874 for (i = 0; i < MAX_INSN; i++)
40875 {
40876 if (!list->window[i].insn)
40877 break;
40878 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
40879 i, group_name[list->window[i].group],
40880 i, (void *)list->window[i].insn,
40881 i, list->window[i].path,
40882 i, list->window[i].byte_len,
40883 i, list->window[i].imm_bytes);
40884 }
40885 }
40886
40887 /* Print to stdout a dispatch window. */
40888
40889 DEBUG_FUNCTION void
40890 debug_dispatch_window (int window_num)
40891 {
40892 debug_dispatch_window_file (stdout, window_num);
40893 }
40894
40895 /* Print INSN dispatch information to FILE. */
40896
40897 DEBUG_FUNCTION static void
40898 debug_insn_dispatch_info_file (FILE *file, rtx insn)
40899 {
40900 int byte_len;
40901 enum insn_path path;
40902 enum dispatch_group group;
40903 int imm_size;
40904 int num_imm_operand;
40905 int num_imm32_operand;
40906 int num_imm64_operand;
40907
40908 if (INSN_CODE (insn) < 0)
40909 return;
40910
40911 byte_len = min_insn_size (insn);
40912 path = get_insn_path (insn);
40913 group = get_insn_group (insn);
40914 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40915 &num_imm64_operand);
40916
40917 fprintf (file, " insn info:\n");
40918 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
40919 group_name[group], path, byte_len);
40920 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40921 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
40922 }
40923
40924 /* Print to STDOUT the status of the ready list with respect to
40925 dispatch windows. */
40926
40927 DEBUG_FUNCTION void
40928 debug_ready_dispatch (void)
40929 {
40930 int i;
40931 int no_ready = number_in_ready ();
40932
40933 fprintf (stdout, "Number of ready: %d\n", no_ready);
40934
40935 for (i = 0; i < no_ready; i++)
40936 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
40937 }
40938
40939 /* This routine is the driver of the dispatch scheduler. */
40940
40941 static void
40942 do_dispatch (rtx insn, int mode)
40943 {
40944 if (mode == DISPATCH_INIT)
40945 init_dispatch_sched ();
40946 else if (mode == ADD_TO_DISPATCH_WINDOW)
40947 add_to_dispatch_window (insn);
40948 }
40949
40950 /* Return TRUE if Dispatch Scheduling is supported. */
40951
40952 static bool
40953 has_dispatch (rtx insn, int action)
40954 {
40955 if ((TARGET_BDVER1 || TARGET_BDVER2)
40956 && flag_dispatch_scheduler)
40957 switch (action)
40958 {
40959 default:
40960 return false;
40961
40962 case IS_DISPATCH_ON:
40963 return true;
40964 break;
40965
40966 case IS_CMP:
40967 return is_cmp (insn);
40968
40969 case DISPATCH_VIOLATION:
40970 return dispatch_violation ();
40971
40972 case FITS_DISPATCH_WINDOW:
40973 return fits_dispatch_window (insn);
40974 }
40975
40976 return false;
40977 }
40978
40979 /* Implementation of reassociation_width target hook used by
40980 reassoc phase to identify parallelism level in reassociated
40981 tree. The statement's tree_code is passed in OPC. The arguments' type
40982 is passed in MODE.
40983
40984 Currently parallel reassociation is enabled for Atom
40985 processors only and we set reassociation width to be 2
40986 because Atom may issue up to 2 instructions per cycle.
40987
40988 Return value should be fixed if parallel reassociation is
40989 enabled for other processors. */
40990
40991 static int
40992 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
40993 enum machine_mode mode)
40994 {
40995 int res = 1;
40996
40997 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
40998 res = 2;
40999 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41000 res = 2;
41001
41002 return res;
41003 }
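
/* For example, a reassociation width of 2 lets the reassoc pass rewrite a
   chain such as a + b + c + d as (a + b) + (c + d), exposing two
   independent additions that Atom can issue in parallel.  */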
41004
41005 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41006 place emms and femms instructions. */
41007
41008 static enum machine_mode
41009 ix86_preferred_simd_mode (enum machine_mode mode)
41010 {
41011 if (!TARGET_SSE)
41012 return word_mode;
41013
41014 switch (mode)
41015 {
41016 case QImode:
41017 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41018 case HImode:
41019 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41020 case SImode:
41021 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41022 case DImode:
41023 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41024
41025 case SFmode:
41026 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41027 return V8SFmode;
41028 else
41029 return V4SFmode;
41030
41031 case DFmode:
41032 if (!TARGET_VECTORIZE_DOUBLE)
41033 return word_mode;
41034 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41035 return V4DFmode;
41036 else if (TARGET_SSE2)
41037 return V2DFmode;
41038 /* FALLTHRU */
41039
41040 default:
41041 return word_mode;
41042 }
41043 }
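
/* For instance, with AVX enabled and 128-bit vectors not preferred, SFmode
   data is vectorized in V8SFmode (eight floats per 256-bit vector); plain
   SSE targets fall back to V4SFmode, and DFmode vectorization is disabled
   altogether when TARGET_VECTORIZE_DOUBLE is off.  */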
41044
41045 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
41046 vectors. */
41047
41048 static unsigned int
41049 ix86_autovectorize_vector_sizes (void)
41050 {
41051 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
41052 }
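
/* The returned value is a bit mask of vector sizes in bytes: 32 | 16 asks
   the vectorizer to try 256-bit vectors and then retry with 128-bit ones,
   while 0 leaves only the preferred SIMD mode to be tried.  */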
41053
41054 \f
41055
41056 /* Return the class of registers which could be used for a pseudo of MODE
41057 and of class RCLASS for spilling instead of memory. Return NO_REGS
41058 if it is not possible or not profitable. */
41059 static reg_class_t
41060 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
41061 {
41062 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
41063 && hard_reg_set_subset_p (reg_class_contents[rclass],
41064 reg_class_contents[GENERAL_REGS])
41065 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
41066 return SSE_REGS;
41067 return NO_REGS;
41068 }
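
/* For example, when TARGET_GENERAL_REGS_SSE_SPILL holds on a 64-bit SSE
   target without MMX, an SImode or DImode pseudo of a general-register
   class may be spilled to an SSE register instead of to memory.  */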
41069
41070 /* Implement targetm.vectorize.init_cost. */
41071
41072 static void *
41073 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
41074 {
41075 unsigned *cost = XNEWVEC (unsigned, 3);
41076 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
41077 return cost;
41078 }
41079
41080 /* Implement targetm.vectorize.add_stmt_cost. */
41081
41082 static unsigned
41083 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
41084 struct _stmt_vec_info *stmt_info, int misalign,
41085 enum vect_cost_model_location where)
41086 {
41087 unsigned *cost = (unsigned *) data;
41088 unsigned retval = 0;
41089
41090 if (flag_vect_cost_model)
41091 {
41092 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
41093 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
41094
41095 /* Statements in an inner loop relative to the loop being
41096 vectorized are weighted more heavily. The value here is
41097 arbitrary and could potentially be improved with analysis. */
41098 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
41099 count *= 50; /* FIXME. */
41100
41101 retval = (unsigned) (count * stmt_cost);
41102 cost[where] += retval;
41103 }
41104
41105 return retval;
41106 }
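
/* For illustration: a statement whose per-copy cost from
   ix86_builtin_vectorization_cost is 3 and which occurs with COUNT == 2
   adds 6 to the selected cost slot; statements inside an inner loop are
   additionally weighted by the factor of 50 above.  */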
41107
41108 /* Implement targetm.vectorize.finish_cost. */
41109
41110 static void
41111 ix86_finish_cost (void *data, unsigned *prologue_cost,
41112 unsigned *body_cost, unsigned *epilogue_cost)
41113 {
41114 unsigned *cost = (unsigned *) data;
41115 *prologue_cost = cost[vect_prologue];
41116 *body_cost = cost[vect_body];
41117 *epilogue_cost = cost[vect_epilogue];
41118 }
41119
41120 /* Implement targetm.vectorize.destroy_cost_data. */
41121
41122 static void
41123 ix86_destroy_cost_data (void *data)
41124 {
41125 free (data);
41126 }
41127
41128 /* Validate target specific memory model bits in VAL. */
41129
41130 static unsigned HOST_WIDE_INT
41131 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
41132 {
41133 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
41134 unsigned HOST_WIDE_INT strong;
41135
41136 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
41137 |MEMMODEL_MASK)
41138 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
41139 {
41140 warning (OPT_Winvalid_memory_model,
41141 "Unknown architecture specific memory model");
41142 return MEMMODEL_SEQ_CST;
41143 }
41144 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
41145 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
41146 {
41147 warning (OPT_Winvalid_memory_model,
41148 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
41149 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
41150 }
41151 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
41152 {
41153 warning (OPT_Winvalid_memory_model,
41154 "HLE_RELEASE not used with RELEASE or stronger memory model");
41155 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
41156 }
41157 return val;
41158 }
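
/* For example, MEMMODEL_RELEASE | IX86_HLE_RELEASE passes these checks
   unchanged, whereas IX86_HLE_RELEASE combined with MEMMODEL_RELAXED
   triggers the warning and is forced to
   MEMMODEL_SEQ_CST | IX86_HLE_RELEASE.  */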
41159
41160 /* Initialize the GCC target structure. */
41161 #undef TARGET_RETURN_IN_MEMORY
41162 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
41163
41164 #undef TARGET_LEGITIMIZE_ADDRESS
41165 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
41166
41167 #undef TARGET_ATTRIBUTE_TABLE
41168 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
41169 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41170 # undef TARGET_MERGE_DECL_ATTRIBUTES
41171 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
41172 #endif
41173
41174 #undef TARGET_COMP_TYPE_ATTRIBUTES
41175 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
41176
41177 #undef TARGET_INIT_BUILTINS
41178 #define TARGET_INIT_BUILTINS ix86_init_builtins
41179 #undef TARGET_BUILTIN_DECL
41180 #define TARGET_BUILTIN_DECL ix86_builtin_decl
41181 #undef TARGET_EXPAND_BUILTIN
41182 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
41183
41184 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
41185 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
41186 ix86_builtin_vectorized_function
41187
41188 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
41189 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
41190
41191 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
41192 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
41193
41194 #undef TARGET_VECTORIZE_BUILTIN_GATHER
41195 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
41196
41197 #undef TARGET_BUILTIN_RECIPROCAL
41198 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
41199
41200 #undef TARGET_ASM_FUNCTION_EPILOGUE
41201 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
41202
41203 #undef TARGET_ENCODE_SECTION_INFO
41204 #ifndef SUBTARGET_ENCODE_SECTION_INFO
41205 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
41206 #else
41207 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
41208 #endif
41209
41210 #undef TARGET_ASM_OPEN_PAREN
41211 #define TARGET_ASM_OPEN_PAREN ""
41212 #undef TARGET_ASM_CLOSE_PAREN
41213 #define TARGET_ASM_CLOSE_PAREN ""
41214
41215 #undef TARGET_ASM_BYTE_OP
41216 #define TARGET_ASM_BYTE_OP ASM_BYTE
41217
41218 #undef TARGET_ASM_ALIGNED_HI_OP
41219 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
41220 #undef TARGET_ASM_ALIGNED_SI_OP
41221 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
41222 #ifdef ASM_QUAD
41223 #undef TARGET_ASM_ALIGNED_DI_OP
41224 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
41225 #endif
41226
41227 #undef TARGET_PROFILE_BEFORE_PROLOGUE
41228 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
41229
41230 #undef TARGET_ASM_UNALIGNED_HI_OP
41231 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
41232 #undef TARGET_ASM_UNALIGNED_SI_OP
41233 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
41234 #undef TARGET_ASM_UNALIGNED_DI_OP
41235 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
41236
41237 #undef TARGET_PRINT_OPERAND
41238 #define TARGET_PRINT_OPERAND ix86_print_operand
41239 #undef TARGET_PRINT_OPERAND_ADDRESS
41240 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
41241 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
41242 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
41243 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
41244 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
41245
41246 #undef TARGET_SCHED_INIT_GLOBAL
41247 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
41248 #undef TARGET_SCHED_ADJUST_COST
41249 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
41250 #undef TARGET_SCHED_ISSUE_RATE
41251 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
41252 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
41253 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
41254 ia32_multipass_dfa_lookahead
41255
41256 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
41257 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
41258
41259 #undef TARGET_MEMMODEL_CHECK
41260 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
41261
41262 #ifdef HAVE_AS_TLS
41263 #undef TARGET_HAVE_TLS
41264 #define TARGET_HAVE_TLS true
41265 #endif
41266 #undef TARGET_CANNOT_FORCE_CONST_MEM
41267 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
41268 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
41269 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
41270
41271 #undef TARGET_DELEGITIMIZE_ADDRESS
41272 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
41273
41274 #undef TARGET_MS_BITFIELD_LAYOUT_P
41275 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
41276
41277 #if TARGET_MACHO
41278 #undef TARGET_BINDS_LOCAL_P
41279 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
41280 #endif
41281 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41282 #undef TARGET_BINDS_LOCAL_P
41283 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
41284 #endif
41285
41286 #undef TARGET_ASM_OUTPUT_MI_THUNK
41287 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
41288 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
41289 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
41290
41291 #undef TARGET_ASM_FILE_START
41292 #define TARGET_ASM_FILE_START x86_file_start
41293
41294 #undef TARGET_OPTION_OVERRIDE
41295 #define TARGET_OPTION_OVERRIDE ix86_option_override
41296
41297 #undef TARGET_REGISTER_MOVE_COST
41298 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
41299 #undef TARGET_MEMORY_MOVE_COST
41300 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
41301 #undef TARGET_RTX_COSTS
41302 #define TARGET_RTX_COSTS ix86_rtx_costs
41303 #undef TARGET_ADDRESS_COST
41304 #define TARGET_ADDRESS_COST ix86_address_cost
41305
41306 #undef TARGET_FIXED_CONDITION_CODE_REGS
41307 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
41308 #undef TARGET_CC_MODES_COMPATIBLE
41309 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
41310
41311 #undef TARGET_MACHINE_DEPENDENT_REORG
41312 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
41313
41314 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
41315 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
41316
41317 #undef TARGET_BUILD_BUILTIN_VA_LIST
41318 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
41319
41320 #undef TARGET_FOLD_BUILTIN
41321 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
41322
41323 #undef TARGET_ENUM_VA_LIST_P
41324 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
41325
41326 #undef TARGET_FN_ABI_VA_LIST
41327 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
41328
41329 #undef TARGET_CANONICAL_VA_LIST_TYPE
41330 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
41331
41332 #undef TARGET_EXPAND_BUILTIN_VA_START
41333 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
41334
41335 #undef TARGET_MD_ASM_CLOBBERS
41336 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
41337
41338 #undef TARGET_PROMOTE_PROTOTYPES
41339 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
41340 #undef TARGET_STRUCT_VALUE_RTX
41341 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
41342 #undef TARGET_SETUP_INCOMING_VARARGS
41343 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
41344 #undef TARGET_MUST_PASS_IN_STACK
41345 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
41346 #undef TARGET_FUNCTION_ARG_ADVANCE
41347 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
41348 #undef TARGET_FUNCTION_ARG
41349 #define TARGET_FUNCTION_ARG ix86_function_arg
41350 #undef TARGET_FUNCTION_ARG_BOUNDARY
41351 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
41352 #undef TARGET_PASS_BY_REFERENCE
41353 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
41354 #undef TARGET_INTERNAL_ARG_POINTER
41355 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
41356 #undef TARGET_UPDATE_STACK_BOUNDARY
41357 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
41358 #undef TARGET_GET_DRAP_RTX
41359 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
41360 #undef TARGET_STRICT_ARGUMENT_NAMING
41361 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
41362 #undef TARGET_STATIC_CHAIN
41363 #define TARGET_STATIC_CHAIN ix86_static_chain
41364 #undef TARGET_TRAMPOLINE_INIT
41365 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
41366 #undef TARGET_RETURN_POPS_ARGS
41367 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
41368
41369 #undef TARGET_LEGITIMATE_COMBINED_INSN
41370 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
41371
41372 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
41373 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
41374
41375 #undef TARGET_SCALAR_MODE_SUPPORTED_P
41376 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
41377
41378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
41379 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
41380
41381 #undef TARGET_C_MODE_FOR_SUFFIX
41382 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
41383
41384 #ifdef HAVE_AS_TLS
41385 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
41386 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
41387 #endif
41388
41389 #ifdef SUBTARGET_INSERT_ATTRIBUTES
41390 #undef TARGET_INSERT_ATTRIBUTES
41391 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
41392 #endif
41393
41394 #undef TARGET_MANGLE_TYPE
41395 #define TARGET_MANGLE_TYPE ix86_mangle_type
41396
41397 #if !TARGET_MACHO
41398 #undef TARGET_STACK_PROTECT_FAIL
41399 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
41400 #endif
41401
41402 #undef TARGET_FUNCTION_VALUE
41403 #define TARGET_FUNCTION_VALUE ix86_function_value
41404
41405 #undef TARGET_FUNCTION_VALUE_REGNO_P
41406 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
41407
41408 #undef TARGET_PROMOTE_FUNCTION_MODE
41409 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
41410
41411 #undef TARGET_MEMBER_TYPE_FORCES_BLK
41412 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
41413
41414 #undef TARGET_SECONDARY_RELOAD
41415 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
41416
41417 #undef TARGET_CLASS_MAX_NREGS
41418 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
41419
41420 #undef TARGET_PREFERRED_RELOAD_CLASS
41421 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
41422 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
41423 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
41424 #undef TARGET_CLASS_LIKELY_SPILLED_P
41425 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
41426
41427 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
41428 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
41429 ix86_builtin_vectorization_cost
41430 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
41431 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
41432 ix86_vectorize_vec_perm_const_ok
41433 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
41434 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
41435 ix86_preferred_simd_mode
41436 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
41437 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
41438 ix86_autovectorize_vector_sizes
41439 #undef TARGET_VECTORIZE_INIT_COST
41440 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
41441 #undef TARGET_VECTORIZE_ADD_STMT_COST
41442 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
41443 #undef TARGET_VECTORIZE_FINISH_COST
41444 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
41445 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
41446 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
41447
41448 #undef TARGET_SET_CURRENT_FUNCTION
41449 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
41450
41451 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
41452 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
41453
41454 #undef TARGET_OPTION_SAVE
41455 #define TARGET_OPTION_SAVE ix86_function_specific_save
41456
41457 #undef TARGET_OPTION_RESTORE
41458 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
41459
41460 #undef TARGET_OPTION_PRINT
41461 #define TARGET_OPTION_PRINT ix86_function_specific_print
41462
41463 #undef TARGET_CAN_INLINE_P
41464 #define TARGET_CAN_INLINE_P ix86_can_inline_p
41465
41466 #undef TARGET_EXPAND_TO_RTL_HOOK
41467 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
41468
41469 #undef TARGET_LEGITIMATE_ADDRESS_P
41470 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
41471
41472 #undef TARGET_LRA_P
41473 #define TARGET_LRA_P ix86_lra_p
41474
41475 #undef TARGET_REGISTER_PRIORITY
41476 #define TARGET_REGISTER_PRIORITY ix86_register_priority
41477
41478 #undef TARGET_LEGITIMATE_CONSTANT_P
41479 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
41480
41481 #undef TARGET_FRAME_POINTER_REQUIRED
41482 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
41483
41484 #undef TARGET_CAN_ELIMINATE
41485 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
41486
41487 #undef TARGET_EXTRA_LIVE_ON_ENTRY
41488 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
41489
41490 #undef TARGET_ASM_CODE_END
41491 #define TARGET_ASM_CODE_END ix86_code_end
41492
41493 #undef TARGET_CONDITIONAL_REGISTER_USAGE
41494 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
41495
41496 #if TARGET_MACHO
41497 #undef TARGET_INIT_LIBFUNCS
41498 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
41499 #endif
41500
41501 #undef TARGET_SPILL_CLASS
41502 #define TARGET_SPILL_CLASS ix86_spill_class
41503
41504 struct gcc_target targetm = TARGET_INITIALIZER;
41505 \f
41506 #include "gt-i386.h"