Check ptr_mode and use Pmode in ix86_trampoline_init
gcc.git: gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
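/* Tri-state lattice used by the vzeroupper optimization below: the
   upper 128 bits of the AVX registers at a given point are either in
   an UNKNOWN state, known to be UNUSED (zeroed), or known to be USED
   (potentially live).  */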
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
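/* The per-block state is kept in each basic block's AUX field; it is
   allocated with alloc_aux_for_blocks and released with
   free_aux_for_blocks in move_or_delete_vzeroupper below.  */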
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
96   /* Callee neither returns nor passes a 256bit AVX register, or there
97      is no 256bit AVX register in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
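/* One of these values is emitted as the operand of the
   UNSPECV_VZEROUPPER pattern and read back below via
   INTVAL (XVECEXP (pat, 0, 0)), telling the optimization how the
   surrounding call uses 256bit AVX registers.  */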
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
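/* check_avx256_stores has the (dest, set, data) signature expected by
   note_stores; DATA points at the caller's upper_128bits_state, which
   is flipped to USED whenever a store involves a 256bit AVX register.  */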
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122    unused.  If it isn't deleted, move it to just before a jump or call insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
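/* The merge over predecessor blocks below is a simple meet: any
   predecessor in the USED state forces USED, a predecessor still
   UNKNOWN (unless UNKNOWN_IS_UNUSED) makes the result UNKNOWN, and
   only if all predecessors are UNUSED does the block start out
   UNUSED.  */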
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358    it if upper 128bit AVX registers are unused.  If it isn't deleted,
359    move it to just before a jump or call insn.  */
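/* This is a forward dataflow problem solved with a two-level worklist:
   the successors of the entry block are seeded from
   cfun->machine->caller_pass_avx256_p, the remaining blocks are visited
   in reverse completion order, and whole rounds are repeated while
   rescan_vzeroupper_p is set because some block's exit state became
   USED.  Blocks whose state is still UNKNOWN after the iteration
   settles are handled by a final pass that treats UNKNOWN as UNUSED.  */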
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
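/* Illustrative use (field names as declared for struct processor_costs
   in i386.h): a multiply-start cost lookup is written roughly as
   ix86_cost->mult_init[MODE_INDEX (mode)], with anything wider than
   DImode falling into the catch-all slot at index 4.  */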
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
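/* Under that assumption the two scales agree: COSTS_N_BYTES (2) == 4
   == COSTS_N_INSNS (1), i.e. a 2-byte addition costs the same as one
   instruction, so the size-tuned table below can reuse the same cost
   fields as the speed-tuned ones.  */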
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
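/* Each stringop_algs initializer below pairs an algorithm for unknown
   block sizes with a list of {maximum size, algorithm} entries, where a
   maximum of -1 means "any larger size" (this reading assumes the
   struct stringop_algs layout declared in i386.h).  */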
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
848      (we ensure the alignment).  For small blocks the inline loop is still a
849      noticeable win; for bigger blocks either rep movsl or rep movsb is the
850      way to go.  Rep movsb apparently has a more expensive startup time in the
851      CPU, but after 4K the difference is down in the noise.  */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068   /* For some reason, Athlon deals better with the REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
1148 do nontemporary accesses and beat inline considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
1235 do nontemporary accesses and beat inline considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
1322 can do nontemporary accesses and beat inline considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1408 very small blocks it is better to use loop. For large blocks, libcall
1409 can do nontemporary accesses and beat inline considerably. */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1490 very small blocks it is better to use loop. For large blocks, libcall can
1491 do nontemporary accesses and beat inline considerably. */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing a regression on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be a common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints, and they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro based chips and is in conflict with the partial reg
1959 dependencies used by Athlon/P4 based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips that treat 128bit
2039 SSE registers as single units versus K8 based chips that split SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving the
2060 upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
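/* Illustrative sketch (editorial note, not part of GCC): each entry above is
   a bitmask over processors, and the per-feature boolean table
   ix86_tune_features[] is derived from it by testing the bit for the selected
   -mtune CPU, roughly as ix86_option_override_internal does later in this
   file:

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   The same scheme is used for initial_ix86_arch_features[] and
   ix86_arch_features[] below, keyed off ix86_arch instead of ix86_tune.  */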
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* If the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
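/* Illustrative example (editorial note, not part of GCC): reading the SVR4
   map above, the DWARF number for %ebp (gcc regno 6) is
   svr4_dbx_register_map[6] == 5, matching the "5 for %ebp" line in the
   comment, while dbx_register_map[6] == 4 under the plain DBX numbering.
   i386.h is assumed to select between these maps with a macro along the
   lines of

     #define DBX_REGISTER_NUMBER(n) \
       (TARGET_64BIT ? dbx64_register_map[n] : dbx_register_map[n])

   (the exact macro may differ; this is only a sketch).  */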
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
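/* Illustrative example (editorial note, not part of GCC): for a SysV x86-64
   call such as

     extern long f (long a, long b, long c);
     long r = f (1, 2, 3);    (a in %rdi, b in %rsi, c in %rdx; r in %rax)

   the integer arguments use the leading entries of
   x86_64_int_parameter_registers, whereas the MS ABI would use %rcx, %rdx
   and %r8 from x86_64_ms_abi_int_parameter_registers.  */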
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, with the exception
2474 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2475 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2476
2477 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half does contain padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
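/* Illustrative example (editorial note, not part of GCC): classifying

     struct s { double d; int i; };

   under the scheme above yields two 64bit parts, roughly
   { X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS }: the double is passed in an
   SSE register using a DFmode move, and the int in a general register using
   a cheaper SImode move, since the upper half of its eightbyte is padding.
   The real work is done by classify_argument () later in this file.  */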
2493
2494 #define MAX_CLASSES 4
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
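/* Illustrative sketch (editorial note, not part of GCC): the alignment
   columns above only take effect when the user did not override them,
   roughly as done later in ix86_option_override_internal:

     if (align_loops == 0)
       {
         align_loops = processor_target_table[ix86_tune].align_loop;
         align_loops_max_skip
           = processor_target_table[ix86_tune].align_loop_max_skip;
       }

   with the same pattern for align_jumps and align_functions.  */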
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 preceding options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue, otherwise return false.
2860 Note: for x86 with "hotfix" this needs extra care. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
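/* Illustrative example (editorial note, not part of GCC): for a compilation
   with -m64 -march=corei7 -mfpmath=sse, the string printed above would look
   roughly like

     -march=corei7 -mtune=corei7 -m64 -msse4.2 -msse4.1 -mssse3 -msse3 \
     -msse2 -msse -mmmx -mpopcnt -mfpmath=sse

   The exact set and order depend on the isa_opts and flag_opts tables and on
   which ISA bits the -march entry enables; this is only a sketch.  */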
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* if this reaches 64, need to widen struct pta flags below */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX | PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
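/* Illustrative sketch (editorial note, not part of GCC): when -march= selects
   an entry of processor_alias_table, its PTA_* flags are translated into ISA
   options the user has not explicitly overridden, roughly:

     if (processor_alias_table[i].flags & PTA_SSE2
         && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
       ix86_isa_flags |= OPTION_MASK_ISA_SSE2;

   repeated for each PTA_* bit; the actual sequence appears further down in
   ix86_option_override_internal.  */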
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
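/* Illustrative sketch (editorial note, not part of GCC): -mrecip=opt,opt,...
   is handled by walking the comma-separated list, where a leading '!' clears
   the corresponding bits instead of setting them, roughly:

     bool invert = (*q == '!');
     if (invert)
       q++;
     if (!strcmp (q, recip_options[i].string))
       {
         if (invert)
           recip_mask &= ~recip_options[i].mask;
         else
           recip_mask |= recip_options[i].mask;
       }

   Variable names here are assumptions; the real parsing loop is further down
   in ix86_option_override_internal.  */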
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
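  /* x32 is an ILP32 environment that still targets the 64-bit instruction
     set, so make sure the 64-bit ISA flag is set whenever -mx32 is in
     effect. */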
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* On Darwin (Mach-O), -fPIC is the default for 64-bit code. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
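  /* Intel assembler syntax is not supported with the Mach-O back end;
     report an error and fall back to AT&T syntax. */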
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
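  /* Look up -march in the processor alias table and enable every ISA the
     selected CPU implies, unless the user explicitly enabled or disabled
     that ISA on the command line. */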
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
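  /* Look up -mtune the same way; only the scheduling model and tuning
     target are taken from this entry, not ISA flags. */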
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3453 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3454 if (flag_asynchronous_unwind_tables == 2)
3455 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3456 if (flag_pcc_struct_return == 2)
3457 flag_pcc_struct_return = 0;
3458 }
3459 else
3460 {
3461 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3462 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3463 if (flag_asynchronous_unwind_tables == 2)
3464 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3465 if (flag_pcc_struct_return == 2)
3466 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3467 }
3468
3469 if (optimize_size)
3470 ix86_cost = &ix86_size_cost;
3471 else
3472 ix86_cost = processor_target_table[ix86_tune].cost;
3473
3474 /* Arrange to set up i386_stack_locals for all functions. */
3475 init_machine_status = ix86_init_machine_status;
3476
3477 /* Validate -mregparm= value. */
3478 if (global_options_set.x_ix86_regparm)
3479 {
3480 if (TARGET_64BIT)
3481 warning (0, "-mregparm is ignored in 64-bit mode");
3482 if (ix86_regparm > REGPARM_MAX)
3483 {
3484 error ("-mregparm=%d is not between 0 and %d",
3485 ix86_regparm, REGPARM_MAX);
3486 ix86_regparm = 0;
3487 }
3488 }
3489 if (TARGET_64BIT)
3490 ix86_regparm = REGPARM_MAX;
3491
3492 /* Default align_* from the processor table. */
3493 if (align_loops == 0)
3494 {
3495 align_loops = processor_target_table[ix86_tune].align_loop;
3496 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3497 }
3498 if (align_jumps == 0)
3499 {
3500 align_jumps = processor_target_table[ix86_tune].align_jump;
3501 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3502 }
3503 if (align_functions == 0)
3504 {
3505 align_functions = processor_target_table[ix86_tune].align_func;
3506 }
3507
3508 /* Provide default for -mbranch-cost= value. */
3509 if (!global_options_set.x_ix86_branch_cost)
3510 ix86_branch_cost = ix86_cost->branch_cost;
3511
3512 if (TARGET_64BIT)
3513 {
3514 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3515
3516 /* Enable by default the SSE and MMX builtins. Do allow the user to
3517 explicitly disable any of these. In particular, disabling SSE and
3518 MMX for kernel code is extremely useful. */
3519 if (!ix86_arch_specified)
3520 ix86_isa_flags
3521 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3522 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3523
3524 if (TARGET_RTD)
3525 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3526 }
3527 else
3528 {
3529 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3530
3531 if (!ix86_arch_specified)
3532 ix86_isa_flags
3533 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3534
3535 /* The i386 ABI does not specify a red zone. It still makes sense to use
3536 one when the programmer takes care to keep the stack from being destroyed. */
3537 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3538 target_flags |= MASK_NO_RED_ZONE;
3539 }
3540
3541 /* Keep nonleaf frame pointers. */
3542 if (flag_omit_frame_pointer)
3543 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3544 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3545 flag_omit_frame_pointer = 1;
3546
3547 /* If we're doing fast math, we don't care about comparison order
3548 wrt NaNs. This lets us use a shorter comparison sequence. */
3549 if (flag_finite_math_only)
3550 target_flags &= ~MASK_IEEE_FP;
3551
3552 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3553 since the insns won't need emulation. */
3554 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3555 target_flags &= ~MASK_NO_FANCY_MATH_387;
3556
3557 /* Likewise, if the target doesn't have a 387, or we've specified
3558 software floating point, don't use 387 inline intrinsics. */
3559 if (!TARGET_80387)
3560 target_flags |= MASK_NO_FANCY_MATH_387;
3561
3562 /* Turn on MMX builtins for -msse. */
3563 if (TARGET_SSE)
3564 {
3565 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3566 x86_prefetch_sse = true;
3567 }
3568
3569 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3570 if (TARGET_SSE4_2 || TARGET_ABM)
3571 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3572
3573 /* Turn on lzcnt instruction for -mabm. */
3574 if (TARGET_ABM)
3575 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3576
3577 /* Validate -mpreferred-stack-boundary= value or default it to
3578 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3579 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3580 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3581 {
3582 int min = (TARGET_64BIT ? 4 : 2);
3583 int max = (TARGET_SEH ? 4 : 12);
3584
3585 if (ix86_preferred_stack_boundary_arg < min
3586 || ix86_preferred_stack_boundary_arg > max)
3587 {
3588 if (min == max)
3589 error ("-mpreferred-stack-boundary is not supported "
3590 "for this target");
3591 else
3592 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3593 ix86_preferred_stack_boundary_arg, min, max);
3594 }
3595 else
3596 ix86_preferred_stack_boundary
3597 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3598 }
3599
3600 /* Set the default value for -mstackrealign. */
3601 if (ix86_force_align_arg_pointer == -1)
3602 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3603
3604 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3605
3606 /* Validate -mincoming-stack-boundary= value or default it to
3607 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3608 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3609 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3610 {
3611 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3612 || ix86_incoming_stack_boundary_arg > 12)
3613 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3614 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3615 else
3616 {
3617 ix86_user_incoming_stack_boundary
3618 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3619 ix86_incoming_stack_boundary
3620 = ix86_user_incoming_stack_boundary;
3621 }
3622 }
3623
3624 /* Accept -msseregparm only if at least SSE support is enabled. */
3625 if (TARGET_SSEREGPARM
3626 && ! TARGET_SSE)
3627 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3628
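  /* Validate the -mfpmath= selection: fall back to 387 math if SSE math
     was requested without SSE, and to SSE math if 387 math was also
     requested but the 80387 is disabled; if -mfpmath= was not given at
     all, use the target's default. */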
3629 if (global_options_set.x_ix86_fpmath)
3630 {
3631 if (ix86_fpmath & FPMATH_SSE)
3632 {
3633 if (!TARGET_SSE)
3634 {
3635 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3636 ix86_fpmath = FPMATH_387;
3637 }
3638 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3639 {
3640 warning (0, "387 instruction set disabled, using SSE arithmetics");
3641 ix86_fpmath = FPMATH_SSE;
3642 }
3643 }
3644 }
3645 else
3646 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3647
3648 /* If the i387 is disabled, then do not return values in it. */
3649 if (!TARGET_80387)
3650 target_flags &= ~MASK_FLOAT_RETURNS;
3651
3652 /* Use an external vectorized library when vectorizing intrinsics. */
3653 if (global_options_set.x_ix86_veclibabi_type)
3654 switch (ix86_veclibabi_type)
3655 {
3656 case ix86_veclibabi_type_svml:
3657 ix86_veclib_handler = ix86_veclibabi_svml;
3658 break;
3659
3660 case ix86_veclibabi_type_acml:
3661 ix86_veclib_handler = ix86_veclibabi_acml;
3662 break;
3663
3664 default:
3665 gcc_unreachable ();
3666 }
3667
3668 if ((!USE_IX86_FRAME_POINTER
3669 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3670 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3671 && !optimize_size)
3672 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3673
3674 /* ??? Unwind info is not correct around the CFG unless either a frame
3675 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3676 unwind info generation to be aware of the CFG and propagating states
3677 around edges. */
3678 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3679 || flag_exceptions || flag_non_call_exceptions)
3680 && flag_omit_frame_pointer
3681 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3682 {
3683 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3684 warning (0, "unwind tables currently require either a frame pointer "
3685 "or %saccumulate-outgoing-args%s for correctness",
3686 prefix, suffix);
3687 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3688 }
3689
3690 /* If stack probes are required, the space used for large function
3691 arguments on the stack must also be probed, so enable
3692 -maccumulate-outgoing-args so this happens in the prologue. */
3693 if (TARGET_STACK_PROBE
3694 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3695 {
3696 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3697 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3698 "for correctness", prefix, suffix);
3699 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3700 }
3701
3702 /* For sane SSE instruction set generation we need fcomi instruction.
3703 It is safe to enable all CMOVE instructions. Also, RDRAND intrinsic
3704 expands to a sequence that includes conditional move. */
3705 if (TARGET_SSE || TARGET_RDRND)
3706 TARGET_CMOVE = 1;
3707
3708 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3709 {
3710 char *p;
3711 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3712 p = strchr (internal_label_prefix, 'X');
3713 internal_label_prefix_len = p - internal_label_prefix;
3714 *p = '\0';
3715 }
3716
3717 /* When a scheduling description is not available, disable the scheduler pass
3718 so it won't slow down compilation and make x87 code slower. */
3719 if (!TARGET_SCHEDULE)
3720 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3721
3722 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3723 ix86_cost->simultaneous_prefetches,
3724 global_options.x_param_values,
3725 global_options_set.x_param_values);
3726 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3727 global_options.x_param_values,
3728 global_options_set.x_param_values);
3729 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3730 global_options.x_param_values,
3731 global_options_set.x_param_values);
3732 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3733 global_options.x_param_values,
3734 global_options_set.x_param_values);
3735
3736 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3737 if (flag_prefetch_loop_arrays < 0
3738 && HAVE_prefetch
3739 && optimize >= 3
3740 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3741 flag_prefetch_loop_arrays = 1;
3742
3743 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3744 can be optimized to ap = __builtin_next_arg (0). */
3745 if (!TARGET_64BIT && !flag_split_stack)
3746 targetm.expand_builtin_va_start = NULL;
3747
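  /* Select the DImode or SImode variants of the RTL generator functions
     used by the prologue/epilogue and builtin expanders, depending on the
     target word size. */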
3748 if (TARGET_64BIT)
3749 {
3750 ix86_gen_leave = gen_leave_rex64;
3751 ix86_gen_add3 = gen_adddi3;
3752 ix86_gen_sub3 = gen_subdi3;
3753 ix86_gen_sub3_carry = gen_subdi3_carry;
3754 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3755 ix86_gen_monitor = gen_sse3_monitor64;
3756 ix86_gen_andsp = gen_anddi3;
3757 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3758 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3759 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3760 }
3761 else
3762 {
3763 ix86_gen_leave = gen_leave;
3764 ix86_gen_add3 = gen_addsi3;
3765 ix86_gen_sub3 = gen_subsi3;
3766 ix86_gen_sub3_carry = gen_subsi3_carry;
3767 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3768 ix86_gen_monitor = gen_sse3_monitor;
3769 ix86_gen_andsp = gen_andsi3;
3770 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3771 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3772 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3773 }
3774
3775 #ifdef USE_IX86_CLD
3776 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3777 if (!TARGET_64BIT)
3778 target_flags |= MASK_CLD & ~target_flags_explicit;
3779 #endif
3780
3781 if (!TARGET_64BIT && flag_pic)
3782 {
3783 if (flag_fentry > 0)
3784 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3785 "with -fpic");
3786 flag_fentry = 0;
3787 }
3788 else if (TARGET_SEH)
3789 {
3790 if (flag_fentry == 0)
3791 sorry ("-mno-fentry isn%'t compatible with SEH");
3792 flag_fentry = 1;
3793 }
3794 else if (flag_fentry < 0)
3795 {
3796 #if defined(PROFILE_BEFORE_PROLOGUE)
3797 flag_fentry = 1;
3798 #else
3799 flag_fentry = 0;
3800 #endif
3801 }
3802
3803 if (TARGET_AVX)
3804 {
3805 /* When not optimizing for size, enable the vzeroupper optimization for
3806 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3807 AVX unaligned loads/stores. */
3808 if (!optimize_size)
3809 {
3810 if (flag_expensive_optimizations
3811 && !(target_flags_explicit & MASK_VZEROUPPER))
3812 target_flags |= MASK_VZEROUPPER;
3813 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3814 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3815 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3816 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3817 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3818 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3819 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3820 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3821 target_flags |= MASK_PREFER_AVX128;
3822 }
3823 }
3824 else
3825 {
3826 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3827 target_flags &= ~MASK_VZEROUPPER;
3828 }
3829
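  /* Parse the comma-separated -mrecip= list. Each token selects a set of
     RECIP_MASK bits; a leading '!' clears those bits instead of setting
     them. */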
3830 if (ix86_recip_name)
3831 {
3832 char *p = ASTRDUP (ix86_recip_name);
3833 char *q;
3834 unsigned int mask, i;
3835 bool invert;
3836
3837 while ((q = strtok (p, ",")) != NULL)
3838 {
3839 p = NULL;
3840 if (*q == '!')
3841 {
3842 invert = true;
3843 q++;
3844 }
3845 else
3846 invert = false;
3847
3848 if (!strcmp (q, "default"))
3849 mask = RECIP_MASK_ALL;
3850 else
3851 {
3852 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3853 if (!strcmp (q, recip_options[i].string))
3854 {
3855 mask = recip_options[i].mask;
3856 break;
3857 }
3858
3859 if (i == ARRAY_SIZE (recip_options))
3860 {
3861 error ("unknown option for -mrecip=%s", q);
3862 invert = false;
3863 mask = RECIP_MASK_NONE;
3864 }
3865 }
3866
3867 recip_mask_explicit |= mask;
3868 if (invert)
3869 recip_mask &= ~mask;
3870 else
3871 recip_mask |= mask;
3872 }
3873 }
3874
3875 if (TARGET_RECIP)
3876 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3877 else if (target_flags_explicit & MASK_RECIP)
3878 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3879
3880 /* Save the initial options in case the user does function specific
3881 options. */
3882 if (main_args_p)
3883 target_option_default_node = target_option_current_node
3884 = build_target_option_node ();
3885 }
3886
3887 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
3888
3889 static bool
3890 function_pass_avx256_p (const_rtx val)
3891 {
3892 if (!val)
3893 return false;
3894
3895 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3896 return true;
3897
3898 if (GET_CODE (val) == PARALLEL)
3899 {
3900 int i;
3901 rtx r;
3902
3903 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3904 {
3905 r = XVECEXP (val, 0, i);
3906 if (GET_CODE (r) == EXPR_LIST
3907 && XEXP (r, 0)
3908 && REG_P (XEXP (r, 0))
3909 && (GET_MODE (XEXP (r, 0)) == OImode
3910 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3911 return true;
3912 }
3913 }
3914
3915 return false;
3916 }
3917
3918 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3919
3920 static void
3921 ix86_option_override (void)
3922 {
3923 ix86_option_override_internal (true);
3924 }
3925
3926 /* Update register usage after having seen the compiler flags. */
3927
3928 static void
3929 ix86_conditional_register_usage (void)
3930 {
3931 int i;
3932 unsigned int j;
3933
3934 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3935 {
3936 if (fixed_regs[i] > 1)
3937 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3938 if (call_used_regs[i] > 1)
3939 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 }
3941
3942 /* The PIC register, if it exists, is fixed. */
3943 j = PIC_OFFSET_TABLE_REGNUM;
3944 if (j != INVALID_REGNUM)
3945 fixed_regs[j] = call_used_regs[j] = 1;
3946
3947 /* The 64-bit MS_ABI changes the set of call-used registers. */
3948 if (TARGET_64BIT_MS_ABI)
3949 {
3950 call_used_regs[SI_REG] = 0;
3951 call_used_regs[DI_REG] = 0;
3952 call_used_regs[XMM6_REG] = 0;
3953 call_used_regs[XMM7_REG] = 0;
3954 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3955 call_used_regs[i] = 0;
3956 }
3957
3958 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3959 other call-clobbered regs for 64-bit. */
3960 if (TARGET_64BIT)
3961 {
3962 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3963
3964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3966 && call_used_regs[i])
3967 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3968 }
3969
3970 /* If MMX is disabled, squash the registers. */
3971 if (! TARGET_MMX)
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3973 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3974 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3975
3976 /* If SSE is disabled, squash the registers. */
3977 if (! TARGET_SSE)
3978 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3979 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3981
3982 /* If the FPU is disabled, squash the registers. */
3983 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3984 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3985 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3986 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3987
3988 /* If 32-bit, squash the 64-bit registers. */
3989 if (! TARGET_64BIT)
3990 {
3991 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3992 reg_names[i] = "";
3993 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3994 reg_names[i] = "";
3995 }
3996 }
3997
3998 \f
3999 /* Save the current options */
4000
4001 static void
4002 ix86_function_specific_save (struct cl_target_option *ptr)
4003 {
4004 ptr->arch = ix86_arch;
4005 ptr->schedule = ix86_schedule;
4006 ptr->tune = ix86_tune;
4007 ptr->branch_cost = ix86_branch_cost;
4008 ptr->tune_defaulted = ix86_tune_defaulted;
4009 ptr->arch_specified = ix86_arch_specified;
4010 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4011 ptr->ix86_target_flags_explicit = target_flags_explicit;
4012 ptr->x_recip_mask_explicit = recip_mask_explicit;
4013
4014 /* The fields are char but the variables are not; make sure the
4015 values fit in the fields. */
4016 gcc_assert (ptr->arch == ix86_arch);
4017 gcc_assert (ptr->schedule == ix86_schedule);
4018 gcc_assert (ptr->tune == ix86_tune);
4019 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4020 }
4021
4022 /* Restore the current options */
4023
4024 static void
4025 ix86_function_specific_restore (struct cl_target_option *ptr)
4026 {
4027 enum processor_type old_tune = ix86_tune;
4028 enum processor_type old_arch = ix86_arch;
4029 unsigned int ix86_arch_mask, ix86_tune_mask;
4030 int i;
4031
4032 ix86_arch = (enum processor_type) ptr->arch;
4033 ix86_schedule = (enum attr_cpu) ptr->schedule;
4034 ix86_tune = (enum processor_type) ptr->tune;
4035 ix86_branch_cost = ptr->branch_cost;
4036 ix86_tune_defaulted = ptr->tune_defaulted;
4037 ix86_arch_specified = ptr->arch_specified;
4038 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4039 target_flags_explicit = ptr->ix86_target_flags_explicit;
4040 recip_mask_explicit = ptr->x_recip_mask_explicit;
4041
4042 /* Recreate the arch feature tests if the arch changed */
4043 if (old_arch != ix86_arch)
4044 {
4045 ix86_arch_mask = 1u << ix86_arch;
4046 for (i = 0; i < X86_ARCH_LAST; ++i)
4047 ix86_arch_features[i]
4048 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4049 }
4050
4051 /* Recreate the tune optimization tests */
4052 if (old_tune != ix86_tune)
4053 {
4054 ix86_tune_mask = 1u << ix86_tune;
4055 for (i = 0; i < X86_TUNE_LAST; ++i)
4056 ix86_tune_features[i]
4057 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4058 }
4059 }
4060
4061 /* Print the current options */
4062
4063 static void
4064 ix86_function_specific_print (FILE *file, int indent,
4065 struct cl_target_option *ptr)
4066 {
4067 char *target_string
4068 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4069 NULL, NULL, ptr->x_ix86_fpmath, false);
4070
4071 fprintf (file, "%*sarch = %d (%s)\n",
4072 indent, "",
4073 ptr->arch,
4074 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4075 ? cpu_names[ptr->arch]
4076 : "<unknown>"));
4077
4078 fprintf (file, "%*stune = %d (%s)\n",
4079 indent, "",
4080 ptr->tune,
4081 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4082 ? cpu_names[ptr->tune]
4083 : "<unknown>"));
4084
4085 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4086
4087 if (target_string)
4088 {
4089 fprintf (file, "%*s%s\n", indent, "", target_string);
4090 free (target_string);
4091 }
4092 }
4093
4094 \f
4095 /* Inner function to process the attribute((target(...))), take an argument and
4096 set the current options from the argument. If we have a list, recursively go
4097 over the list. */
4098
4099 static bool
4100 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4101 struct gcc_options *enum_opts_set)
4102 {
4103 char *next_optstr;
4104 bool ret = true;
4105
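  /* Helper macros for the attrs table below: S is the attribute string,
     O the corresponding command-line option, and M a target_flags mask
     for the simple on/off flags. */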
4106 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4107 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4108 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4109 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4110 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4111
4112 enum ix86_opt_type
4113 {
4114 ix86_opt_unknown,
4115 ix86_opt_yes,
4116 ix86_opt_no,
4117 ix86_opt_str,
4118 ix86_opt_enum,
4119 ix86_opt_isa
4120 };
4121
4122 static const struct
4123 {
4124 const char *string;
4125 size_t len;
4126 enum ix86_opt_type type;
4127 int opt;
4128 int mask;
4129 } attrs[] = {
4130 /* isa options */
4131 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4132 IX86_ATTR_ISA ("abm", OPT_mabm),
4133 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4134 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4135 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4136 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4137 IX86_ATTR_ISA ("aes", OPT_maes),
4138 IX86_ATTR_ISA ("avx", OPT_mavx),
4139 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4140 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4141 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4142 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4143 IX86_ATTR_ISA ("sse", OPT_msse),
4144 IX86_ATTR_ISA ("sse2", OPT_msse2),
4145 IX86_ATTR_ISA ("sse3", OPT_msse3),
4146 IX86_ATTR_ISA ("sse4", OPT_msse4),
4147 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4148 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4149 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4150 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4151 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4152 IX86_ATTR_ISA ("fma", OPT_mfma),
4153 IX86_ATTR_ISA ("xop", OPT_mxop),
4154 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4155 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4156 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4157 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4158
4159 /* enum options */
4160 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4161
4162 /* string options */
4163 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4164 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4165
4166 /* flag options */
4167 IX86_ATTR_YES ("cld",
4168 OPT_mcld,
4169 MASK_CLD),
4170
4171 IX86_ATTR_NO ("fancy-math-387",
4172 OPT_mfancy_math_387,
4173 MASK_NO_FANCY_MATH_387),
4174
4175 IX86_ATTR_YES ("ieee-fp",
4176 OPT_mieee_fp,
4177 MASK_IEEE_FP),
4178
4179 IX86_ATTR_YES ("inline-all-stringops",
4180 OPT_minline_all_stringops,
4181 MASK_INLINE_ALL_STRINGOPS),
4182
4183 IX86_ATTR_YES ("inline-stringops-dynamically",
4184 OPT_minline_stringops_dynamically,
4185 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4186
4187 IX86_ATTR_NO ("align-stringops",
4188 OPT_mno_align_stringops,
4189 MASK_NO_ALIGN_STRINGOPS),
4190
4191 IX86_ATTR_YES ("recip",
4192 OPT_mrecip,
4193 MASK_RECIP),
4194
4195 };
4196
4197 /* If this is a list, recurse to get the options. */
4198 if (TREE_CODE (args) == TREE_LIST)
4199 {
4200 bool ret = true;
4201
4202 for (; args; args = TREE_CHAIN (args))
4203 if (TREE_VALUE (args)
4204 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4205 p_strings, enum_opts_set))
4206 ret = false;
4207
4208 return ret;
4209 }
4210
4211 else if (TREE_CODE (args) != STRING_CST)
4212 gcc_unreachable ();
4213
4214 /* Handle multiple arguments separated by commas. */
4215 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4216
4217 while (next_optstr && *next_optstr != '\0')
4218 {
4219 char *p = next_optstr;
4220 char *orig_p = p;
4221 char *comma = strchr (next_optstr, ',');
4222 const char *opt_string;
4223 size_t len, opt_len;
4224 int opt;
4225 bool opt_set_p;
4226 char ch;
4227 unsigned i;
4228 enum ix86_opt_type type = ix86_opt_unknown;
4229 int mask = 0;
4230
4231 if (comma)
4232 {
4233 *comma = '\0';
4234 len = comma - next_optstr;
4235 next_optstr = comma + 1;
4236 }
4237 else
4238 {
4239 len = strlen (p);
4240 next_optstr = NULL;
4241 }
4242
4243 /* Recognize no-xxx. */
4244 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4245 {
4246 opt_set_p = false;
4247 p += 3;
4248 len -= 3;
4249 }
4250 else
4251 opt_set_p = true;
4252
4253 /* Find the option. */
4254 ch = *p;
4255 opt = N_OPTS;
4256 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4257 {
4258 type = attrs[i].type;
4259 opt_len = attrs[i].len;
4260 if (ch == attrs[i].string[0]
4261 && ((type != ix86_opt_str && type != ix86_opt_enum)
4262 ? len == opt_len
4263 : len > opt_len)
4264 && memcmp (p, attrs[i].string, opt_len) == 0)
4265 {
4266 opt = attrs[i].opt;
4267 mask = attrs[i].mask;
4268 opt_string = attrs[i].string;
4269 break;
4270 }
4271 }
4272
4273 /* Process the option. */
4274 if (opt == N_OPTS)
4275 {
4276 error ("attribute(target(\"%s\")) is unknown", orig_p);
4277 ret = false;
4278 }
4279
4280 else if (type == ix86_opt_isa)
4281 {
4282 struct cl_decoded_option decoded;
4283
4284 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4285 ix86_handle_option (&global_options, &global_options_set,
4286 &decoded, input_location);
4287 }
4288
4289 else if (type == ix86_opt_yes || type == ix86_opt_no)
4290 {
4291 if (type == ix86_opt_no)
4292 opt_set_p = !opt_set_p;
4293
4294 if (opt_set_p)
4295 target_flags |= mask;
4296 else
4297 target_flags &= ~mask;
4298 }
4299
4300 else if (type == ix86_opt_str)
4301 {
4302 if (p_strings[opt])
4303 {
4304 error ("option(\"%s\") was already specified", opt_string);
4305 ret = false;
4306 }
4307 else
4308 p_strings[opt] = xstrdup (p + opt_len);
4309 }
4310
4311 else if (type == ix86_opt_enum)
4312 {
4313 bool arg_ok;
4314 int value;
4315
4316 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4317 if (arg_ok)
4318 set_option (&global_options, enum_opts_set, opt, value,
4319 p + opt_len, DK_UNSPECIFIED, input_location,
4320 global_dc);
4321 else
4322 {
4323 error ("attribute(target(\"%s\")) is unknown", orig_p);
4324 ret = false;
4325 }
4326 }
4327
4328 else
4329 gcc_unreachable ();
4330 }
4331
4332 return ret;
4333 }
4334
4335 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4336
4337 tree
4338 ix86_valid_target_attribute_tree (tree args)
4339 {
4340 const char *orig_arch_string = ix86_arch_string;
4341 const char *orig_tune_string = ix86_tune_string;
4342 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4343 int orig_tune_defaulted = ix86_tune_defaulted;
4344 int orig_arch_specified = ix86_arch_specified;
4345 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4346 tree t = NULL_TREE;
4347 int i;
4348 struct cl_target_option *def
4349 = TREE_TARGET_OPTION (target_option_default_node);
4350 struct gcc_options enum_opts_set;
4351
4352 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4353
4354 /* Process each of the options on the chain. */
4355 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4356 &enum_opts_set))
4357 return NULL_TREE;
4358
4359 /* If the changed options are different from the default, rerun
4360 ix86_option_override_internal, and then save the options away.
4361 The string options are attribute options, and will be undone
4362 when we copy the save structure. */
4363 if (ix86_isa_flags != def->x_ix86_isa_flags
4364 || target_flags != def->x_target_flags
4365 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4366 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4367 || enum_opts_set.x_ix86_fpmath)
4368 {
4369 /* If we are using the default tune= or arch=, undo the string assigned,
4370 and use the default. */
4371 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4372 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4373 else if (!orig_arch_specified)
4374 ix86_arch_string = NULL;
4375
4376 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4377 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4378 else if (orig_tune_defaulted)
4379 ix86_tune_string = NULL;
4380
4381 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4382 if (enum_opts_set.x_ix86_fpmath)
4383 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4384 else if (!TARGET_64BIT && TARGET_SSE)
4385 {
4386 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4387 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4388 }
4389
4390 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4391 ix86_option_override_internal (false);
4392
4393 /* Add any builtin functions with the new isa if any. */
4394 ix86_add_new_builtins (ix86_isa_flags);
4395
4396 /* Save the current options unless we are validating options for
4397 #pragma. */
4398 t = build_target_option_node ();
4399
4400 ix86_arch_string = orig_arch_string;
4401 ix86_tune_string = orig_tune_string;
4402 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4403
4404 /* Free up memory allocated to hold the strings */
4405 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4406 free (option_strings[i]);
4407 }
4408
4409 return t;
4410 }
4411
4412 /* Hook to validate attribute((target("string"))). */
4413
4414 static bool
4415 ix86_valid_target_attribute_p (tree fndecl,
4416 tree ARG_UNUSED (name),
4417 tree args,
4418 int ARG_UNUSED (flags))
4419 {
4420 struct cl_target_option cur_target;
4421 bool ret = true;
4422 tree old_optimize = build_optimization_node ();
4423 tree new_target, new_optimize;
4424 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4425
4426 /* If the function changed the optimization levels as well as setting target
4427 options, start with the optimizations specified. */
4428 if (func_optimize && func_optimize != old_optimize)
4429 cl_optimization_restore (&global_options,
4430 TREE_OPTIMIZATION (func_optimize));
4431
4432 /* The target attributes may also change some optimization flags, so update
4433 the optimization options if necessary. */
4434 cl_target_option_save (&cur_target, &global_options);
4435 new_target = ix86_valid_target_attribute_tree (args);
4436 new_optimize = build_optimization_node ();
4437
4438 if (!new_target)
4439 ret = false;
4440
4441 else if (fndecl)
4442 {
4443 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4444
4445 if (old_optimize != new_optimize)
4446 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4447 }
4448
4449 cl_target_option_restore (&global_options, &cur_target);
4450
4451 if (old_optimize != new_optimize)
4452 cl_optimization_restore (&global_options,
4453 TREE_OPTIMIZATION (old_optimize));
4454
4455 return ret;
4456 }
4457
4458 \f
4459 /* Hook to determine if one function can safely inline another. */
4460
4461 static bool
4462 ix86_can_inline_p (tree caller, tree callee)
4463 {
4464 bool ret = false;
4465 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4466 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4467
4468 /* If callee has no option attributes, then it is ok to inline. */
4469 if (!callee_tree)
4470 ret = true;
4471
4472 /* If caller has no option attributes but callee does, then it is not ok to
4473 inline. */
4474 else if (!caller_tree)
4475 ret = false;
4476
4477 else
4478 {
4479 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4480 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4481
4482 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4483 function can inline an SSE2 function but an SSE2 function can't inline
4484 an SSE4 function. */
4485 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4486 != callee_opts->x_ix86_isa_flags)
4487 ret = false;
4488
4489 /* See if we have the same non-isa options. */
4490 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4491 ret = false;
4492
4493 /* See if arch, tune, etc. are the same. */
4494 else if (caller_opts->arch != callee_opts->arch)
4495 ret = false;
4496
4497 else if (caller_opts->tune != callee_opts->tune)
4498 ret = false;
4499
4500 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4501 ret = false;
4502
4503 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4504 ret = false;
4505
4506 else
4507 ret = true;
4508 }
4509
4510 return ret;
4511 }
4512
4513 \f
4514 /* Remember the last target of ix86_set_current_function. */
4515 static GTY(()) tree ix86_previous_fndecl;
4516
4517 /* Establish appropriate back-end context for processing the function
4518 FNDECL. The argument might be NULL to indicate processing at top
4519 level, outside of any function scope. */
4520 static void
4521 ix86_set_current_function (tree fndecl)
4522 {
4523 /* Only change the context if the function changes. This hook is called
4524 several times in the course of compiling a function, and we don't want to
4525 slow things down too much or call target_reinit when it isn't safe. */
4526 if (fndecl && fndecl != ix86_previous_fndecl)
4527 {
4528 tree old_tree = (ix86_previous_fndecl
4529 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4530 : NULL_TREE);
4531
4532 tree new_tree = (fndecl
4533 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4534 : NULL_TREE);
4535
4536 ix86_previous_fndecl = fndecl;
4537 if (old_tree == new_tree)
4538 ;
4539
4540 else if (new_tree)
4541 {
4542 cl_target_option_restore (&global_options,
4543 TREE_TARGET_OPTION (new_tree));
4544 target_reinit ();
4545 }
4546
4547 else if (old_tree)
4548 {
4549 struct cl_target_option *def
4550 = TREE_TARGET_OPTION (target_option_current_node);
4551
4552 cl_target_option_restore (&global_options, def);
4553 target_reinit ();
4554 }
4555 }
4556 }
4557
4558 \f
4559 /* Return true if this goes in large data/bss. */
4560
4561 static bool
4562 ix86_in_large_data_p (tree exp)
4563 {
4564 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4565 return false;
4566
4567 /* Functions are never large data. */
4568 if (TREE_CODE (exp) == FUNCTION_DECL)
4569 return false;
4570
4571 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4572 {
4573 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4574 if (strcmp (section, ".ldata") == 0
4575 || strcmp (section, ".lbss") == 0)
4576 return true;
4577 return false;
4578 }
4579 else
4580 {
4581 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4582
4583 /* If this is an incomplete type with size 0, then we can't put it
4584 in data because it might be too big when completed. */
4585 if (!size || size > ix86_section_threshold)
4586 return true;
4587 }
4588
4589 return false;
4590 }
4591
4592 /* Switch to the appropriate section for output of DECL.
4593 DECL is either a `VAR_DECL' node or a constant of some sort.
4594 RELOC indicates whether forming the initial value of DECL requires
4595 link-time relocations. */
4596
4597 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4598 ATTRIBUTE_UNUSED;
4599
4600 static section *
4601 x86_64_elf_select_section (tree decl, int reloc,
4602 unsigned HOST_WIDE_INT align)
4603 {
4604 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4605 && ix86_in_large_data_p (decl))
4606 {
4607 const char *sname = NULL;
4608 unsigned int flags = SECTION_WRITE;
4609 switch (categorize_decl_for_section (decl, reloc))
4610 {
4611 case SECCAT_DATA:
4612 sname = ".ldata";
4613 break;
4614 case SECCAT_DATA_REL:
4615 sname = ".ldata.rel";
4616 break;
4617 case SECCAT_DATA_REL_LOCAL:
4618 sname = ".ldata.rel.local";
4619 break;
4620 case SECCAT_DATA_REL_RO:
4621 sname = ".ldata.rel.ro";
4622 break;
4623 case SECCAT_DATA_REL_RO_LOCAL:
4624 sname = ".ldata.rel.ro.local";
4625 break;
4626 case SECCAT_BSS:
4627 sname = ".lbss";
4628 flags |= SECTION_BSS;
4629 break;
4630 case SECCAT_RODATA:
4631 case SECCAT_RODATA_MERGE_STR:
4632 case SECCAT_RODATA_MERGE_STR_INIT:
4633 case SECCAT_RODATA_MERGE_CONST:
4634 sname = ".lrodata";
4635 flags = 0;
4636 break;
4637 case SECCAT_SRODATA:
4638 case SECCAT_SDATA:
4639 case SECCAT_SBSS:
4640 gcc_unreachable ();
4641 case SECCAT_TEXT:
4642 case SECCAT_TDATA:
4643 case SECCAT_TBSS:
4644 /* We don't split these for the medium model. Place them into
4645 default sections and hope for the best. */
4646 break;
4647 }
4648 if (sname)
4649 {
4650 /* We might get called with string constants, but get_named_section
4651 doesn't like them as they are not DECLs. Also, we need to set
4652 flags in that case. */
4653 if (!DECL_P (decl))
4654 return get_section (sname, flags, NULL);
4655 return get_named_section (decl, sname, reloc);
4656 }
4657 }
4658 return default_elf_select_section (decl, reloc, align);
4659 }
4660
4661 /* Build up a unique section name, expressed as a
4662 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4663 RELOC indicates whether the initial value of EXP requires
4664 link-time relocations. */
4665
4666 static void ATTRIBUTE_UNUSED
4667 x86_64_elf_unique_section (tree decl, int reloc)
4668 {
4669 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4670 && ix86_in_large_data_p (decl))
4671 {
4672 const char *prefix = NULL;
4673 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4674 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4675
4676 switch (categorize_decl_for_section (decl, reloc))
4677 {
4678 case SECCAT_DATA:
4679 case SECCAT_DATA_REL:
4680 case SECCAT_DATA_REL_LOCAL:
4681 case SECCAT_DATA_REL_RO:
4682 case SECCAT_DATA_REL_RO_LOCAL:
4683 prefix = one_only ? ".ld" : ".ldata";
4684 break;
4685 case SECCAT_BSS:
4686 prefix = one_only ? ".lb" : ".lbss";
4687 break;
4688 case SECCAT_RODATA:
4689 case SECCAT_RODATA_MERGE_STR:
4690 case SECCAT_RODATA_MERGE_STR_INIT:
4691 case SECCAT_RODATA_MERGE_CONST:
4692 prefix = one_only ? ".lr" : ".lrodata";
4693 break;
4694 case SECCAT_SRODATA:
4695 case SECCAT_SDATA:
4696 case SECCAT_SBSS:
4697 gcc_unreachable ();
4698 case SECCAT_TEXT:
4699 case SECCAT_TDATA:
4700 case SECCAT_TBSS:
4701 /* We don't split these for the medium model. Place them into
4702 default sections and hope for the best. */
4703 break;
4704 }
4705 if (prefix)
4706 {
4707 const char *name, *linkonce;
4708 char *string;
4709
4710 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4711 name = targetm.strip_name_encoding (name);
4712
4713 /* If we're using one_only, then there needs to be a .gnu.linkonce
4714 prefix to the section name. */
4715 linkonce = one_only ? ".gnu.linkonce" : "";
4716
4717 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4718
4719 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4720 return;
4721 }
4722 }
4723 default_unique_section (decl, reloc);
4724 }
4725
4726 #ifdef COMMON_ASM_OP
4727 /* This says how to output assembler code to declare an
4728 uninitialized external linkage data object.
4729
4730 For medium model x86-64 we need to use the .largecomm directive for
4731 large objects. */
4732 void
4733 x86_elf_aligned_common (FILE *file,
4734 const char *name, unsigned HOST_WIDE_INT size,
4735 int align)
4736 {
4737 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4738 && size > (unsigned int)ix86_section_threshold)
4739 fputs (".largecomm\t", file);
4740 else
4741 fputs (COMMON_ASM_OP, file);
4742 assemble_name (file, name);
4743 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4744 size, align / BITS_PER_UNIT);
4745 }
4746 #endif
4747
4748 /* Utility function for targets to use in implementing
4749 ASM_OUTPUT_ALIGNED_BSS. */
4750
4751 void
4752 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4753 const char *name, unsigned HOST_WIDE_INT size,
4754 int align)
4755 {
4756 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4757 && size > (unsigned int)ix86_section_threshold)
4758 switch_to_section (get_named_section (decl, ".lbss", 0));
4759 else
4760 switch_to_section (bss_section);
4761 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4762 #ifdef ASM_DECLARE_OBJECT_NAME
4763 last_assemble_variable_decl = decl;
4764 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4765 #else
4766 /* The standard thing is just to output a label for the object. */
4767 ASM_OUTPUT_LABEL (file, name);
4768 #endif /* ASM_DECLARE_OBJECT_NAME */
4769 ASM_OUTPUT_SKIP (file, size ? size : 1);
4770 }
4771 \f
4772 /* Decide whether we must probe the stack before any space allocation
4773 on this target. It's essentially TARGET_STACK_PROBE except when
4774 -fstack-check causes the stack to be already probed differently. */
4775
4776 bool
4777 ix86_target_stack_probe (void)
4778 {
4779 /* Do not probe the stack twice if static stack checking is enabled. */
4780 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4781 return false;
4782
4783 return TARGET_STACK_PROBE;
4784 }
4785 \f
4786 /* Decide whether we can make a sibling call to a function. DECL is the
4787 declaration of the function being targeted by the call and EXP is the
4788 CALL_EXPR representing the call. */
4789
4790 static bool
4791 ix86_function_ok_for_sibcall (tree decl, tree exp)
4792 {
4793 tree type, decl_or_type;
4794 rtx a, b;
4795
4796 /* If we are generating position-independent code, we cannot sibcall
4797 optimize any indirect call, or a direct call to a global function,
4798 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4799 if (!TARGET_MACHO
4800 && !TARGET_64BIT
4801 && flag_pic
4802 && (!decl || !targetm.binds_local_p (decl)))
4803 return false;
4804
4805 /* If we need to align the outgoing stack, then sibcalling would
4806 unalign the stack, which may break the called function. */
4807 if (ix86_minimum_incoming_stack_boundary (true)
4808 < PREFERRED_STACK_BOUNDARY)
4809 return false;
4810
4811 if (decl)
4812 {
4813 decl_or_type = decl;
4814 type = TREE_TYPE (decl);
4815 }
4816 else
4817 {
4818 /* We're looking at the CALL_EXPR, we need the type of the function. */
4819 type = CALL_EXPR_FN (exp); /* pointer expression */
4820 type = TREE_TYPE (type); /* pointer type */
4821 type = TREE_TYPE (type); /* function type */
4822 decl_or_type = type;
4823 }
4824
4825 /* Check that the return value locations are the same. For example,
4826 if we are returning floats on the 80387 register stack, we cannot
4827 make a sibcall from a function that doesn't return a float to a
4828 function that does or, conversely, from a function that does return
4829 a float to a function that doesn't; the necessary stack adjustment
4830 would not be executed. This is also the place we notice
4831 differences in the return value ABI. Note that it is ok for one
4832 of the functions to have void return type as long as the return
4833 value of the other is passed in a register. */
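  /* A concrete case of this (illustrative only, assuming the 32-bit
     default ABI): a caller `int f (void)' returning in %eax cannot
     sibcall a callee `float g (void)' returning in %st(0); the
     FP-stack adjustment that must follow such a call would never be
     executed, so the checks below reject the sibcall when the two
     locations differ.  */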
4834 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4835 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4836 cfun->decl, false);
4837 if (STACK_REG_P (a) || STACK_REG_P (b))
4838 {
4839 if (!rtx_equal_p (a, b))
4840 return false;
4841 }
4842 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4843 {
4844 /* Disable sibcall if we need to generate vzeroupper after
4845 callee returns. */
4846 if (TARGET_VZEROUPPER
4847 && cfun->machine->callee_return_avx256_p
4848 && !cfun->machine->caller_return_avx256_p)
4849 return false;
4850 }
4851 else if (!rtx_equal_p (a, b))
4852 return false;
4853
4854 if (TARGET_64BIT)
4855 {
4856 /* The SYSV ABI has more call-clobbered registers;
4857 disallow sibcalls from MS to SYSV. */
4858 if (cfun->machine->call_abi == MS_ABI
4859 && ix86_function_type_abi (type) == SYSV_ABI)
4860 return false;
4861 }
4862 else
4863 {
4864 /* If this call is indirect, we'll need to be able to use a
4865 call-clobbered register for the address of the target function.
4866 Make sure that all such registers are not used for passing
4867 parameters. Note that DLLIMPORT functions are indirect. */
4868 if (!decl
4869 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4870 {
4871 if (ix86_function_regparm (type, NULL) >= 3)
4872 {
4873 /* ??? Need to count the actual number of registers to be used,
4874 not the possible number of registers. Fix later. */
4875 return false;
4876 }
4877 }
4878 }
4879
4880 /* Otherwise okay. That also includes certain types of indirect calls. */
4881 return true;
4882 }
4883
4884 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4885 and "sseregparm" calling convention attributes;
4886 arguments as in struct attribute_spec.handler. */
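/* Illustrative user-level uses of the attributes handled here (the
   declarations are examples only, not interfaces of this file):

	int __attribute__ ((regparm (3))) f (int, int, int);
	int __attribute__ ((fastcall)) g (int, int);

   f passes all three arguments in registers on ia32, and g passes its
   first two in %ecx and %edx; incompatible combinations such as
   `fastcall' together with `regparm' are diagnosed below.  */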
4887
4888 static tree
4889 ix86_handle_cconv_attribute (tree *node, tree name,
4890 tree args,
4891 int flags ATTRIBUTE_UNUSED,
4892 bool *no_add_attrs)
4893 {
4894 if (TREE_CODE (*node) != FUNCTION_TYPE
4895 && TREE_CODE (*node) != METHOD_TYPE
4896 && TREE_CODE (*node) != FIELD_DECL
4897 && TREE_CODE (*node) != TYPE_DECL)
4898 {
4899 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4900 name);
4901 *no_add_attrs = true;
4902 return NULL_TREE;
4903 }
4904
4905 /* Can combine regparm with all attributes but fastcall and thiscall. */
4906 if (is_attribute_p ("regparm", name))
4907 {
4908 tree cst;
4909
4910 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4911 {
4912 error ("fastcall and regparm attributes are not compatible");
4913 }
4914
4915 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4916 {
4917 error ("regparam and thiscall attributes are not compatible");
4918 }
4919
4920 cst = TREE_VALUE (args);
4921 if (TREE_CODE (cst) != INTEGER_CST)
4922 {
4923 warning (OPT_Wattributes,
4924 "%qE attribute requires an integer constant argument",
4925 name);
4926 *no_add_attrs = true;
4927 }
4928 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4929 {
4930 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4931 name, REGPARM_MAX);
4932 *no_add_attrs = true;
4933 }
4934
4935 return NULL_TREE;
4936 }
4937
4938 if (TARGET_64BIT)
4939 {
4940 /* Do not warn when emulating the MS ABI. */
4941 if ((TREE_CODE (*node) != FUNCTION_TYPE
4942 && TREE_CODE (*node) != METHOD_TYPE)
4943 || ix86_function_type_abi (*node) != MS_ABI)
4944 warning (OPT_Wattributes, "%qE attribute ignored",
4945 name);
4946 *no_add_attrs = true;
4947 return NULL_TREE;
4948 }
4949
4950 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4951 if (is_attribute_p ("fastcall", name))
4952 {
4953 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4954 {
4955 error ("fastcall and cdecl attributes are not compatible");
4956 }
4957 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4958 {
4959 error ("fastcall and stdcall attributes are not compatible");
4960 }
4961 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4962 {
4963 error ("fastcall and regparm attributes are not compatible");
4964 }
4965 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4966 {
4967 error ("fastcall and thiscall attributes are not compatible");
4968 }
4969 }
4970
4971 /* Can combine stdcall with fastcall (redundant), regparm and
4972 sseregparm. */
4973 else if (is_attribute_p ("stdcall", name))
4974 {
4975 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 {
4977 error ("stdcall and cdecl attributes are not compatible");
4978 }
4979 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4980 {
4981 error ("stdcall and fastcall attributes are not compatible");
4982 }
4983 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4984 {
4985 error ("stdcall and thiscall attributes are not compatible");
4986 }
4987 }
4988
4989 /* Can combine cdecl with regparm and sseregparm. */
4990 else if (is_attribute_p ("cdecl", name))
4991 {
4992 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4993 {
4994 error ("stdcall and cdecl attributes are not compatible");
4995 }
4996 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4997 {
4998 error ("fastcall and cdecl attributes are not compatible");
4999 }
5000 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5001 {
5002 error ("cdecl and thiscall attributes are not compatible");
5003 }
5004 }
5005 else if (is_attribute_p ("thiscall", name))
5006 {
5007 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5008 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5009 name);
5010 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5011 {
5012 error ("stdcall and thiscall attributes are not compatible");
5013 }
5014 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("fastcall and thiscall attributes are not compatible");
5017 }
5018 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5019 {
5020 error ("cdecl and thiscall attributes are not compatible");
5021 }
5022 }
5023
5024 /* Can combine sseregparm with all attributes. */
5025
5026 return NULL_TREE;
5027 }
5028
5029 /* The transactional memory builtins are implicitly regparm or fastcall
5030 depending on the ABI. Override the generic do-nothing attribute that
5031 these builtins were declared with, and replace it with one of the two
5032 attributes that we expect elsewhere. */
5033
5034 static tree
5035 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5036 tree args ATTRIBUTE_UNUSED,
5037 int flags ATTRIBUTE_UNUSED,
5038 bool *no_add_attrs)
5039 {
5040 tree alt;
5041
5042 /* In no case do we want to add the placeholder attribute. */
5043 *no_add_attrs = true;
5044
5045 /* The 64-bit ABI is unchanged for transactional memory. */
5046 if (TARGET_64BIT)
5047 return NULL_TREE;
5048
5049 /* ??? Is there a better way to validate 32-bit windows? We have
5050 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5051 if (CHECK_STACK_LIMIT > 0)
5052 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5053 else
5054 {
5055 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5056 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5057 }
5058 decl_attributes (node, alt, flags);
5059
5060 return NULL_TREE;
5061 }
5062
5063 /* This function determines from TYPE the calling-convention. */
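/* For example (illustrative): on ia32 a function type carrying the
   `fastcall' attribute yields IX86_CALLCVT_FASTCALL, a plain
   prototyped non-stdarg function compiled with -mrtd yields
   IX86_CALLCVT_STDCALL, and on x86-64 everything yields
   IX86_CALLCVT_CDECL.  */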
5064
5065 unsigned int
5066 ix86_get_callcvt (const_tree type)
5067 {
5068 unsigned int ret = 0;
5069 bool is_stdarg;
5070 tree attrs;
5071
5072 if (TARGET_64BIT)
5073 return IX86_CALLCVT_CDECL;
5074
5075 attrs = TYPE_ATTRIBUTES (type);
5076 if (attrs != NULL_TREE)
5077 {
5078 if (lookup_attribute ("cdecl", attrs))
5079 ret |= IX86_CALLCVT_CDECL;
5080 else if (lookup_attribute ("stdcall", attrs))
5081 ret |= IX86_CALLCVT_STDCALL;
5082 else if (lookup_attribute ("fastcall", attrs))
5083 ret |= IX86_CALLCVT_FASTCALL;
5084 else if (lookup_attribute ("thiscall", attrs))
5085 ret |= IX86_CALLCVT_THISCALL;
5086
5087 /* Regparm isn't allowed for thiscall and fastcall. */
5088 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5089 {
5090 if (lookup_attribute ("regparm", attrs))
5091 ret |= IX86_CALLCVT_REGPARM;
5092 if (lookup_attribute ("sseregparm", attrs))
5093 ret |= IX86_CALLCVT_SSEREGPARM;
5094 }
5095
5096 if (IX86_BASE_CALLCVT(ret) != 0)
5097 return ret;
5098 }
5099
5100 is_stdarg = stdarg_p (type);
5101 if (TARGET_RTD && !is_stdarg)
5102 return IX86_CALLCVT_STDCALL | ret;
5103
5104 if (ret != 0
5105 || is_stdarg
5106 || TREE_CODE (type) != METHOD_TYPE
5107 || ix86_function_type_abi (type) != MS_ABI)
5108 return IX86_CALLCVT_CDECL | ret;
5109
5110 return IX86_CALLCVT_THISCALL;
5111 }
5112
5113 /* Return 0 if the attributes for two types are incompatible, 1 if they
5114 are compatible, and 2 if they are nearly compatible (which causes a
5115 warning to be generated). */
5116
5117 static int
5118 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5119 {
5120 unsigned int ccvt1, ccvt2;
5121
5122 if (TREE_CODE (type1) != FUNCTION_TYPE
5123 && TREE_CODE (type1) != METHOD_TYPE)
5124 return 1;
5125
5126 ccvt1 = ix86_get_callcvt (type1);
5127 ccvt2 = ix86_get_callcvt (type2);
5128 if (ccvt1 != ccvt2)
5129 return 0;
5130 if (ix86_function_regparm (type1, NULL)
5131 != ix86_function_regparm (type2, NULL))
5132 return 0;
5133
5134 return 1;
5135 }
5136 \f
5137 /* Return the regparm value for a function with the indicated TYPE and DECL.
5138 DECL may be NULL when calling function indirectly
5139 or considering a libcall. */
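/* For instance (illustrative): `int __attribute__ ((regparm (2)))
   f (int, int)' yields 2, a fastcall function yields 2, a thiscall
   method yields 1, and a plain ia32 function yields ix86_regparm
   (0 unless -mregparm=N was given or the local-function optimization
   below kicks in).  */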
5140
5141 static int
5142 ix86_function_regparm (const_tree type, const_tree decl)
5143 {
5144 tree attr;
5145 int regparm;
5146 unsigned int ccvt;
5147
5148 if (TARGET_64BIT)
5149 return (ix86_function_type_abi (type) == SYSV_ABI
5150 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5151 ccvt = ix86_get_callcvt (type);
5152 regparm = ix86_regparm;
5153
5154 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5155 {
5156 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5157 if (attr)
5158 {
5159 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5160 return regparm;
5161 }
5162 }
5163 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5164 return 2;
5165 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5166 return 1;
5167
5168 /* Use register calling convention for local functions when possible. */
5169 if (decl
5170 && TREE_CODE (decl) == FUNCTION_DECL
5171 && optimize
5172 && !(profile_flag && !flag_fentry))
5173 {
5174 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5175 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5176 if (i && i->local && i->can_change_signature)
5177 {
5178 int local_regparm, globals = 0, regno;
5179
5180 /* Make sure no regparm register is taken by a
5181 fixed register variable. */
5182 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5183 if (fixed_regs[local_regparm])
5184 break;
5185
5186 /* We don't want to use regparm(3) for nested functions as
5187 these use a static chain pointer in the third argument. */
5188 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5189 local_regparm = 2;
5190
5191 /* In 32-bit mode save a register for the split stack. */
5192 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5193 local_regparm = 2;
5194
5195 /* Each fixed register usage increases register pressure,
5196 so fewer registers should be used for argument passing.
5197 This functionality can be overridden by an explicit
5198 regparm value. */
5199 for (regno = 0; regno <= DI_REG; regno++)
5200 if (fixed_regs[regno])
5201 globals++;
5202
5203 local_regparm
5204 = globals < local_regparm ? local_regparm - globals : 0;
5205
5206 if (local_regparm > regparm)
5207 regparm = local_regparm;
5208 }
5209 }
5210
5211 return regparm;
5212 }
5213
5214 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5215 DFmode (2) arguments in SSE registers for a function with the
5216 indicated TYPE and DECL. DECL may be NULL when calling function
5217 indirectly or considering a libcall. Otherwise return 0. */
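/* E.g. (illustrative): on ia32 with SSE enabled,
   `float __attribute__ ((sseregparm)) f (float x)' receives X in
   %xmm0 and this function returns 2; without SSE the errors below
   are issued and 0 is returned.  */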
5218
5219 static int
5220 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5221 {
5222 gcc_assert (!TARGET_64BIT);
5223
5224 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5225 by the sseregparm attribute. */
5226 if (TARGET_SSEREGPARM
5227 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5228 {
5229 if (!TARGET_SSE)
5230 {
5231 if (warn)
5232 {
5233 if (decl)
5234 error ("calling %qD with attribute sseregparm without "
5235 "SSE/SSE2 enabled", decl);
5236 else
5237 error ("calling %qT with attribute sseregparm without "
5238 "SSE/SSE2 enabled", type);
5239 }
5240 return 0;
5241 }
5242
5243 return 2;
5244 }
5245
5246 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5247 (and DFmode for SSE2) arguments in SSE registers. */
5248 if (decl && TARGET_SSE_MATH && optimize
5249 && !(profile_flag && !flag_fentry))
5250 {
5251 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5252 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5253 if (i && i->local && i->can_change_signature)
5254 return TARGET_SSE2 ? 2 : 1;
5255 }
5256
5257 return 0;
5258 }
5259
5260 /* Return true if EAX is live at the start of the function. Used by
5261 ix86_expand_prologue to determine if we need special help before
5262 calling allocate_stack_worker. */
5263
5264 static bool
5265 ix86_eax_live_at_start_p (void)
5266 {
5267 /* Cheat. Don't bother working forward from ix86_function_regparm
5268 to the function type to whether an actual argument is located in
5269 eax. Instead just look at cfg info, which is still close enough
5270 to correct at this point. This gives false positives for broken
5271 functions that might use uninitialized data that happens to be
5272 allocated in eax, but who cares? */
5273 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5274 }
5275
5276 static bool
5277 ix86_keep_aggregate_return_pointer (tree fntype)
5278 {
5279 tree attr;
5280
5281 if (!TARGET_64BIT)
5282 {
5283 attr = lookup_attribute ("callee_pop_aggregate_return",
5284 TYPE_ATTRIBUTES (fntype));
5285 if (attr)
5286 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5287
5288 /* For 32-bit MS-ABI the default is to keep aggregate
5289 return pointer. */
5290 if (ix86_function_type_abi (fntype) == MS_ABI)
5291 return true;
5292 }
5293 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5294 }
5295
5296 /* Value is the number of bytes of arguments automatically
5297 popped when returning from a subroutine call.
5298 FUNDECL is the declaration node of the function (as a tree),
5299 FUNTYPE is the data type of the function (as a tree),
5300 or for a library call it is an identifier node for the subroutine name.
5301 SIZE is the number of bytes of arguments passed on the stack.
5302
5303 On the 80386, the RTD insn may be used to pop them if the number
5304 of args is fixed, but if the number is variable then the caller
5305 must pop them all. RTD can't be used for library calls now
5306 because the library is compiled with the Unix compiler.
5307 Use of RTD is a selectable option, since it is incompatible with
5308 standard Unix calling sequences. If the option is not selected,
5309 the caller must always pop the args.
5310
5311 The attribute stdcall is equivalent to RTD on a per module basis. */
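/* For example (illustrative, -m32): `int __attribute__ ((stdcall))
   f (int a, int b)' makes the callee return with `ret $8', so this
   hook returns 8 for it; for an ordinary cdecl function it returns 0
   and the caller pops the arguments.  */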
5312
5313 static int
5314 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5315 {
5316 unsigned int ccvt;
5317
5318 /* None of the 64-bit ABIs pop arguments. */
5319 if (TARGET_64BIT)
5320 return 0;
5321
5322 ccvt = ix86_get_callcvt (funtype);
5323
5324 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5325 | IX86_CALLCVT_THISCALL)) != 0
5326 && ! stdarg_p (funtype))
5327 return size;
5328
5329 /* Lose any fake structure return argument if it is passed on the stack. */
5330 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5331 && !ix86_keep_aggregate_return_pointer (funtype))
5332 {
5333 int nregs = ix86_function_regparm (funtype, fundecl);
5334 if (nregs == 0)
5335 return GET_MODE_SIZE (Pmode);
5336 }
5337
5338 return 0;
5339 }
5340 \f
5341 /* Argument support functions. */
5342
5343 /* Return true when a register may be used to pass function parameters. */
5344 bool
5345 ix86_function_arg_regno_p (int regno)
5346 {
5347 int i;
5348 const int *parm_regs;
5349
5350 if (!TARGET_64BIT)
5351 {
5352 if (TARGET_MACHO)
5353 return (regno < REGPARM_MAX
5354 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5355 else
5356 return (regno < REGPARM_MAX
5357 || (TARGET_MMX && MMX_REGNO_P (regno)
5358 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5359 || (TARGET_SSE && SSE_REGNO_P (regno)
5360 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5361 }
5362
5363 if (TARGET_MACHO)
5364 {
5365 if (SSE_REGNO_P (regno) && TARGET_SSE)
5366 return true;
5367 }
5368 else
5369 {
5370 if (TARGET_SSE && SSE_REGNO_P (regno)
5371 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5372 return true;
5373 }
5374
5375 /* TODO: The function should depend on current function ABI but
5376 builtins.c would need updating then. Therefore we use the
5377 default ABI. */
5378
5379 /* RAX is used as hidden argument to va_arg functions. */
5380 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5381 return true;
5382
5383 if (ix86_abi == MS_ABI)
5384 parm_regs = x86_64_ms_abi_int_parameter_registers;
5385 else
5386 parm_regs = x86_64_int_parameter_registers;
5387 for (i = 0; i < (ix86_abi == MS_ABI
5388 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5389 if (regno == parm_regs[i])
5390 return true;
5391 return false;
5392 }
5393
5394 /* Return true if we do not know how to pass TYPE solely in registers. */
5395
5396 static bool
5397 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5398 {
5399 if (must_pass_in_stack_var_size_or_pad (mode, type))
5400 return true;
5401
5402 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5403 The layout_type routine is crafty and tries to trick us into passing
5404 currently unsupported vector types on the stack by using TImode. */
5405 return (!TARGET_64BIT && mode == TImode
5406 && type && TREE_CODE (type) != VECTOR_TYPE);
5407 }
5408
5409 /* Return the size, in bytes, of the area reserved for arguments passed
5410 in registers for the function represented by FNDECL, depending on the
5411 ABI used. */
5412 int
5413 ix86_reg_parm_stack_space (const_tree fndecl)
5414 {
5415 enum calling_abi call_abi = SYSV_ABI;
5416 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5417 call_abi = ix86_function_abi (fndecl);
5418 else
5419 call_abi = ix86_function_type_abi (fndecl);
5420 if (TARGET_64BIT && call_abi == MS_ABI)
5421 return 32;
5422 return 0;
5423 }
5424
5425 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5426 call ABI used. */
5427 enum calling_abi
5428 ix86_function_type_abi (const_tree fntype)
5429 {
5430 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5431 {
5432 enum calling_abi abi = ix86_abi;
5433 if (abi == SYSV_ABI)
5434 {
5435 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5436 abi = MS_ABI;
5437 }
5438 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5439 abi = SYSV_ABI;
5440 return abi;
5441 }
5442 return ix86_abi;
5443 }
5444
5445 static bool
5446 ix86_function_ms_hook_prologue (const_tree fn)
5447 {
5448 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5449 {
5450 if (decl_function_context (fn) != NULL_TREE)
5451 error_at (DECL_SOURCE_LOCATION (fn),
5452 "ms_hook_prologue is not compatible with nested function");
5453 else
5454 return true;
5455 }
5456 return false;
5457 }
5458
5459 static enum calling_abi
5460 ix86_function_abi (const_tree fndecl)
5461 {
5462 if (! fndecl)
5463 return ix86_abi;
5464 return ix86_function_type_abi (TREE_TYPE (fndecl));
5465 }
5466
5467 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5468 call ABI used. */
5469 enum calling_abi
5470 ix86_cfun_abi (void)
5471 {
5472 if (! cfun)
5473 return ix86_abi;
5474 return cfun->machine->call_abi;
5475 }
5476
5477 /* Write the extra assembler code needed to declare a function properly. */
5478
5479 void
5480 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5481 tree decl)
5482 {
5483 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5484
5485 if (is_ms_hook)
5486 {
5487 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5488 unsigned int filler_cc = 0xcccccccc;
5489
5490 for (i = 0; i < filler_count; i += 4)
5491 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5492 }
5493
5494 #ifdef SUBTARGET_ASM_UNWIND_INIT
5495 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5496 #endif
5497
5498 ASM_OUTPUT_LABEL (asm_out_file, fname);
5499
5500 /* Output magic byte marker, if hot-patch attribute is set. */
5501 if (is_ms_hook)
5502 {
5503 if (TARGET_64BIT)
5504 {
5505 /* leaq [%rsp + 0], %rsp */
5506 asm_fprintf (asm_out_file, ASM_BYTE
5507 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5508 }
5509 else
5510 {
5511 /* movl.s %edi, %edi
5512 push %ebp
5513 movl.s %esp, %ebp */
5514 asm_fprintf (asm_out_file, ASM_BYTE
5515 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5516 }
5517 }
5518 }
5519
5520 /* regclass.c */
5521 extern void init_regs (void);
5522
5523 /* Implementation of the call ABI switching target hook. The call
5524 register sets specific to FNDECL are set up. See also
5525 ix86_conditional_register_usage for more details. */
5526 void
5527 ix86_call_abi_override (const_tree fndecl)
5528 {
5529 if (fndecl == NULL_TREE)
5530 cfun->machine->call_abi = ix86_abi;
5531 else
5532 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5533 }
5534
5535 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5536 expensive re-initialization of init_regs each time we switch function context
5537 since this is needed only during RTL expansion. */
5538 static void
5539 ix86_maybe_switch_abi (void)
5540 {
5541 if (TARGET_64BIT &&
5542 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5543 reinit_regs ();
5544 }
5545
5546 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5547 for a call to a function whose data type is FNTYPE.
5548 For a library call, FNTYPE is 0. */
5549
5550 void
5551 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5552 tree fntype, /* tree ptr for function decl */
5553 rtx libname, /* SYMBOL_REF of library name or 0 */
5554 tree fndecl,
5555 int caller)
5556 {
5557 struct cgraph_local_info *i;
5558 tree fnret_type;
5559
5560 memset (cum, 0, sizeof (*cum));
5561
5562 /* Initialize for the current callee. */
5563 if (caller)
5564 {
5565 cfun->machine->callee_pass_avx256_p = false;
5566 cfun->machine->callee_return_avx256_p = false;
5567 }
5568
5569 if (fndecl)
5570 {
5571 i = cgraph_local_info (fndecl);
5572 cum->call_abi = ix86_function_abi (fndecl);
5573 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5574 }
5575 else
5576 {
5577 i = NULL;
5578 cum->call_abi = ix86_function_type_abi (fntype);
5579 if (fntype)
5580 fnret_type = TREE_TYPE (fntype);
5581 else
5582 fnret_type = NULL;
5583 }
5584
5585 if (TARGET_VZEROUPPER && fnret_type)
5586 {
5587 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5588 false);
5589 if (function_pass_avx256_p (fnret_value))
5590 {
5591 /* The return value of this function uses 256bit AVX modes. */
5592 if (caller)
5593 cfun->machine->callee_return_avx256_p = true;
5594 else
5595 cfun->machine->caller_return_avx256_p = true;
5596 }
5597 }
5598
5599 cum->caller = caller;
5600
5601 /* Set up the number of registers to use for passing arguments. */
5602
5603 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5604 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5605 "or subtarget optimization implying it");
5606 cum->nregs = ix86_regparm;
5607 if (TARGET_64BIT)
5608 {
5609 cum->nregs = (cum->call_abi == SYSV_ABI
5610 ? X86_64_REGPARM_MAX
5611 : X86_64_MS_REGPARM_MAX);
5612 }
5613 if (TARGET_SSE)
5614 {
5615 cum->sse_nregs = SSE_REGPARM_MAX;
5616 if (TARGET_64BIT)
5617 {
5618 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5619 ? X86_64_SSE_REGPARM_MAX
5620 : X86_64_MS_SSE_REGPARM_MAX);
5621 }
5622 }
5623 if (TARGET_MMX)
5624 cum->mmx_nregs = MMX_REGPARM_MAX;
5625 cum->warn_avx = true;
5626 cum->warn_sse = true;
5627 cum->warn_mmx = true;
5628
5629 /* Because types might mismatch between caller and callee, we need to
5630 use the actual type of the function for local calls.
5631 FIXME: cgraph_analyze can be told to actually record if function uses
5632 va_start so for local functions maybe_vaarg can be made aggressive
5633 helping K&R code.
5634 FIXME: once the type system is fixed, we won't need this code anymore. */
5635 if (i && i->local && i->can_change_signature)
5636 fntype = TREE_TYPE (fndecl);
5637 cum->maybe_vaarg = (fntype
5638 ? (!prototype_p (fntype) || stdarg_p (fntype))
5639 : !libname);
5640
5641 if (!TARGET_64BIT)
5642 {
5643 /* If there are variable arguments, then we won't pass anything
5644 in registers in 32-bit mode. */
5645 if (stdarg_p (fntype))
5646 {
5647 cum->nregs = 0;
5648 cum->sse_nregs = 0;
5649 cum->mmx_nregs = 0;
5650 cum->warn_avx = 0;
5651 cum->warn_sse = 0;
5652 cum->warn_mmx = 0;
5653 return;
5654 }
5655
5656 /* Use ecx and edx registers if function has fastcall attribute,
5657 else look for regparm information. */
5658 if (fntype)
5659 {
5660 unsigned int ccvt = ix86_get_callcvt (fntype);
5661 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5662 {
5663 cum->nregs = 1;
5664 cum->fastcall = 1; /* Same first register as in fastcall. */
5665 }
5666 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5667 {
5668 cum->nregs = 2;
5669 cum->fastcall = 1;
5670 }
5671 else
5672 cum->nregs = ix86_function_regparm (fntype, fndecl);
5673 }
5674
5675 /* Set up the number of SSE registers used for passing SFmode
5676 and DFmode arguments. Warn for mismatching ABI. */
5677 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5678 }
5679 }
5680
5681 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5682 But in the case of vector types, it is some vector mode.
5683
5684 When we have only some of our vector isa extensions enabled, then there
5685 are some modes for which vector_mode_supported_p is false. For these
5686 modes, the generic vector support in gcc will choose some non-vector mode
5687 in order to implement the type. By computing the natural mode, we'll
5688 select the proper ABI location for the operand and not depend on whatever
5689 the middle-end decides to do with these vector types.
5690
5691 The middle-end can't deal with vector types larger than 16 bytes. In this
5692 case, we return the original mode and warn about the ABI change if CUM isn't
5693 NULL. */
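/* As an example of the above (illustrative only): with SSE2 but not
   AVX enabled, `typedef int v8si __attribute__ ((vector_size (32)));'
   has no supported vector mode, yet this function still finds
   V8SImode for it; since that mode is 32 bytes and AVX is off, the
   ABI-change warning below may be issued and TYPE_MODE is returned
   instead.  */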
5694
5695 static enum machine_mode
5696 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5697 {
5698 enum machine_mode mode = TYPE_MODE (type);
5699
5700 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5701 {
5702 HOST_WIDE_INT size = int_size_in_bytes (type);
5703 if ((size == 8 || size == 16 || size == 32)
5704 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5705 && TYPE_VECTOR_SUBPARTS (type) > 1)
5706 {
5707 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5708
5709 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5710 mode = MIN_MODE_VECTOR_FLOAT;
5711 else
5712 mode = MIN_MODE_VECTOR_INT;
5713
5714 /* Get the mode which has this inner mode and number of units. */
5715 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5716 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5717 && GET_MODE_INNER (mode) == innermode)
5718 {
5719 if (size == 32 && !TARGET_AVX)
5720 {
5721 static bool warnedavx;
5722
5723 if (cum
5724 && !warnedavx
5725 && cum->warn_avx)
5726 {
5727 warnedavx = true;
5728 warning (0, "AVX vector argument without AVX "
5729 "enabled changes the ABI");
5730 }
5731 return TYPE_MODE (type);
5732 }
5733 else
5734 return mode;
5735 }
5736
5737 gcc_unreachable ();
5738 }
5739 }
5740
5741 return mode;
5742 }
5743
5744 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5745 this may not agree with the mode that the type system has chosen for the
5746 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5747 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5748
5749 static rtx
5750 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5751 unsigned int regno)
5752 {
5753 rtx tmp;
5754
5755 if (orig_mode != BLKmode)
5756 tmp = gen_rtx_REG (orig_mode, regno);
5757 else
5758 {
5759 tmp = gen_rtx_REG (mode, regno);
5760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5761 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5762 }
5763
5764 return tmp;
5765 }
5766
5767 /* x86-64 register passing implementation. See the x86-64 ABI for details. The
5768 goal of this code is to classify each eightbyte of an incoming argument by
5769 register class and assign registers accordingly. */
5770
5771 /* Return the union class of CLASS1 and CLASS2.
5772 See the x86-64 PS ABI for details. */
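/* For instance (illustrative): in `union { int i; float f; }' the
   single eightbyte is classified X86_64_INTEGERSI_CLASS for the int
   and X86_64_SSESF_CLASS for the float; rule #4 merges these to
   X86_64_INTEGERSI_CLASS, so the union is passed in an integer
   register.  */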
5773
5774 static enum x86_64_reg_class
5775 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5776 {
5777 /* Rule #1: If both classes are equal, this is the resulting class. */
5778 if (class1 == class2)
5779 return class1;
5780
5781 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5782 the other class. */
5783 if (class1 == X86_64_NO_CLASS)
5784 return class2;
5785 if (class2 == X86_64_NO_CLASS)
5786 return class1;
5787
5788 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5789 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5790 return X86_64_MEMORY_CLASS;
5791
5792 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5793 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5794 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5795 return X86_64_INTEGERSI_CLASS;
5796 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5797 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5798 return X86_64_INTEGER_CLASS;
5799
5800 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5801 MEMORY is used. */
5802 if (class1 == X86_64_X87_CLASS
5803 || class1 == X86_64_X87UP_CLASS
5804 || class1 == X86_64_COMPLEX_X87_CLASS
5805 || class2 == X86_64_X87_CLASS
5806 || class2 == X86_64_X87UP_CLASS
5807 || class2 == X86_64_COMPLEX_X87_CLASS)
5808 return X86_64_MEMORY_CLASS;
5809
5810 /* Rule #6: Otherwise class SSE is used. */
5811 return X86_64_SSE_CLASS;
5812 }
5813
5814 /* Classify the argument of type TYPE and mode MODE.
5815 CLASSES will be filled by the register class used to pass each word
5816 of the operand. The number of words is returned. In case the parameter
5817 should be passed in memory, 0 is returned. As a special case for zero
5818 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5819
5820 BIT_OFFSET is used internally for handling records; it specifies the
5821 offset in bits modulo 256 to avoid overflow cases.
5822
5823 See the x86-64 PS ABI for details.
5824 */
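/* As a worked example (illustrative only): `struct { double d; long l; }'
   occupies 16 bytes; the first eightbyte is classified
   X86_64_SSEDF_CLASS and the second X86_64_INTEGER_CLASS, so 2 is
   returned and the struct travels in one SSE and one integer
   register.  A 40-byte struct exceeds the 32-byte limit below and
   yields 0, i.e. memory.  */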
5825
5826 static int
5827 classify_argument (enum machine_mode mode, const_tree type,
5828 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5829 {
5830 HOST_WIDE_INT bytes =
5831 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5832 int words
5833 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5834
5835 /* Variable sized entities are always passed/returned in memory. */
5836 if (bytes < 0)
5837 return 0;
5838
5839 if (mode != VOIDmode
5840 && targetm.calls.must_pass_in_stack (mode, type))
5841 return 0;
5842
5843 if (type && AGGREGATE_TYPE_P (type))
5844 {
5845 int i;
5846 tree field;
5847 enum x86_64_reg_class subclasses[MAX_CLASSES];
5848
5849 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5850 if (bytes > 32)
5851 return 0;
5852
5853 for (i = 0; i < words; i++)
5854 classes[i] = X86_64_NO_CLASS;
5855
5856 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5857 signal the memory class, so handle this as a special case. */
5858 if (!words)
5859 {
5860 classes[0] = X86_64_NO_CLASS;
5861 return 1;
5862 }
5863
5864 /* Classify each field of record and merge classes. */
5865 switch (TREE_CODE (type))
5866 {
5867 case RECORD_TYPE:
5868 /* And now merge the fields of structure. */
5869 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5870 {
5871 if (TREE_CODE (field) == FIELD_DECL)
5872 {
5873 int num;
5874
5875 if (TREE_TYPE (field) == error_mark_node)
5876 continue;
5877
5878 /* Bitfields are always classified as integer. Handle them
5879 early, since later code would consider them to be
5880 misaligned integers. */
5881 if (DECL_BIT_FIELD (field))
5882 {
5883 for (i = (int_bit_position (field)
5884 + (bit_offset % 64)) / 8 / 8;
5885 i < ((int_bit_position (field) + (bit_offset % 64))
5886 + tree_low_cst (DECL_SIZE (field), 0)
5887 + 63) / 8 / 8; i++)
5888 classes[i] =
5889 merge_classes (X86_64_INTEGER_CLASS,
5890 classes[i]);
5891 }
5892 else
5893 {
5894 int pos;
5895
5896 type = TREE_TYPE (field);
5897
5898 /* Flexible array member is ignored. */
5899 if (TYPE_MODE (type) == BLKmode
5900 && TREE_CODE (type) == ARRAY_TYPE
5901 && TYPE_SIZE (type) == NULL_TREE
5902 && TYPE_DOMAIN (type) != NULL_TREE
5903 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5904 == NULL_TREE))
5905 {
5906 static bool warned;
5907
5908 if (!warned && warn_psabi)
5909 {
5910 warned = true;
5911 inform (input_location,
5912 "the ABI of passing struct with"
5913 " a flexible array member has"
5914 " changed in GCC 4.4");
5915 }
5916 continue;
5917 }
5918 num = classify_argument (TYPE_MODE (type), type,
5919 subclasses,
5920 (int_bit_position (field)
5921 + bit_offset) % 256);
5922 if (!num)
5923 return 0;
5924 pos = (int_bit_position (field)
5925 + (bit_offset % 64)) / 8 / 8;
5926 for (i = 0; i < num && (i + pos) < words; i++)
5927 classes[i + pos] =
5928 merge_classes (subclasses[i], classes[i + pos]);
5929 }
5930 }
5931 }
5932 break;
5933
5934 case ARRAY_TYPE:
5935 /* Arrays are handled as small records. */
5936 {
5937 int num;
5938 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5939 TREE_TYPE (type), subclasses, bit_offset);
5940 if (!num)
5941 return 0;
5942
5943 /* The partial classes are now full classes. */
5944 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5945 subclasses[0] = X86_64_SSE_CLASS;
5946 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5947 && !((bit_offset % 64) == 0 && bytes == 4))
5948 subclasses[0] = X86_64_INTEGER_CLASS;
5949
5950 for (i = 0; i < words; i++)
5951 classes[i] = subclasses[i % num];
5952
5953 break;
5954 }
5955 case UNION_TYPE:
5956 case QUAL_UNION_TYPE:
5957 /* Unions are similar to RECORD_TYPE but offset is always 0.
5958 */
5959 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5960 {
5961 if (TREE_CODE (field) == FIELD_DECL)
5962 {
5963 int num;
5964
5965 if (TREE_TYPE (field) == error_mark_node)
5966 continue;
5967
5968 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5969 TREE_TYPE (field), subclasses,
5970 bit_offset);
5971 if (!num)
5972 return 0;
5973 for (i = 0; i < num; i++)
5974 classes[i] = merge_classes (subclasses[i], classes[i]);
5975 }
5976 }
5977 break;
5978
5979 default:
5980 gcc_unreachable ();
5981 }
5982
5983 if (words > 2)
5984 {
5985 /* When size > 16 bytes, if the first class isn't
5986 X86_64_SSE_CLASS or any of the others isn't
5987 X86_64_SSEUP_CLASS, everything should be passed in
5988 memory. */
5989 if (classes[0] != X86_64_SSE_CLASS)
5990 return 0;
5991
5992 for (i = 1; i < words; i++)
5993 if (classes[i] != X86_64_SSEUP_CLASS)
5994 return 0;
5995 }
5996
5997 /* Final merger cleanup. */
5998 for (i = 0; i < words; i++)
5999 {
6000 /* If one class is MEMORY, everything should be passed in
6001 memory. */
6002 if (classes[i] == X86_64_MEMORY_CLASS)
6003 return 0;
6004
6005 /* The X86_64_SSEUP_CLASS should always be preceded by
6006 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6007 if (classes[i] == X86_64_SSEUP_CLASS
6008 && classes[i - 1] != X86_64_SSE_CLASS
6009 && classes[i - 1] != X86_64_SSEUP_CLASS)
6010 {
6011 /* The first one should never be X86_64_SSEUP_CLASS. */
6012 gcc_assert (i != 0);
6013 classes[i] = X86_64_SSE_CLASS;
6014 }
6015
6016 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6017 everything should be passed in memory. */
6018 if (classes[i] == X86_64_X87UP_CLASS
6019 && (classes[i - 1] != X86_64_X87_CLASS))
6020 {
6021 static bool warned;
6022
6023 /* The first one should never be X86_64_X87UP_CLASS. */
6024 gcc_assert (i != 0);
6025 if (!warned && warn_psabi)
6026 {
6027 warned = true;
6028 inform (input_location,
6029 "the ABI of passing union with long double"
6030 " has changed in GCC 4.4");
6031 }
6032 return 0;
6033 }
6034 }
6035 return words;
6036 }
6037
6038 /* Compute alignment needed. We align all types to natural boundaries with
6039 the exception of XFmode, which is aligned to 64 bits. */
6040 if (mode != VOIDmode && mode != BLKmode)
6041 {
6042 int mode_alignment = GET_MODE_BITSIZE (mode);
6043
6044 if (mode == XFmode)
6045 mode_alignment = 128;
6046 else if (mode == XCmode)
6047 mode_alignment = 256;
6048 if (COMPLEX_MODE_P (mode))
6049 mode_alignment /= 2;
6050 /* Misaligned fields are always returned in memory. */
6051 if (bit_offset % mode_alignment)
6052 return 0;
6053 }
6054
6055 /* For V1xx modes, just use the base mode. */
6056 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6057 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6058 mode = GET_MODE_INNER (mode);
6059
6060 /* Classification of atomic types. */
6061 switch (mode)
6062 {
6063 case SDmode:
6064 case DDmode:
6065 classes[0] = X86_64_SSE_CLASS;
6066 return 1;
6067 case TDmode:
6068 classes[0] = X86_64_SSE_CLASS;
6069 classes[1] = X86_64_SSEUP_CLASS;
6070 return 2;
6071 case DImode:
6072 case SImode:
6073 case HImode:
6074 case QImode:
6075 case CSImode:
6076 case CHImode:
6077 case CQImode:
6078 {
6079 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6080
6081 if (size <= 32)
6082 {
6083 classes[0] = X86_64_INTEGERSI_CLASS;
6084 return 1;
6085 }
6086 else if (size <= 64)
6087 {
6088 classes[0] = X86_64_INTEGER_CLASS;
6089 return 1;
6090 }
6091 else if (size <= 64+32)
6092 {
6093 classes[0] = X86_64_INTEGER_CLASS;
6094 classes[1] = X86_64_INTEGERSI_CLASS;
6095 return 2;
6096 }
6097 else if (size <= 64+64)
6098 {
6099 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6100 return 2;
6101 }
6102 else
6103 gcc_unreachable ();
6104 }
6105 case CDImode:
6106 case TImode:
6107 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6108 return 2;
6109 case COImode:
6110 case OImode:
6111 /* OImode shouldn't be used directly. */
6112 gcc_unreachable ();
6113 case CTImode:
6114 return 0;
6115 case SFmode:
6116 if (!(bit_offset % 64))
6117 classes[0] = X86_64_SSESF_CLASS;
6118 else
6119 classes[0] = X86_64_SSE_CLASS;
6120 return 1;
6121 case DFmode:
6122 classes[0] = X86_64_SSEDF_CLASS;
6123 return 1;
6124 case XFmode:
6125 classes[0] = X86_64_X87_CLASS;
6126 classes[1] = X86_64_X87UP_CLASS;
6127 return 2;
6128 case TFmode:
6129 classes[0] = X86_64_SSE_CLASS;
6130 classes[1] = X86_64_SSEUP_CLASS;
6131 return 2;
6132 case SCmode:
6133 classes[0] = X86_64_SSE_CLASS;
6134 if (!(bit_offset % 64))
6135 return 1;
6136 else
6137 {
6138 static bool warned;
6139
6140 if (!warned && warn_psabi)
6141 {
6142 warned = true;
6143 inform (input_location,
6144 "the ABI of passing structure with complex float"
6145 " member has changed in GCC 4.4");
6146 }
6147 classes[1] = X86_64_SSESF_CLASS;
6148 return 2;
6149 }
6150 case DCmode:
6151 classes[0] = X86_64_SSEDF_CLASS;
6152 classes[1] = X86_64_SSEDF_CLASS;
6153 return 2;
6154 case XCmode:
6155 classes[0] = X86_64_COMPLEX_X87_CLASS;
6156 return 1;
6157 case TCmode:
6158 /* This mode is larger than 16 bytes. */
6159 return 0;
6160 case V8SFmode:
6161 case V8SImode:
6162 case V32QImode:
6163 case V16HImode:
6164 case V4DFmode:
6165 case V4DImode:
6166 classes[0] = X86_64_SSE_CLASS;
6167 classes[1] = X86_64_SSEUP_CLASS;
6168 classes[2] = X86_64_SSEUP_CLASS;
6169 classes[3] = X86_64_SSEUP_CLASS;
6170 return 4;
6171 case V4SFmode:
6172 case V4SImode:
6173 case V16QImode:
6174 case V8HImode:
6175 case V2DFmode:
6176 case V2DImode:
6177 classes[0] = X86_64_SSE_CLASS;
6178 classes[1] = X86_64_SSEUP_CLASS;
6179 return 2;
6180 case V1TImode:
6181 case V1DImode:
6182 case V2SFmode:
6183 case V2SImode:
6184 case V4HImode:
6185 case V8QImode:
6186 classes[0] = X86_64_SSE_CLASS;
6187 return 1;
6188 case BLKmode:
6189 case VOIDmode:
6190 return 0;
6191 default:
6192 gcc_assert (VECTOR_MODE_P (mode));
6193
6194 if (bytes > 16)
6195 return 0;
6196
6197 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6198
6199 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6200 classes[0] = X86_64_INTEGERSI_CLASS;
6201 else
6202 classes[0] = X86_64_INTEGER_CLASS;
6203 classes[1] = X86_64_INTEGER_CLASS;
6204 return 1 + (bytes > 8);
6205 }
6206 }
6207
6208 /* Examine the argument and set the number of registers required in each
6209 class. Return 0 iff the parameter should be passed in memory. */
6210 static int
6211 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6212 int *int_nregs, int *sse_nregs)
6213 {
6214 enum x86_64_reg_class regclass[MAX_CLASSES];
6215 int n = classify_argument (mode, type, regclass, 0);
6216
6217 *int_nregs = 0;
6218 *sse_nregs = 0;
6219 if (!n)
6220 return 0;
6221 for (n--; n >= 0; n--)
6222 switch (regclass[n])
6223 {
6224 case X86_64_INTEGER_CLASS:
6225 case X86_64_INTEGERSI_CLASS:
6226 (*int_nregs)++;
6227 break;
6228 case X86_64_SSE_CLASS:
6229 case X86_64_SSESF_CLASS:
6230 case X86_64_SSEDF_CLASS:
6231 (*sse_nregs)++;
6232 break;
6233 case X86_64_NO_CLASS:
6234 case X86_64_SSEUP_CLASS:
6235 break;
6236 case X86_64_X87_CLASS:
6237 case X86_64_X87UP_CLASS:
6238 if (!in_return)
6239 return 0;
6240 break;
6241 case X86_64_COMPLEX_X87_CLASS:
6242 return in_return ? 2 : 0;
6243 case X86_64_MEMORY_CLASS:
6244 gcc_unreachable ();
6245 }
6246 return 1;
6247 }
6248
6249 /* Construct container for the argument used by GCC interface. See
6250 FUNCTION_ARG for the detailed description. */
6251
6252 static rtx
6253 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6254 const_tree type, int in_return, int nintregs, int nsseregs,
6255 const int *intreg, int sse_regno)
6256 {
6257 /* The following variables hold the static issued_error state. */
6258 static bool issued_sse_arg_error;
6259 static bool issued_sse_ret_error;
6260 static bool issued_x87_ret_error;
6261
6262 enum machine_mode tmpmode;
6263 int bytes =
6264 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6265 enum x86_64_reg_class regclass[MAX_CLASSES];
6266 int n;
6267 int i;
6268 int nexps = 0;
6269 int needed_sseregs, needed_intregs;
6270 rtx exp[MAX_CLASSES];
6271 rtx ret;
6272
6273 n = classify_argument (mode, type, regclass, 0);
6274 if (!n)
6275 return NULL;
6276 if (!examine_argument (mode, type, in_return, &needed_intregs,
6277 &needed_sseregs))
6278 return NULL;
6279 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6280 return NULL;
6281
6282 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6283 some less clueful developer tries to use floating-point anyway. */
6284 if (needed_sseregs && !TARGET_SSE)
6285 {
6286 if (in_return)
6287 {
6288 if (!issued_sse_ret_error)
6289 {
6290 error ("SSE register return with SSE disabled");
6291 issued_sse_ret_error = true;
6292 }
6293 }
6294 else if (!issued_sse_arg_error)
6295 {
6296 error ("SSE register argument with SSE disabled");
6297 issued_sse_arg_error = true;
6298 }
6299 return NULL;
6300 }
6301
6302 /* Likewise, error if the ABI requires us to return values in the
6303 x87 registers and the user specified -mno-80387. */
6304 if (!TARGET_80387 && in_return)
6305 for (i = 0; i < n; i++)
6306 if (regclass[i] == X86_64_X87_CLASS
6307 || regclass[i] == X86_64_X87UP_CLASS
6308 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6309 {
6310 if (!issued_x87_ret_error)
6311 {
6312 error ("x87 register return with x87 disabled");
6313 issued_x87_ret_error = true;
6314 }
6315 return NULL;
6316 }
6317
6318 /* First construct simple cases. Avoid SCmode, since we want to use
6319 single register to pass this type. */
6320 if (n == 1 && mode != SCmode)
6321 switch (regclass[0])
6322 {
6323 case X86_64_INTEGER_CLASS:
6324 case X86_64_INTEGERSI_CLASS:
6325 return gen_rtx_REG (mode, intreg[0]);
6326 case X86_64_SSE_CLASS:
6327 case X86_64_SSESF_CLASS:
6328 case X86_64_SSEDF_CLASS:
6329 if (mode != BLKmode)
6330 return gen_reg_or_parallel (mode, orig_mode,
6331 SSE_REGNO (sse_regno));
6332 break;
6333 case X86_64_X87_CLASS:
6334 case X86_64_COMPLEX_X87_CLASS:
6335 return gen_rtx_REG (mode, FIRST_STACK_REG);
6336 case X86_64_NO_CLASS:
6337 /* Zero sized array, struct or class. */
6338 return NULL;
6339 default:
6340 gcc_unreachable ();
6341 }
6342 if (n == 2
6343 && regclass[0] == X86_64_SSE_CLASS
6344 && regclass[1] == X86_64_SSEUP_CLASS
6345 && mode != BLKmode)
6346 return gen_reg_or_parallel (mode, orig_mode,
6347 SSE_REGNO (sse_regno));
6348 if (n == 4
6349 && regclass[0] == X86_64_SSE_CLASS
6350 && regclass[1] == X86_64_SSEUP_CLASS
6351 && regclass[2] == X86_64_SSEUP_CLASS
6352 && regclass[3] == X86_64_SSEUP_CLASS
6353 && mode != BLKmode)
6354 return gen_reg_or_parallel (mode, orig_mode,
6355 SSE_REGNO (sse_regno));
6356 if (n == 2
6357 && regclass[0] == X86_64_X87_CLASS
6358 && regclass[1] == X86_64_X87UP_CLASS)
6359 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6360
6361 if (n == 2
6362 && regclass[0] == X86_64_INTEGER_CLASS
6363 && regclass[1] == X86_64_INTEGER_CLASS
6364 && (mode == CDImode || mode == TImode || mode == TFmode)
6365 && intreg[0] + 1 == intreg[1])
6366 return gen_rtx_REG (mode, intreg[0]);
6367
6368 /* Otherwise figure out the entries of the PARALLEL. */
6369 for (i = 0; i < n; i++)
6370 {
6371 int pos;
6372
6373 switch (regclass[i])
6374 {
6375 case X86_64_NO_CLASS:
6376 break;
6377 case X86_64_INTEGER_CLASS:
6378 case X86_64_INTEGERSI_CLASS:
6379 /* Merge TImodes on aligned occasions here too. */
6380 if (i * 8 + 8 > bytes)
6381 tmpmode
6382 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6383 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6384 tmpmode = SImode;
6385 else
6386 tmpmode = DImode;
6387 /* We've requested 24 bytes for which we
6388 don't have a mode. Use DImode. */
6389 if (tmpmode == BLKmode)
6390 tmpmode = DImode;
6391 exp [nexps++]
6392 = gen_rtx_EXPR_LIST (VOIDmode,
6393 gen_rtx_REG (tmpmode, *intreg),
6394 GEN_INT (i*8));
6395 intreg++;
6396 break;
6397 case X86_64_SSESF_CLASS:
6398 exp [nexps++]
6399 = gen_rtx_EXPR_LIST (VOIDmode,
6400 gen_rtx_REG (SFmode,
6401 SSE_REGNO (sse_regno)),
6402 GEN_INT (i*8));
6403 sse_regno++;
6404 break;
6405 case X86_64_SSEDF_CLASS:
6406 exp [nexps++]
6407 = gen_rtx_EXPR_LIST (VOIDmode,
6408 gen_rtx_REG (DFmode,
6409 SSE_REGNO (sse_regno)),
6410 GEN_INT (i*8));
6411 sse_regno++;
6412 break;
6413 case X86_64_SSE_CLASS:
6414 pos = i;
6415 switch (n)
6416 {
6417 case 1:
6418 tmpmode = DImode;
6419 break;
6420 case 2:
6421 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6422 {
6423 tmpmode = TImode;
6424 i++;
6425 }
6426 else
6427 tmpmode = DImode;
6428 break;
6429 case 4:
6430 gcc_assert (i == 0
6431 && regclass[1] == X86_64_SSEUP_CLASS
6432 && regclass[2] == X86_64_SSEUP_CLASS
6433 && regclass[3] == X86_64_SSEUP_CLASS);
6434 tmpmode = OImode;
6435 i += 3;
6436 break;
6437 default:
6438 gcc_unreachable ();
6439 }
6440 exp [nexps++]
6441 = gen_rtx_EXPR_LIST (VOIDmode,
6442 gen_rtx_REG (tmpmode,
6443 SSE_REGNO (sse_regno)),
6444 GEN_INT (pos*8));
6445 sse_regno++;
6446 break;
6447 default:
6448 gcc_unreachable ();
6449 }
6450 }
6451
6452 /* Empty aligned struct, union or class. */
6453 if (nexps == 0)
6454 return NULL;
6455
6456 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6457 for (i = 0; i < nexps; i++)
6458 XVECEXP (ret, 0, i) = exp [i];
6459 return ret;
6460 }
6461
6462 /* Update the data in CUM to advance over an argument of mode MODE
6463 and data type TYPE. (TYPE is null for libcalls where that information
6464 may not be available.) */
6465
6466 static void
6467 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6468 const_tree type, HOST_WIDE_INT bytes,
6469 HOST_WIDE_INT words)
6470 {
6471 switch (mode)
6472 {
6473 default:
6474 break;
6475
6476 case BLKmode:
6477 if (bytes < 0)
6478 break;
6479 /* FALLTHRU */
6480
6481 case DImode:
6482 case SImode:
6483 case HImode:
6484 case QImode:
6485 cum->words += words;
6486 cum->nregs -= words;
6487 cum->regno += words;
6488
6489 if (cum->nregs <= 0)
6490 {
6491 cum->nregs = 0;
6492 cum->regno = 0;
6493 }
6494 break;
6495
6496 case OImode:
6497 /* OImode shouldn't be used directly. */
6498 gcc_unreachable ();
6499
6500 case DFmode:
6501 if (cum->float_in_sse < 2)
6502 break;
6503 case SFmode:
6504 if (cum->float_in_sse < 1)
6505 break;
6506 /* FALLTHRU */
6507
6508 case V8SFmode:
6509 case V8SImode:
6510 case V32QImode:
6511 case V16HImode:
6512 case V4DFmode:
6513 case V4DImode:
6514 case TImode:
6515 case V16QImode:
6516 case V8HImode:
6517 case V4SImode:
6518 case V2DImode:
6519 case V4SFmode:
6520 case V2DFmode:
6521 if (!type || !AGGREGATE_TYPE_P (type))
6522 {
6523 cum->sse_words += words;
6524 cum->sse_nregs -= 1;
6525 cum->sse_regno += 1;
6526 if (cum->sse_nregs <= 0)
6527 {
6528 cum->sse_nregs = 0;
6529 cum->sse_regno = 0;
6530 }
6531 }
6532 break;
6533
6534 case V8QImode:
6535 case V4HImode:
6536 case V2SImode:
6537 case V2SFmode:
6538 case V1TImode:
6539 case V1DImode:
6540 if (!type || !AGGREGATE_TYPE_P (type))
6541 {
6542 cum->mmx_words += words;
6543 cum->mmx_nregs -= 1;
6544 cum->mmx_regno += 1;
6545 if (cum->mmx_nregs <= 0)
6546 {
6547 cum->mmx_nregs = 0;
6548 cum->mmx_regno = 0;
6549 }
6550 }
6551 break;
6552 }
6553 }
6554
6555 static void
6556 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6557 const_tree type, HOST_WIDE_INT words, bool named)
6558 {
6559 int int_nregs, sse_nregs;
6560
6561 /* Unnamed 256bit vector mode parameters are passed on stack. */
6562 if (!named && VALID_AVX256_REG_MODE (mode))
6563 return;
6564
6565 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6566 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6567 {
6568 cum->nregs -= int_nregs;
6569 cum->sse_nregs -= sse_nregs;
6570 cum->regno += int_nregs;
6571 cum->sse_regno += sse_nregs;
6572 }
6573 else
6574 {
6575 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6576 cum->words = (cum->words + align - 1) & ~(align - 1);
6577 cum->words += words;
6578 }
6579 }
6580
6581 static void
6582 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6583 HOST_WIDE_INT words)
6584 {
6585 /* Otherwise, this should be passed indirectly. */
6586 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6587
6588 cum->words += words;
6589 if (cum->nregs > 0)
6590 {
6591 cum->nregs -= 1;
6592 cum->regno += 1;
6593 }
6594 }
6595
6596 /* Update the data in CUM to advance over an argument of mode MODE and
6597 data type TYPE. (TYPE is null for libcalls where that information
6598 may not be available.) */
6599
6600 static void
6601 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6602 const_tree type, bool named)
6603 {
6604 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6605 HOST_WIDE_INT bytes, words;
6606
6607 if (mode == BLKmode)
6608 bytes = int_size_in_bytes (type);
6609 else
6610 bytes = GET_MODE_SIZE (mode);
6611 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6612
6613 if (type)
6614 mode = type_natural_mode (type, NULL);
6615
6616 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6617 function_arg_advance_ms_64 (cum, bytes, words);
6618 else if (TARGET_64BIT)
6619 function_arg_advance_64 (cum, mode, type, words, named);
6620 else
6621 function_arg_advance_32 (cum, mode, type, bytes, words);
6622 }
6623
6624 /* Define where to put the arguments to a function.
6625 Value is zero to push the argument on the stack,
6626 or a hard register in which to store the argument.
6627
6628 MODE is the argument's machine mode.
6629 TYPE is the data type of the argument (as a tree).
6630 This is null for libcalls where that information may
6631 not be available.
6632 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6633 the preceding args and about the function being called.
6634 NAMED is nonzero if this argument is a named parameter
6635 (otherwise it is an extra parameter matching an ellipsis). */
6636
6637 static rtx
6638 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6639 enum machine_mode orig_mode, const_tree type,
6640 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6641 {
6642 static bool warnedsse, warnedmmx;
6643
6644 /* Avoid the AL settings for the Unix64 ABI. */
6645 if (mode == VOIDmode)
6646 return constm1_rtx;
6647
6648 switch (mode)
6649 {
6650 default:
6651 break;
6652
6653 case BLKmode:
6654 if (bytes < 0)
6655 break;
6656 /* FALLTHRU */
6657 case DImode:
6658 case SImode:
6659 case HImode:
6660 case QImode:
6661 if (words <= cum->nregs)
6662 {
6663 int regno = cum->regno;
6664
6665 /* Fastcall allocates the first two DWORD (SImode) or
6666 smaller arguments to ECX and EDX if it isn't an
6667 aggregate type. */
6668 if (cum->fastcall)
6669 {
6670 if (mode == BLKmode
6671 || mode == DImode
6672 || (type && AGGREGATE_TYPE_P (type)))
6673 break;
6674
6675 /* ECX not EAX is the first allocated register. */
6676 if (regno == AX_REG)
6677 regno = CX_REG;
6678 }
6679 return gen_rtx_REG (mode, regno);
6680 }
6681 break;
6682
6683 case DFmode:
6684 if (cum->float_in_sse < 2)
6685 break;
6686 case SFmode:
6687 if (cum->float_in_sse < 1)
6688 break;
6689 /* FALLTHRU */
6690 case TImode:
6691 /* In 32bit, we pass TImode in xmm registers. */
6692 case V16QImode:
6693 case V8HImode:
6694 case V4SImode:
6695 case V2DImode:
6696 case V4SFmode:
6697 case V2DFmode:
6698 if (!type || !AGGREGATE_TYPE_P (type))
6699 {
6700 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6701 {
6702 warnedsse = true;
6703 warning (0, "SSE vector argument without SSE enabled "
6704 "changes the ABI");
6705 }
6706 if (cum->sse_nregs)
6707 return gen_reg_or_parallel (mode, orig_mode,
6708 cum->sse_regno + FIRST_SSE_REG);
6709 }
6710 break;
6711
6712 case OImode:
6713 /* OImode shouldn't be used directly. */
6714 gcc_unreachable ();
6715
6716 case V8SFmode:
6717 case V8SImode:
6718 case V32QImode:
6719 case V16HImode:
6720 case V4DFmode:
6721 case V4DImode:
6722 if (!type || !AGGREGATE_TYPE_P (type))
6723 {
6724 if (cum->sse_nregs)
6725 return gen_reg_or_parallel (mode, orig_mode,
6726 cum->sse_regno + FIRST_SSE_REG);
6727 }
6728 break;
6729
6730 case V8QImode:
6731 case V4HImode:
6732 case V2SImode:
6733 case V2SFmode:
6734 case V1TImode:
6735 case V1DImode:
6736 if (!type || !AGGREGATE_TYPE_P (type))
6737 {
6738 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6739 {
6740 warnedmmx = true;
6741 warning (0, "MMX vector argument without MMX enabled "
6742 "changes the ABI");
6743 }
6744 if (cum->mmx_nregs)
6745 return gen_reg_or_parallel (mode, orig_mode,
6746 cum->mmx_regno + FIRST_MMX_REG);
6747 }
6748 break;
6749 }
6750
6751 return NULL_RTX;
6752 }
6753
6754 static rtx
6755 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6756 enum machine_mode orig_mode, const_tree type, bool named)
6757 {
6758 /* Handle a hidden AL argument containing number of registers
6759 for varargs x86-64 functions. */
6760 if (mode == VOIDmode)
6761 return GEN_INT (cum->maybe_vaarg
6762 ? (cum->sse_nregs < 0
6763 ? X86_64_SSE_REGPARM_MAX
6764 : cum->sse_regno)
6765 : -1);
6766
6767 switch (mode)
6768 {
6769 default:
6770 break;
6771
6772 case V8SFmode:
6773 case V8SImode:
6774 case V32QImode:
6775 case V16HImode:
6776 case V4DFmode:
6777 case V4DImode:
6778 /* Unnamed 256bit vector mode parameters are passed on stack. */
6779 if (!named)
6780 return NULL;
6781 break;
6782 }
6783
6784 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6785 cum->sse_nregs,
6786 &x86_64_int_parameter_registers [cum->regno],
6787 cum->sse_regno);
6788 }
6789
6790 static rtx
6791 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6792 enum machine_mode orig_mode, bool named,
6793 HOST_WIDE_INT bytes)
6794 {
6795 unsigned int regno;
6796
6797 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6798 We use the value -2 to specify that the current function call is MS ABI. */
6799 if (mode == VOIDmode)
6800 return GEN_INT (-2);
6801
6802 /* If we've run out of registers, it goes on the stack. */
6803 if (cum->nregs == 0)
6804 return NULL_RTX;
6805
6806 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6807
6808 /* Only floating point modes are passed in anything but integer regs. */
6809 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6810 {
6811 if (named)
6812 regno = cum->regno + FIRST_SSE_REG;
6813 else
6814 {
6815 rtx t1, t2;
6816
6817 /* Unnamed floating parameters are passed in both the
6818 SSE and integer registers. */
6819 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6820 t2 = gen_rtx_REG (mode, regno);
6821 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6822 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6823 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6824 }
6825 }
6826 /* Handle aggregate types passed in registers. */
6827 if (orig_mode == BLKmode)
6828 {
6829 if (bytes > 0 && bytes <= 8)
6830 mode = (bytes > 4 ? DImode : SImode);
6831 if (mode == BLKmode)
6832 mode = DImode;
6833 }
6834
6835 return gen_reg_or_parallel (mode, orig_mode, regno);
6836 }
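
/* Illustrative sketch, not part of the port proper: function_arg_ms_64
   above assigns one register per parameter slot, integer slots from
   x86_64_ms_abi_int_parameter_registers (RCX, RDX, R8, R9) and named
   SF/DFmode slots to the XMM register of the same index.  The prototype
   below is hypothetical and only documents that slot-by-slot mapping;
   it is kept under #if 0 so it has no effect on the build.  */
#if 0
void example_ms_abi_args (int a,     /* slot 0: RCX  */
                          double b,  /* slot 1: XMM1 */
                          void *c,   /* slot 2: R8   */
                          float d);  /* slot 3: XMM3 */
#endif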
6837
6838 /* Return where to put the arguments to a function.
6839 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6840
6841 MODE is the argument's machine mode. TYPE is the data type of the
6842 argument. It is null for libcalls where that information may not be
6843 available. CUM gives information about the preceding args and about
6844 the function being called. NAMED is nonzero if this argument is a
6845 named parameter (otherwise it is an extra parameter matching an
6846 ellipsis). */
6847
6848 static rtx
6849 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6850 const_tree type, bool named)
6851 {
6852 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6853 enum machine_mode mode = omode;
6854 HOST_WIDE_INT bytes, words;
6855 rtx arg;
6856
6857 if (mode == BLKmode)
6858 bytes = int_size_in_bytes (type);
6859 else
6860 bytes = GET_MODE_SIZE (mode);
6861 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6862
6863 /* To simplify the code below, represent vector types with a vector mode
6864 even if MMX/SSE are not active. */
6865 if (type && TREE_CODE (type) == VECTOR_TYPE)
6866 mode = type_natural_mode (type, cum);
6867
6868 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6869 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6870 else if (TARGET_64BIT)
6871 arg = function_arg_64 (cum, mode, omode, type, named);
6872 else
6873 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6874
6875 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6876 {
6877 /* This argument uses 256bit AVX modes. */
6878 if (cum->caller)
6879 cfun->machine->callee_pass_avx256_p = true;
6880 else
6881 cfun->machine->caller_pass_avx256_p = true;
6882 }
6883
6884 return arg;
6885 }
6886
6887 /* A C expression that indicates when an argument must be passed by
6888 reference. If nonzero for an argument, a copy of that argument is
6889 made in memory and a pointer to the argument is passed instead of
6890 the argument itself. The pointer is passed in whatever way is
6891 appropriate for passing a pointer to that type. */
6892
6893 static bool
6894 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6895 enum machine_mode mode ATTRIBUTE_UNUSED,
6896 const_tree type, bool named ATTRIBUTE_UNUSED)
6897 {
6898 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6899
6900 /* See Windows x64 Software Convention. */
6901 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6902 {
6903 int msize = (int) GET_MODE_SIZE (mode);
6904 if (type)
6905 {
6906 /* Arrays are passed by reference. */
6907 if (TREE_CODE (type) == ARRAY_TYPE)
6908 return true;
6909
6910 if (AGGREGATE_TYPE_P (type))
6911 {
6912 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6913 are passed by reference. */
6914 msize = int_size_in_bytes (type);
6915 }
6916 }
6917
6918 /* __m128 is passed by reference. */
6919 switch (msize) {
6920 case 1: case 2: case 4: case 8:
6921 break;
6922 default:
6923 return true;
6924 }
6925 }
6926 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6927 return true;
6928
6929 return false;
6930 }
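
/* A minimal illustration of the Win64 rules above, using hypothetical
   types: only sizes 1, 2, 4 and 8 are passed by value, everything else
   and all arrays are passed by reference.  Kept under #if 0 so it has
   no effect on the build.  */
#if 0
struct by_value_8  { char c[8];  };   /* size 8: passed by value         */
struct by_ref_12   { char c[12]; };   /* size 12: passed by reference    */
typedef int by_ref_array[4];          /* arrays: always passed by reference */
#endif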
6931
6932 /* Return true when TYPE should be 128bit aligned for 32bit argument
6933 passing ABI. XXX: This function is obsolete and is only used for
6934 checking psABI compatibility with previous versions of GCC. */
6935
6936 static bool
6937 ix86_compat_aligned_value_p (const_tree type)
6938 {
6939 enum machine_mode mode = TYPE_MODE (type);
6940 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6941 || mode == TDmode
6942 || mode == TFmode
6943 || mode == TCmode)
6944 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6945 return true;
6946 if (TYPE_ALIGN (type) < 128)
6947 return false;
6948
6949 if (AGGREGATE_TYPE_P (type))
6950 {
6951 /* Walk the aggregates recursively. */
6952 switch (TREE_CODE (type))
6953 {
6954 case RECORD_TYPE:
6955 case UNION_TYPE:
6956 case QUAL_UNION_TYPE:
6957 {
6958 tree field;
6959
6960 /* Walk all the structure fields. */
6961 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6962 {
6963 if (TREE_CODE (field) == FIELD_DECL
6964 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6965 return true;
6966 }
6967 break;
6968 }
6969
6970 case ARRAY_TYPE:
6971 /* Just for use if some language passes arrays by value. */
6972 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6973 return true;
6974 break;
6975
6976 default:
6977 gcc_unreachable ();
6978 }
6979 }
6980 return false;
6981 }
6982
6983 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6984 XXX: This function is obsolete and is only used for checking psABI
6985 compatibility with previous versions of GCC. */
6986
6987 static unsigned int
6988 ix86_compat_function_arg_boundary (enum machine_mode mode,
6989 const_tree type, unsigned int align)
6990 {
6991 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6992 natural boundaries. */
6993 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6994 {
6995 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6996 make an exception for SSE modes since these require 128bit
6997 alignment.
6998
6999 The handling here differs from field_alignment. ICC aligns MMX
7000 arguments to 4 byte boundaries, while structure fields are aligned
7001 to 8 byte boundaries. */
7002 if (!type)
7003 {
7004 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7005 align = PARM_BOUNDARY;
7006 }
7007 else
7008 {
7009 if (!ix86_compat_aligned_value_p (type))
7010 align = PARM_BOUNDARY;
7011 }
7012 }
7013 if (align > BIGGEST_ALIGNMENT)
7014 align = BIGGEST_ALIGNMENT;
7015 return align;
7016 }
7017
7018 /* Return true when TYPE should be 128bit aligned for 32bit argument
7019 passing ABI. */
7020
7021 static bool
7022 ix86_contains_aligned_value_p (const_tree type)
7023 {
7024 enum machine_mode mode = TYPE_MODE (type);
7025
7026 if (mode == XFmode || mode == XCmode)
7027 return false;
7028
7029 if (TYPE_ALIGN (type) < 128)
7030 return false;
7031
7032 if (AGGREGATE_TYPE_P (type))
7033 {
7034 /* Walk the aggregates recursively. */
7035 switch (TREE_CODE (type))
7036 {
7037 case RECORD_TYPE:
7038 case UNION_TYPE:
7039 case QUAL_UNION_TYPE:
7040 {
7041 tree field;
7042
7043 /* Walk all the structure fields. */
7044 for (field = TYPE_FIELDS (type);
7045 field;
7046 field = DECL_CHAIN (field))
7047 {
7048 if (TREE_CODE (field) == FIELD_DECL
7049 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7050 return true;
7051 }
7052 break;
7053 }
7054
7055 case ARRAY_TYPE:
7056 /* Just for use if some language passes arrays by value. */
7057 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7058 return true;
7059 break;
7060
7061 default:
7062 gcc_unreachable ();
7063 }
7064 }
7065 else
7066 return TYPE_ALIGN (type) >= 128;
7067
7068 return false;
7069 }
7070
7071 /* Gives the alignment boundary, in bits, of an argument with the
7072 specified mode and type. */
7073
7074 static unsigned int
7075 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7076 {
7077 unsigned int align;
7078 if (type)
7079 {
7080 /* Since the main variant type is used for the call, convert the
7081 type to its main variant. */
7082 type = TYPE_MAIN_VARIANT (type);
7083 align = TYPE_ALIGN (type);
7084 }
7085 else
7086 align = GET_MODE_ALIGNMENT (mode);
7087 if (align < PARM_BOUNDARY)
7088 align = PARM_BOUNDARY;
7089 else
7090 {
7091 static bool warned;
7092 unsigned int saved_align = align;
7093
7094 if (!TARGET_64BIT)
7095 {
7096 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7097 if (!type)
7098 {
7099 if (mode == XFmode || mode == XCmode)
7100 align = PARM_BOUNDARY;
7101 }
7102 else if (!ix86_contains_aligned_value_p (type))
7103 align = PARM_BOUNDARY;
7104
7105 if (align < 128)
7106 align = PARM_BOUNDARY;
7107 }
7108
7109 if (warn_psabi
7110 && !warned
7111 && align != ix86_compat_function_arg_boundary (mode, type,
7112 saved_align))
7113 {
7114 warned = true;
7115 inform (input_location,
7116 "The ABI for passing parameters with %d-byte"
7117 " alignment has changed in GCC 4.6",
7118 align / BITS_PER_UNIT);
7119 }
7120 }
7121
7122 return align;
7123 }
7124
7125 /* Return true if REGNO is a possible register number of a function value. */
7126
7127 static bool
7128 ix86_function_value_regno_p (const unsigned int regno)
7129 {
7130 switch (regno)
7131 {
7132 case AX_REG:
7133 return true;
7134
7135 case FIRST_FLOAT_REG:
7136 /* TODO: The function should depend on current function ABI but
7137 builtins.c would need updating then. Therefore we use the
7138 default ABI. */
7139 if (TARGET_64BIT && ix86_abi == MS_ABI)
7140 return false;
7141 return TARGET_FLOAT_RETURNS_IN_80387;
7142
7143 case FIRST_SSE_REG:
7144 return TARGET_SSE;
7145
7146 case FIRST_MMX_REG:
7147 if (TARGET_MACHO || TARGET_64BIT)
7148 return false;
7149 return TARGET_MMX;
7150 }
7151
7152 return false;
7153 }
7154
7155 /* Define how to find the value returned by a function.
7156 VALTYPE is the data type of the value (as a tree).
7157 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7158 otherwise, FUNC is 0. */
7159
7160 static rtx
7161 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7162 const_tree fntype, const_tree fn)
7163 {
7164 unsigned int regno;
7165
7166 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7167 we normally prevent this case when mmx is not available. However
7168 some ABIs may require the result to be returned like DImode. */
7169 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7170 regno = FIRST_MMX_REG;
7171
7172 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7173 we prevent this case when sse is not available. However some ABIs
7174 may require the result to be returned like integer TImode. */
7175 else if (mode == TImode
7176 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7177 regno = FIRST_SSE_REG;
7178
7179 /* 32-byte vector modes in %ymm0. */
7180 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7181 regno = FIRST_SSE_REG;
7182
7183 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7184 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7185 regno = FIRST_FLOAT_REG;
7186 else
7187 /* Most things go in %eax. */
7188 regno = AX_REG;
7189
7190 /* Override FP return register with %xmm0 for local functions when
7191 SSE math is enabled or for functions with sseregparm attribute. */
7192 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7193 {
7194 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7195 if ((sse_level >= 1 && mode == SFmode)
7196 || (sse_level == 2 && mode == DFmode))
7197 regno = FIRST_SSE_REG;
7198 }
7199
7200 /* OImode shouldn't be used directly. */
7201 gcc_assert (mode != OImode);
7202
7203 return gen_rtx_REG (orig_mode, regno);
7204 }
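
/* Summary of the 32-bit return-register choices made above: most values
   come back in %eax, x87 floats in %st(0) unless -mno-fp-ret-in-387,
   8-byte vectors in %mm0, TImode and 16-byte vectors in %xmm0, 32-byte
   vectors in %ymm0, and SF/DFmode results in %xmm0 when
   ix86_function_sseregparm selects SSE return for the function.  */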
7205
7206 static rtx
7207 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7208 const_tree valtype)
7209 {
7210 rtx ret;
7211
7212 /* Handle libcalls, which don't provide a type node. */
7213 if (valtype == NULL)
7214 {
7215 unsigned int regno;
7216
7217 switch (mode)
7218 {
7219 case SFmode:
7220 case SCmode:
7221 case DFmode:
7222 case DCmode:
7223 case TFmode:
7224 case SDmode:
7225 case DDmode:
7226 case TDmode:
7227 regno = FIRST_SSE_REG;
7228 break;
7229 case XFmode:
7230 case XCmode:
7231 regno = FIRST_FLOAT_REG;
7232 break;
7233 case TCmode:
7234 return NULL;
7235 default:
7236 regno = AX_REG;
7237 }
7238
7239 return gen_rtx_REG (mode, regno);
7240 }
7241 else if (POINTER_TYPE_P (valtype))
7242 {
7243 /* Pointers are always returned in word_mode. */
7244 mode = word_mode;
7245 }
7246
7247 ret = construct_container (mode, orig_mode, valtype, 1,
7248 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7249 x86_64_int_return_registers, 0);
7250
7251 /* For zero-sized structures, construct_container returns NULL, but we
7252 need to keep the rest of the compiler happy by returning a meaningful value. */
7253 if (!ret)
7254 ret = gen_rtx_REG (orig_mode, AX_REG);
7255
7256 return ret;
7257 }
7258
7259 static rtx
7260 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7261 {
7262 unsigned int regno = AX_REG;
7263
7264 if (TARGET_SSE)
7265 {
7266 switch (GET_MODE_SIZE (mode))
7267 {
7268 case 16:
7269 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7270 && !COMPLEX_MODE_P (mode))
7271 regno = FIRST_SSE_REG;
7272 break;
7273 case 8:
7274 case 4:
7275 if (mode == SFmode || mode == DFmode)
7276 regno = FIRST_SSE_REG;
7277 break;
7278 default:
7279 break;
7280 }
7281 }
7282 return gen_rtx_REG (orig_mode, regno);
7283 }
7284
7285 static rtx
7286 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7287 enum machine_mode orig_mode, enum machine_mode mode)
7288 {
7289 const_tree fn, fntype;
7290
7291 fn = NULL_TREE;
7292 if (fntype_or_decl && DECL_P (fntype_or_decl))
7293 fn = fntype_or_decl;
7294 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7295
7296 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7297 return function_value_ms_64 (orig_mode, mode);
7298 else if (TARGET_64BIT)
7299 return function_value_64 (orig_mode, mode, valtype);
7300 else
7301 return function_value_32 (orig_mode, mode, fntype, fn);
7302 }
7303
7304 static rtx
7305 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7306 bool outgoing ATTRIBUTE_UNUSED)
7307 {
7308 enum machine_mode mode, orig_mode;
7309
7310 orig_mode = TYPE_MODE (valtype);
7311 mode = type_natural_mode (valtype, NULL);
7312 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7313 }
7314
7315 /* Pointer function arguments and return values are promoted to
7316 word_mode. */
7317
7318 static enum machine_mode
7319 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7320 int *punsignedp, const_tree fntype,
7321 int for_return)
7322 {
7323 if (type != NULL_TREE && POINTER_TYPE_P (type))
7324 {
7325 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7326 return word_mode;
7327 }
7328 return default_promote_function_mode (type, mode, punsignedp, fntype,
7329 for_return);
7330 }
7331
7332 rtx
7333 ix86_libcall_value (enum machine_mode mode)
7334 {
7335 return ix86_function_value_1 (NULL, NULL, mode, mode);
7336 }
7337
7338 /* Return true iff type is returned in memory. */
7339
7340 static bool ATTRIBUTE_UNUSED
7341 return_in_memory_32 (const_tree type, enum machine_mode mode)
7342 {
7343 HOST_WIDE_INT size;
7344
7345 if (mode == BLKmode)
7346 return true;
7347
7348 size = int_size_in_bytes (type);
7349
7350 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7351 return false;
7352
7353 if (VECTOR_MODE_P (mode) || mode == TImode)
7354 {
7355 /* User-created vectors small enough to fit in EAX. */
7356 if (size < 8)
7357 return false;
7358
7359 /* MMX/3dNow values are returned in MM0,
7360 except when it doesn't exist or the ABI prescribes otherwise. */
7361 if (size == 8)
7362 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7363
7364 /* SSE values are returned in XMM0, except when it doesn't exist. */
7365 if (size == 16)
7366 return !TARGET_SSE;
7367
7368 /* AVX values are returned in YMM0, except when it doesn't exist. */
7369 if (size == 32)
7370 return !TARGET_AVX;
7371 }
7372
7373 if (mode == XFmode)
7374 return false;
7375
7376 if (size > 12)
7377 return true;
7378
7379 /* OImode shouldn't be used directly. */
7380 gcc_assert (mode != OImode);
7381
7382 return false;
7383 }
7384
7385 static bool ATTRIBUTE_UNUSED
7386 return_in_memory_64 (const_tree type, enum machine_mode mode)
7387 {
7388 int needed_intregs, needed_sseregs;
7389 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7390 }
7391
7392 static bool ATTRIBUTE_UNUSED
7393 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7394 {
7395 HOST_WIDE_INT size = int_size_in_bytes (type);
7396
7397 /* __m128 is returned in xmm0. */
7398 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7399 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7400 return false;
7401
7402 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7403 return size != 1 && size != 2 && size != 4 && size != 8;
7404 }
7405
7406 static bool
7407 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7408 {
7409 #ifdef SUBTARGET_RETURN_IN_MEMORY
7410 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7411 #else
7412 const enum machine_mode mode = type_natural_mode (type, NULL);
7413
7414 if (TARGET_64BIT)
7415 {
7416 if (ix86_function_type_abi (fntype) == MS_ABI)
7417 return return_in_memory_ms_64 (type, mode);
7418 else
7419 return return_in_memory_64 (type, mode);
7420 }
7421 else
7422 return return_in_memory_32 (type, mode);
7423 #endif
7424 }
7425
7426 /* When returning SSE vector types, we have a choice of either
7427 (1) being abi incompatible with a -march switch, or
7428 (2) generating an error.
7429 Given no good solution, I think the safest thing is one warning.
7430 The user won't be able to use -Werror, but....
7431
7432 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7433 called in response to actually generating a caller or callee that
7434 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7435 via aggregate_value_p for general type probing from tree-ssa. */
7436
7437 static rtx
7438 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7439 {
7440 static bool warnedsse, warnedmmx;
7441
7442 if (!TARGET_64BIT && type)
7443 {
7444 /* Look at the return type of the function, not the function type. */
7445 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7446
7447 if (!TARGET_SSE && !warnedsse)
7448 {
7449 if (mode == TImode
7450 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7451 {
7452 warnedsse = true;
7453 warning (0, "SSE vector return without SSE enabled "
7454 "changes the ABI");
7455 }
7456 }
7457
7458 if (!TARGET_MMX && !warnedmmx)
7459 {
7460 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7461 {
7462 warnedmmx = true;
7463 warning (0, "MMX vector return without MMX enabled "
7464 "changes the ABI");
7465 }
7466 }
7467 }
7468
7469 return NULL;
7470 }
7471
7472 \f
7473 /* Create the va_list data type. */
7474
7475 /* Returns the calling convention specific va_list data type.
7476 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7477
7478 static tree
7479 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7480 {
7481 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7482
7483 /* For i386 we use a plain pointer to the argument area. */
7484 if (!TARGET_64BIT || abi == MS_ABI)
7485 return build_pointer_type (char_type_node);
7486
7487 record = lang_hooks.types.make_type (RECORD_TYPE);
7488 type_decl = build_decl (BUILTINS_LOCATION,
7489 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7490
7491 f_gpr = build_decl (BUILTINS_LOCATION,
7492 FIELD_DECL, get_identifier ("gp_offset"),
7493 unsigned_type_node);
7494 f_fpr = build_decl (BUILTINS_LOCATION,
7495 FIELD_DECL, get_identifier ("fp_offset"),
7496 unsigned_type_node);
7497 f_ovf = build_decl (BUILTINS_LOCATION,
7498 FIELD_DECL, get_identifier ("overflow_arg_area"),
7499 ptr_type_node);
7500 f_sav = build_decl (BUILTINS_LOCATION,
7501 FIELD_DECL, get_identifier ("reg_save_area"),
7502 ptr_type_node);
7503
7504 va_list_gpr_counter_field = f_gpr;
7505 va_list_fpr_counter_field = f_fpr;
7506
7507 DECL_FIELD_CONTEXT (f_gpr) = record;
7508 DECL_FIELD_CONTEXT (f_fpr) = record;
7509 DECL_FIELD_CONTEXT (f_ovf) = record;
7510 DECL_FIELD_CONTEXT (f_sav) = record;
7511
7512 TYPE_STUB_DECL (record) = type_decl;
7513 TYPE_NAME (record) = type_decl;
7514 TYPE_FIELDS (record) = f_gpr;
7515 DECL_CHAIN (f_gpr) = f_fpr;
7516 DECL_CHAIN (f_fpr) = f_ovf;
7517 DECL_CHAIN (f_ovf) = f_sav;
7518
7519 layout_type (record);
7520
7521 /* The correct type is an array type of one element. */
7522 return build_array_type (record, build_index_type (size_zero_node));
7523 }
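
/* For reference, the record built above corresponds to the SysV x86-64
   va_list layout.  A C sketch of the equivalent declaration follows;
   the type names are hypothetical and the block is kept under #if 0
   so it has no effect on the build.  */
#if 0
typedef struct
{
  unsigned int gp_offset;       /* offset into reg_save_area for the next GPR     */
  unsigned int fp_offset;       /* offset into reg_save_area for the next SSE reg */
  void *overflow_arg_area;      /* next stacked argument                          */
  void *reg_save_area;          /* register save area set up by the prologue      */
} sysv_va_list_tag;
typedef sysv_va_list_tag sysv_va_list[1];   /* array type of one element */
#endif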
7524
7525 /* Set up the builtin va_list data type and for 64-bit the additional
7526 calling convention specific va_list data types. */
7527
7528 static tree
7529 ix86_build_builtin_va_list (void)
7530 {
7531 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7532
7533 /* Initialize abi specific va_list builtin types. */
7534 if (TARGET_64BIT)
7535 {
7536 tree t;
7537 if (ix86_abi == MS_ABI)
7538 {
7539 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7540 if (TREE_CODE (t) != RECORD_TYPE)
7541 t = build_variant_type_copy (t);
7542 sysv_va_list_type_node = t;
7543 }
7544 else
7545 {
7546 t = ret;
7547 if (TREE_CODE (t) != RECORD_TYPE)
7548 t = build_variant_type_copy (t);
7549 sysv_va_list_type_node = t;
7550 }
7551 if (ix86_abi != MS_ABI)
7552 {
7553 t = ix86_build_builtin_va_list_abi (MS_ABI);
7554 if (TREE_CODE (t) != RECORD_TYPE)
7555 t = build_variant_type_copy (t);
7556 ms_va_list_type_node = t;
7557 }
7558 else
7559 {
7560 t = ret;
7561 if (TREE_CODE (t) != RECORD_TYPE)
7562 t = build_variant_type_copy (t);
7563 ms_va_list_type_node = t;
7564 }
7565 }
7566
7567 return ret;
7568 }
7569
7570 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7571
7572 static void
7573 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7574 {
7575 rtx save_area, mem;
7576 alias_set_type set;
7577 int i, max;
7578
7579 /* GPR size of varargs save area. */
7580 if (cfun->va_list_gpr_size)
7581 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7582 else
7583 ix86_varargs_gpr_size = 0;
7584
7585 /* FPR size of varargs save area. We don't need it if we don't pass
7586 anything in SSE registers. */
7587 if (TARGET_SSE && cfun->va_list_fpr_size)
7588 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7589 else
7590 ix86_varargs_fpr_size = 0;
7591
7592 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7593 return;
7594
7595 save_area = frame_pointer_rtx;
7596 set = get_varargs_alias_set ();
7597
7598 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7599 if (max > X86_64_REGPARM_MAX)
7600 max = X86_64_REGPARM_MAX;
7601
7602 for (i = cum->regno; i < max; i++)
7603 {
7604 mem = gen_rtx_MEM (word_mode,
7605 plus_constant (save_area, i * UNITS_PER_WORD));
7606 MEM_NOTRAP_P (mem) = 1;
7607 set_mem_alias_set (mem, set);
7608 emit_move_insn (mem,
7609 gen_rtx_REG (word_mode,
7610 x86_64_int_parameter_registers[i]));
7611 }
7612
7613 if (ix86_varargs_fpr_size)
7614 {
7615 enum machine_mode smode;
7616 rtx label, test;
7617
7618 /* Now emit code to save SSE registers. The AX parameter contains the
7619 number of SSE parameter registers used to call this function, though all we
7620 actually check here is the zero/non-zero status. */
7621
7622 label = gen_label_rtx ();
7623 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7624 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7625 label));
7626
7627 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7628 we used movdqa (i.e. TImode) instead? Perhaps even better would
7629 be if we could determine the real mode of the data, via a hook
7630 into pass_stdarg. Ignore all that for now. */
7631 smode = V4SFmode;
7632 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7633 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7634
7635 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7636 if (max > X86_64_SSE_REGPARM_MAX)
7637 max = X86_64_SSE_REGPARM_MAX;
7638
7639 for (i = cum->sse_regno; i < max; ++i)
7640 {
7641 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7642 mem = gen_rtx_MEM (smode, mem);
7643 MEM_NOTRAP_P (mem) = 1;
7644 set_mem_alias_set (mem, set);
7645 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7646
7647 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7648 }
7649
7650 emit_label (label);
7651 }
7652 }
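
/* The resulting register save area layout, as produced by the code above:
   X86_64_REGPARM_MAX * 8 bytes of GPR slots (rdi, rsi, rdx, rcx, r8, r9)
   followed by X86_64_SSE_REGPARM_MAX * 16 bytes of SSE slots, all relative
   to frame_pointer_rtx; the SSE stores are skipped at run time when the
   hidden AL argument is zero.  */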
7653
7654 static void
7655 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7656 {
7657 alias_set_type set = get_varargs_alias_set ();
7658 int i;
7659
7660 /* Reset to zero, as there might be a SysV va_arg used
7661 before. */
7662 ix86_varargs_gpr_size = 0;
7663 ix86_varargs_fpr_size = 0;
7664
7665 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7666 {
7667 rtx reg, mem;
7668
7669 mem = gen_rtx_MEM (Pmode,
7670 plus_constant (virtual_incoming_args_rtx,
7671 i * UNITS_PER_WORD));
7672 MEM_NOTRAP_P (mem) = 1;
7673 set_mem_alias_set (mem, set);
7674
7675 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7676 emit_move_insn (mem, reg);
7677 }
7678 }
7679
7680 static void
7681 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7682 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7683 int no_rtl)
7684 {
7685 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7686 CUMULATIVE_ARGS next_cum;
7687 tree fntype;
7688
7689 /* This argument doesn't appear to be used anymore. Which is good,
7690 because the old code here didn't suppress rtl generation. */
7691 gcc_assert (!no_rtl);
7692
7693 if (!TARGET_64BIT)
7694 return;
7695
7696 fntype = TREE_TYPE (current_function_decl);
7697
7698 /* For varargs, we do not want to skip the dummy va_dcl argument.
7699 For stdargs, we do want to skip the last named argument. */
7700 next_cum = *cum;
7701 if (stdarg_p (fntype))
7702 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7703 true);
7704
7705 if (cum->call_abi == MS_ABI)
7706 setup_incoming_varargs_ms_64 (&next_cum);
7707 else
7708 setup_incoming_varargs_64 (&next_cum);
7709 }
7710
7711 /* Return true if TYPE is a va_list that is a plain char * pointer. */
7712
7713 static bool
7714 is_va_list_char_pointer (tree type)
7715 {
7716 tree canonic;
7717
7718 /* For 32-bit it is always true. */
7719 if (!TARGET_64BIT)
7720 return true;
7721 canonic = ix86_canonical_va_list_type (type);
7722 return (canonic == ms_va_list_type_node
7723 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7724 }
7725
7726 /* Implement va_start. */
7727
7728 static void
7729 ix86_va_start (tree valist, rtx nextarg)
7730 {
7731 HOST_WIDE_INT words, n_gpr, n_fpr;
7732 tree f_gpr, f_fpr, f_ovf, f_sav;
7733 tree gpr, fpr, ovf, sav, t;
7734 tree type;
7735 rtx ovf_rtx;
7736
7737 if (flag_split_stack
7738 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7739 {
7740 unsigned int scratch_regno;
7741
7742 /* When we are splitting the stack, we can't refer to the stack
7743 arguments using internal_arg_pointer, because they may be on
7744 the old stack. The split stack prologue will arrange to
7745 leave a pointer to the old stack arguments in a scratch
7746 register, which we here copy to a pseudo-register. The split
7747 stack prologue can't set the pseudo-register directly because
7748 it (the prologue) runs before any registers have been saved. */
7749
7750 scratch_regno = split_stack_prologue_scratch_regno ();
7751 if (scratch_regno != INVALID_REGNUM)
7752 {
7753 rtx reg, seq;
7754
7755 reg = gen_reg_rtx (Pmode);
7756 cfun->machine->split_stack_varargs_pointer = reg;
7757
7758 start_sequence ();
7759 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7760 seq = get_insns ();
7761 end_sequence ();
7762
7763 push_topmost_sequence ();
7764 emit_insn_after (seq, entry_of_function ());
7765 pop_topmost_sequence ();
7766 }
7767 }
7768
7769 /* Only 64bit target needs something special. */
7770 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7771 {
7772 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7773 std_expand_builtin_va_start (valist, nextarg);
7774 else
7775 {
7776 rtx va_r, next;
7777
7778 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7779 next = expand_binop (ptr_mode, add_optab,
7780 cfun->machine->split_stack_varargs_pointer,
7781 crtl->args.arg_offset_rtx,
7782 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7783 convert_move (va_r, next, 0);
7784 }
7785 return;
7786 }
7787
7788 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7789 f_fpr = DECL_CHAIN (f_gpr);
7790 f_ovf = DECL_CHAIN (f_fpr);
7791 f_sav = DECL_CHAIN (f_ovf);
7792
7793 valist = build_simple_mem_ref (valist);
7794 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7795 /* The following should be folded into the MEM_REF offset. */
7796 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7797 f_gpr, NULL_TREE);
7798 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7799 f_fpr, NULL_TREE);
7800 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7801 f_ovf, NULL_TREE);
7802 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7803 f_sav, NULL_TREE);
7804
7805 /* Count number of gp and fp argument registers used. */
7806 words = crtl->args.info.words;
7807 n_gpr = crtl->args.info.regno;
7808 n_fpr = crtl->args.info.sse_regno;
7809
7810 if (cfun->va_list_gpr_size)
7811 {
7812 type = TREE_TYPE (gpr);
7813 t = build2 (MODIFY_EXPR, type,
7814 gpr, build_int_cst (type, n_gpr * 8));
7815 TREE_SIDE_EFFECTS (t) = 1;
7816 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7817 }
7818
7819 if (TARGET_SSE && cfun->va_list_fpr_size)
7820 {
7821 type = TREE_TYPE (fpr);
7822 t = build2 (MODIFY_EXPR, type, fpr,
7823 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7824 TREE_SIDE_EFFECTS (t) = 1;
7825 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7826 }
7827
7828 /* Find the overflow area. */
7829 type = TREE_TYPE (ovf);
7830 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7831 ovf_rtx = crtl->args.internal_arg_pointer;
7832 else
7833 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7834 t = make_tree (type, ovf_rtx);
7835 if (words != 0)
7836 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7837 t = build2 (MODIFY_EXPR, type, ovf, t);
7838 TREE_SIDE_EFFECTS (t) = 1;
7839 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7840
7841 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7842 {
7843 /* Find the register save area.
7844 The function prologue saves it right above the stack frame. */
7845 type = TREE_TYPE (sav);
7846 t = make_tree (type, frame_pointer_rtx);
7847 if (!ix86_varargs_gpr_size)
7848 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7849 t = build2 (MODIFY_EXPR, type, sav, t);
7850 TREE_SIDE_EFFECTS (t) = 1;
7851 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7852 }
7853 }
7854
7855 /* Implement va_arg. */
7856
7857 static tree
7858 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7859 gimple_seq *post_p)
7860 {
7861 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7862 tree f_gpr, f_fpr, f_ovf, f_sav;
7863 tree gpr, fpr, ovf, sav, t;
7864 int size, rsize;
7865 tree lab_false, lab_over = NULL_TREE;
7866 tree addr, t2;
7867 rtx container;
7868 int indirect_p = 0;
7869 tree ptrtype;
7870 enum machine_mode nat_mode;
7871 unsigned int arg_boundary;
7872
7873 /* Only 64bit target needs something special. */
7874 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7875 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7876
7877 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7878 f_fpr = DECL_CHAIN (f_gpr);
7879 f_ovf = DECL_CHAIN (f_fpr);
7880 f_sav = DECL_CHAIN (f_ovf);
7881
7882 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7883 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7884 valist = build_va_arg_indirect_ref (valist);
7885 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7886 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7887 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7888
7889 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7890 if (indirect_p)
7891 type = build_pointer_type (type);
7892 size = int_size_in_bytes (type);
7893 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7894
7895 nat_mode = type_natural_mode (type, NULL);
7896 switch (nat_mode)
7897 {
7898 case V8SFmode:
7899 case V8SImode:
7900 case V32QImode:
7901 case V16HImode:
7902 case V4DFmode:
7903 case V4DImode:
7904 /* Unnamed 256bit vector mode parameters are passed on the stack. */
7905 if (!TARGET_64BIT_MS_ABI)
7906 {
7907 container = NULL;
7908 break;
7909 }
7910
7911 default:
7912 container = construct_container (nat_mode, TYPE_MODE (type),
7913 type, 0, X86_64_REGPARM_MAX,
7914 X86_64_SSE_REGPARM_MAX, intreg,
7915 0);
7916 break;
7917 }
7918
7919 /* Pull the value out of the saved registers. */
7920
7921 addr = create_tmp_var (ptr_type_node, "addr");
7922
7923 if (container)
7924 {
7925 int needed_intregs, needed_sseregs;
7926 bool need_temp;
7927 tree int_addr, sse_addr;
7928
7929 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7930 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7931
7932 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7933
7934 need_temp = (!REG_P (container)
7935 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7936 || TYPE_ALIGN (type) > 128));
7937
7938 /* In case we are passing a structure, verify that it is a consecutive block
7939 in the register save area. If not, we need to do moves. */
7940 if (!need_temp && !REG_P (container))
7941 {
7942 /* Verify that all registers are strictly consecutive. */
7943 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7944 {
7945 int i;
7946
7947 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7948 {
7949 rtx slot = XVECEXP (container, 0, i);
7950 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7951 || INTVAL (XEXP (slot, 1)) != i * 16)
7952 need_temp = 1;
7953 }
7954 }
7955 else
7956 {
7957 int i;
7958
7959 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7960 {
7961 rtx slot = XVECEXP (container, 0, i);
7962 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7963 || INTVAL (XEXP (slot, 1)) != i * 8)
7964 need_temp = 1;
7965 }
7966 }
7967 }
7968 if (!need_temp)
7969 {
7970 int_addr = addr;
7971 sse_addr = addr;
7972 }
7973 else
7974 {
7975 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7976 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7977 }
7978
7979 /* First ensure that we fit completely in registers. */
7980 if (needed_intregs)
7981 {
7982 t = build_int_cst (TREE_TYPE (gpr),
7983 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7984 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7985 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7986 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7987 gimplify_and_add (t, pre_p);
7988 }
7989 if (needed_sseregs)
7990 {
7991 t = build_int_cst (TREE_TYPE (fpr),
7992 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7993 + X86_64_REGPARM_MAX * 8);
7994 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7995 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7996 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7997 gimplify_and_add (t, pre_p);
7998 }
7999
8000 /* Compute index to start of area used for integer regs. */
8001 if (needed_intregs)
8002 {
8003 /* int_addr = gpr + sav; */
8004 t = fold_build_pointer_plus (sav, gpr);
8005 gimplify_assign (int_addr, t, pre_p);
8006 }
8007 if (needed_sseregs)
8008 {
8009 /* sse_addr = fpr + sav; */
8010 t = fold_build_pointer_plus (sav, fpr);
8011 gimplify_assign (sse_addr, t, pre_p);
8012 }
8013 if (need_temp)
8014 {
8015 int i, prev_size = 0;
8016 tree temp = create_tmp_var (type, "va_arg_tmp");
8017
8018 /* addr = &temp; */
8019 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8020 gimplify_assign (addr, t, pre_p);
8021
8022 for (i = 0; i < XVECLEN (container, 0); i++)
8023 {
8024 rtx slot = XVECEXP (container, 0, i);
8025 rtx reg = XEXP (slot, 0);
8026 enum machine_mode mode = GET_MODE (reg);
8027 tree piece_type;
8028 tree addr_type;
8029 tree daddr_type;
8030 tree src_addr, src;
8031 int src_offset;
8032 tree dest_addr, dest;
8033 int cur_size = GET_MODE_SIZE (mode);
8034
8035 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8036 prev_size = INTVAL (XEXP (slot, 1));
8037 if (prev_size + cur_size > size)
8038 {
8039 cur_size = size - prev_size;
8040 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8041 if (mode == BLKmode)
8042 mode = QImode;
8043 }
8044 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8045 if (mode == GET_MODE (reg))
8046 addr_type = build_pointer_type (piece_type);
8047 else
8048 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8049 true);
8050 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8051 true);
8052
8053 if (SSE_REGNO_P (REGNO (reg)))
8054 {
8055 src_addr = sse_addr;
8056 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8057 }
8058 else
8059 {
8060 src_addr = int_addr;
8061 src_offset = REGNO (reg) * 8;
8062 }
8063 src_addr = fold_convert (addr_type, src_addr);
8064 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8065
8066 dest_addr = fold_convert (daddr_type, addr);
8067 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8068 if (cur_size == GET_MODE_SIZE (mode))
8069 {
8070 src = build_va_arg_indirect_ref (src_addr);
8071 dest = build_va_arg_indirect_ref (dest_addr);
8072
8073 gimplify_assign (dest, src, pre_p);
8074 }
8075 else
8076 {
8077 tree copy
8078 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8079 3, dest_addr, src_addr,
8080 size_int (cur_size));
8081 gimplify_and_add (copy, pre_p);
8082 }
8083 prev_size += cur_size;
8084 }
8085 }
8086
8087 if (needed_intregs)
8088 {
8089 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8090 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8091 gimplify_assign (gpr, t, pre_p);
8092 }
8093
8094 if (needed_sseregs)
8095 {
8096 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8097 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8098 gimplify_assign (fpr, t, pre_p);
8099 }
8100
8101 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8102
8103 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8104 }
8105
8106 /* ... otherwise out of the overflow area. */
8107
8108 /* When the caller aligns a parameter on the stack, an alignment
8109 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8110 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8111 caller. */
8112 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8113 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8114 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8115
8116 /* Care for on-stack alignment if needed. */
8117 if (arg_boundary <= 64 || size == 0)
8118 t = ovf;
8119 else
8120 {
8121 HOST_WIDE_INT align = arg_boundary / 8;
8122 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8123 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8124 build_int_cst (TREE_TYPE (t), -align));
8125 }
8126
8127 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8128 gimplify_assign (addr, t, pre_p);
8129
8130 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8131 gimplify_assign (unshare_expr (ovf), t, pre_p);
8132
8133 if (container)
8134 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8135
8136 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8137 addr = fold_convert (ptrtype, addr);
8138
8139 if (indirect_p)
8140 addr = build_va_arg_indirect_ref (addr);
8141 return build_va_arg_indirect_ref (addr);
8142 }
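
/* A hedged C-level illustration of what the gimplification above expands:
   each integer argument taken from the save area advances gp_offset by 8
   and each SSE argument advances fp_offset by 16; once the offsets pass
   the limits checked above, arguments are fetched from overflow_arg_area.
   The function below is hypothetical, only demonstrates the source-level
   interface, and is kept under #if 0 so it has no effect on the build.  */
#if 0
#include <stdarg.h>

static double
sum_ints_and_doubles (int count, ...)
{
  va_list ap;
  double sum = 0;
  int i;

  va_start (ap, count);
  for (i = 0; i < count; i++)
    sum += va_arg (ap, int) + va_arg (ap, double);
  va_end (ap);
  return sum;
}
#endif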
8143 \f
8144 /* Return true if OPNUM's MEM should be matched
8145 in movabs* patterns. */
8146
8147 bool
8148 ix86_check_movabs (rtx insn, int opnum)
8149 {
8150 rtx set, mem;
8151
8152 set = PATTERN (insn);
8153 if (GET_CODE (set) == PARALLEL)
8154 set = XVECEXP (set, 0, 0);
8155 gcc_assert (GET_CODE (set) == SET);
8156 mem = XEXP (set, opnum);
8157 while (GET_CODE (mem) == SUBREG)
8158 mem = SUBREG_REG (mem);
8159 gcc_assert (MEM_P (mem));
8160 return volatile_ok || !MEM_VOLATILE_P (mem);
8161 }
8162 \f
8163 /* Initialize the table of extra 80387 mathematical constants. */
8164
8165 static void
8166 init_ext_80387_constants (void)
8167 {
8168 static const char * cst[5] =
8169 {
8170 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8171 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8172 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8173 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8174 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8175 };
8176 int i;
8177
8178 for (i = 0; i < 5; i++)
8179 {
8180 real_from_string (&ext_80387_constants_table[i], cst[i]);
8181 /* Ensure each constant is rounded to XFmode precision. */
8182 real_convert (&ext_80387_constants_table[i],
8183 XFmode, &ext_80387_constants_table[i]);
8184 }
8185
8186 ext_80387_constants_init = 1;
8187 }
8188
8189 /* Return non-zero if the constant is something that
8190 can be loaded with a special instruction. */
8191
8192 int
8193 standard_80387_constant_p (rtx x)
8194 {
8195 enum machine_mode mode = GET_MODE (x);
8196
8197 REAL_VALUE_TYPE r;
8198
8199 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8200 return -1;
8201
8202 if (x == CONST0_RTX (mode))
8203 return 1;
8204 if (x == CONST1_RTX (mode))
8205 return 2;
8206
8207 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8208
8209 /* For XFmode constants, try to find a special 80387 instruction when
8210 optimizing for size or on those CPUs that benefit from them. */
8211 if (mode == XFmode
8212 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8213 {
8214 int i;
8215
8216 if (! ext_80387_constants_init)
8217 init_ext_80387_constants ();
8218
8219 for (i = 0; i < 5; i++)
8220 if (real_identical (&r, &ext_80387_constants_table[i]))
8221 return i + 3;
8222 }
8223
8224 /* Load of the constant -0.0 or -1.0 will be split as
8225 fldz;fchs or fld1;fchs sequence. */
8226 if (real_isnegzero (&r))
8227 return 8;
8228 if (real_identical (&r, &dconstm1))
8229 return 9;
8230
8231 return 0;
8232 }
8233
8234 /* Return the opcode of the special instruction to be used to load
8235 the constant X. */
8236
8237 const char *
8238 standard_80387_constant_opcode (rtx x)
8239 {
8240 switch (standard_80387_constant_p (x))
8241 {
8242 case 1:
8243 return "fldz";
8244 case 2:
8245 return "fld1";
8246 case 3:
8247 return "fldlg2";
8248 case 4:
8249 return "fldln2";
8250 case 5:
8251 return "fldl2e";
8252 case 6:
8253 return "fldl2t";
8254 case 7:
8255 return "fldpi";
8256 case 8:
8257 case 9:
8258 return "#";
8259 default:
8260 gcc_unreachable ();
8261 }
8262 }
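
/* Sketch of how the two routines above are typically used together from
   an output template (illustrative only; the actual users are the move
   patterns in the machine description):

     if (standard_80387_constant_p (operands[1]) > 0)
       return standard_80387_constant_opcode (operands[1]);

   i.e. 1/2 map to fldz/fld1, 3..7 to the fldlg2..fldpi table above, and
   8/9 are split later into fldz;fchs or fld1;fchs sequences.  */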
8263
8264 /* Return the CONST_DOUBLE representing the 80387 constant that is
8265 loaded by the specified special instruction. The argument IDX
8266 matches the return value from standard_80387_constant_p. */
8267
8268 rtx
8269 standard_80387_constant_rtx (int idx)
8270 {
8271 int i;
8272
8273 if (! ext_80387_constants_init)
8274 init_ext_80387_constants ();
8275
8276 switch (idx)
8277 {
8278 case 3:
8279 case 4:
8280 case 5:
8281 case 6:
8282 case 7:
8283 i = idx - 3;
8284 break;
8285
8286 default:
8287 gcc_unreachable ();
8288 }
8289
8290 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8291 XFmode);
8292 }
8293
8294 /* Return 1 if X is all 0s and 2 if X is all 1s
8295 in a supported SSE/AVX vector mode. */
8296
8297 int
8298 standard_sse_constant_p (rtx x)
8299 {
8300 enum machine_mode mode = GET_MODE (x);
8301
8302 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8303 return 1;
8304 if (vector_all_ones_operand (x, mode))
8305 switch (mode)
8306 {
8307 case V16QImode:
8308 case V8HImode:
8309 case V4SImode:
8310 case V2DImode:
8311 if (TARGET_SSE2)
8312 return 2;
8313 case V32QImode:
8314 case V16HImode:
8315 case V8SImode:
8316 case V4DImode:
8317 if (TARGET_AVX2)
8318 return 2;
8319 default:
8320 break;
8321 }
8322
8323 return 0;
8324 }
8325
8326 /* Return the opcode of the special instruction to be used to load
8327 the constant X. */
8328
8329 const char *
8330 standard_sse_constant_opcode (rtx insn, rtx x)
8331 {
8332 switch (standard_sse_constant_p (x))
8333 {
8334 case 1:
8335 switch (get_attr_mode (insn))
8336 {
8337 case MODE_TI:
8338 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8339 return "%vpxor\t%0, %d0";
8340 case MODE_V2DF:
8341 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8342 return "%vxorpd\t%0, %d0";
8343 case MODE_V4SF:
8344 return "%vxorps\t%0, %d0";
8345
8346 case MODE_OI:
8347 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8348 return "vpxor\t%x0, %x0, %x0";
8349 case MODE_V4DF:
8350 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8351 return "vxorpd\t%x0, %x0, %x0";
8352 case MODE_V8SF:
8353 return "vxorps\t%x0, %x0, %x0";
8354
8355 default:
8356 break;
8357 }
8358
8359 case 2:
8360 if (TARGET_AVX)
8361 return "vpcmpeqd\t%0, %0, %0";
8362 else
8363 return "pcmpeqd\t%0, %0";
8364
8365 default:
8366 break;
8367 }
8368 gcc_unreachable ();
8369 }
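
/* In short: all-zero vector constants are loaded with a self-XOR
   (pxor/xorps/xorpd, or their VEX forms for 256-bit modes) and all-ones
   constants with a self-compare (pcmpeqd / vpcmpeqd), as selected above.  */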
8370
8371 /* Returns true if OP contains a symbol reference. */
8372
8373 bool
8374 symbolic_reference_mentioned_p (rtx op)
8375 {
8376 const char *fmt;
8377 int i;
8378
8379 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8380 return true;
8381
8382 fmt = GET_RTX_FORMAT (GET_CODE (op));
8383 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8384 {
8385 if (fmt[i] == 'E')
8386 {
8387 int j;
8388
8389 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8390 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8391 return true;
8392 }
8393
8394 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8395 return true;
8396 }
8397
8398 return false;
8399 }
8400
8401 /* Return true if it is appropriate to emit `ret' instructions in the
8402 body of a function. Do this only if the epilogue is simple, needing a
8403 couple of insns. Prior to reloading, we can't tell how many registers
8404 must be saved, so return false then. Return false if there is no frame
8405 marker to de-allocate. */
8406
8407 bool
8408 ix86_can_use_return_insn_p (void)
8409 {
8410 struct ix86_frame frame;
8411
8412 if (! reload_completed || frame_pointer_needed)
8413 return false;
8414
8415 /* Don't allow more than 32k pop, since that's all we can do
8416 with one instruction. */
8417 if (crtl->args.pops_args && crtl->args.size >= 32768)
8418 return false;
8419
8420 ix86_compute_frame_layout (&frame);
8421 return (frame.stack_pointer_offset == UNITS_PER_WORD
8422 && (frame.nregs + frame.nsseregs) == 0);
8423 }
8424 \f
8425 /* Value should be nonzero if functions must have frame pointers.
8426 Zero means the frame pointer need not be set up (and parms may
8427 be accessed via the stack pointer) in functions that seem suitable. */
8428
8429 static bool
8430 ix86_frame_pointer_required (void)
8431 {
8432 /* If we accessed previous frames, then the generated code expects
8433 to be able to access the saved ebp value in our frame. */
8434 if (cfun->machine->accesses_prev_frame)
8435 return true;
8436
8437 /* Several x86 OSes need a frame pointer for other reasons,
8438 usually pertaining to setjmp. */
8439 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8440 return true;
8441
8442 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8443 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8444 return true;
8445
8446 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8447 turns off the frame pointer by default. Turn it back on now if
8448 we've not got a leaf function. */
8449 if (TARGET_OMIT_LEAF_FRAME_POINTER
8450 && (!current_function_is_leaf
8451 || ix86_current_function_calls_tls_descriptor))
8452 return true;
8453
8454 if (crtl->profile && !flag_fentry)
8455 return true;
8456
8457 return false;
8458 }
8459
8460 /* Record that the current function accesses previous call frames. */
8461
8462 void
8463 ix86_setup_frame_addresses (void)
8464 {
8465 cfun->machine->accesses_prev_frame = 1;
8466 }
8467 \f
8468 #ifndef USE_HIDDEN_LINKONCE
8469 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8470 # define USE_HIDDEN_LINKONCE 1
8471 # else
8472 # define USE_HIDDEN_LINKONCE 0
8473 # endif
8474 #endif
8475
8476 static int pic_labels_used;
8477
8478 /* Fills in the label name that should be used for a pc thunk for
8479 the given register. */
8480
8481 static void
8482 get_pc_thunk_name (char name[32], unsigned int regno)
8483 {
8484 gcc_assert (!TARGET_64BIT);
8485
8486 if (USE_HIDDEN_LINKONCE)
8487 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8488 else
8489 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8490 }
8491
8492
8493 /* This function generates the -fpic PC thunks; each thunk loads its
8494 register with the return address of the caller and then returns. */
8495
8496 static void
8497 ix86_code_end (void)
8498 {
8499 rtx xops[2];
8500 int regno;
8501
8502 for (regno = AX_REG; regno <= SP_REG; regno++)
8503 {
8504 char name[32];
8505 tree decl;
8506
8507 if (!(pic_labels_used & (1 << regno)))
8508 continue;
8509
8510 get_pc_thunk_name (name, regno);
8511
8512 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8513 get_identifier (name),
8514 build_function_type_list (void_type_node, NULL_TREE));
8515 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8516 NULL_TREE, void_type_node);
8517 TREE_PUBLIC (decl) = 1;
8518 TREE_STATIC (decl) = 1;
8519
8520 #if TARGET_MACHO
8521 if (TARGET_MACHO)
8522 {
8523 switch_to_section (darwin_sections[text_coal_section]);
8524 fputs ("\t.weak_definition\t", asm_out_file);
8525 assemble_name (asm_out_file, name);
8526 fputs ("\n\t.private_extern\t", asm_out_file);
8527 assemble_name (asm_out_file, name);
8528 putc ('\n', asm_out_file);
8529 ASM_OUTPUT_LABEL (asm_out_file, name);
8530 DECL_WEAK (decl) = 1;
8531 }
8532 else
8533 #endif
8534 if (USE_HIDDEN_LINKONCE)
8535 {
8536 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8537
8538 targetm.asm_out.unique_section (decl, 0);
8539 switch_to_section (get_named_section (decl, NULL, 0));
8540
8541 targetm.asm_out.globalize_label (asm_out_file, name);
8542 fputs ("\t.hidden\t", asm_out_file);
8543 assemble_name (asm_out_file, name);
8544 putc ('\n', asm_out_file);
8545 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8546 }
8547 else
8548 {
8549 switch_to_section (text_section);
8550 ASM_OUTPUT_LABEL (asm_out_file, name);
8551 }
8552
8553 DECL_INITIAL (decl) = make_node (BLOCK);
8554 current_function_decl = decl;
8555 init_function_start (decl);
8556 first_function_block_is_cold = false;
8557 /* Make sure unwind info is emitted for the thunk if needed. */
8558 final_start_function (emit_barrier (), asm_out_file, 1);
8559
8560 /* Pad stack IP move with 4 instructions (two NOPs count
8561 as one instruction). */
8562 if (TARGET_PAD_SHORT_FUNCTION)
8563 {
8564 int i = 8;
8565
8566 while (i--)
8567 fputs ("\tnop\n", asm_out_file);
8568 }
8569
8570 xops[0] = gen_rtx_REG (Pmode, regno);
8571 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8572 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8573 fputs ("\tret\n", asm_out_file);
8574 final_end_function ();
8575 init_insn_lengths ();
8576 free_after_compilation (cfun);
8577 set_cfun (NULL);
8578 current_function_decl = NULL;
8579 }
8580
8581 if (flag_split_stack)
8582 file_end_indicate_split_stack ();
8583 }
8584
8585 /* Emit code for the SET_GOT patterns. */
8586
8587 const char *
8588 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8589 {
8590 rtx xops[3];
8591
8592 xops[0] = dest;
8593
8594 if (TARGET_VXWORKS_RTP && flag_pic)
8595 {
8596 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8597 xops[2] = gen_rtx_MEM (Pmode,
8598 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8599 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8600
8601 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8602 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8603 an unadorned address. */
8604 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8605 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8606 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8607 return "";
8608 }
8609
8610 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8611
8612 if (!flag_pic)
8613 {
8614 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8615
8616 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8617
8618 #if TARGET_MACHO
8619 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8620 is what will be referenced by the Mach-O PIC subsystem. */
8621 if (!label)
8622 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8623 #endif
8624
8625 targetm.asm_out.internal_label (asm_out_file, "L",
8626 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8627 }
8628 else
8629 {
8630 char name[32];
8631 get_pc_thunk_name (name, REGNO (dest));
8632 pic_labels_used |= 1 << REGNO (dest);
8633
8634 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8635 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8636 output_asm_insn ("call\t%X2", xops);
8637 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8638 is what will be referenced by the Mach-O PIC subsystem. */
8639 #if TARGET_MACHO
8640 if (!label)
8641 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8642 else
8643 targetm.asm_out.internal_label (asm_out_file, "L",
8644 CODE_LABEL_NUMBER (label));
8645 #endif
8646 }
8647
8648 if (!TARGET_MACHO)
8649 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8650
8651 return "";
8652 }
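
/* For a typical ELF -fpic function using %ebx, the two routines above
   cooperate to emit (sketch of the generated assembly):

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   with the thunk itself produced by ix86_code_end:

   __x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
*/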
8653
8654 /* Generate a "push" pattern for input ARG. */
8655
8656 static rtx
8657 gen_push (rtx arg)
8658 {
8659 struct machine_function *m = cfun->machine;
8660
8661 if (m->fs.cfa_reg == stack_pointer_rtx)
8662 m->fs.cfa_offset += UNITS_PER_WORD;
8663 m->fs.sp_offset += UNITS_PER_WORD;
8664
8665 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8666 arg = gen_rtx_REG (word_mode, REGNO (arg));
8667
8668 return gen_rtx_SET (VOIDmode,
8669 gen_rtx_MEM (word_mode,
8670 gen_rtx_PRE_DEC (Pmode,
8671 stack_pointer_rtx)),
8672 arg);
8673 }
8674
8675 /* Generate a "pop" pattern for input ARG. */
8676
8677 static rtx
8678 gen_pop (rtx arg)
8679 {
8680 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8681 arg = gen_rtx_REG (word_mode, REGNO (arg));
8682
8683 return gen_rtx_SET (VOIDmode,
8684 arg,
8685 gen_rtx_MEM (word_mode,
8686 gen_rtx_POST_INC (Pmode,
8687 stack_pointer_rtx)));
8688 }
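
/* For reference, on a 32-bit target gen_push and gen_pop build RTL of the
   shape (word_mode == Pmode == SImode here):

     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI arg))	;; push
     (set (reg:SI arg) (mem:SI (post_inc:SI (reg:SI sp))))	;; pop

   while gen_push also tracks sp_offset/cfa_offset in cfun->machine.  */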
8689
8690 /* Return >= 0 if there is an unused call-clobbered register available
8691 for the entire function. */
8692
8693 static unsigned int
8694 ix86_select_alt_pic_regnum (void)
8695 {
8696 if (current_function_is_leaf
8697 && !crtl->profile
8698 && !ix86_current_function_calls_tls_descriptor)
8699 {
8700 int i, drap;
8701 /* Can't use the same register for both PIC and DRAP. */
8702 if (crtl->drap_reg)
8703 drap = REGNO (crtl->drap_reg);
8704 else
8705 drap = -1;
8706 for (i = 2; i >= 0; --i)
8707 if (i != drap && !df_regs_ever_live_p (i))
8708 return i;
8709 }
8710
8711 return INVALID_REGNUM;
8712 }
8713
8714 /* Return TRUE if we need to save REGNO. */
8715
8716 static bool
8717 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8718 {
8719 if (pic_offset_table_rtx
8720 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8721 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8722 || crtl->profile
8723 || crtl->calls_eh_return
8724 || crtl->uses_const_pool))
8725 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8726
8727 if (crtl->calls_eh_return && maybe_eh_return)
8728 {
8729 unsigned i;
8730 for (i = 0; ; i++)
8731 {
8732 unsigned test = EH_RETURN_DATA_REGNO (i);
8733 if (test == INVALID_REGNUM)
8734 break;
8735 if (test == regno)
8736 return true;
8737 }
8738 }
8739
8740 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8741 return true;
8742
8743 return (df_regs_ever_live_p (regno)
8744 && !call_used_regs[regno]
8745 && !fixed_regs[regno]
8746 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8747 }
8748
8749 /* Return the number of saved general purpose registers. */
8750
8751 static int
8752 ix86_nsaved_regs (void)
8753 {
8754 int nregs = 0;
8755 int regno;
8756
8757 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8758 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8759 nregs ++;
8760 return nregs;
8761 }
8762
8763 /* Return the number of saved SSE registers. */
8764
8765 static int
8766 ix86_nsaved_sseregs (void)
8767 {
8768 int nregs = 0;
8769 int regno;
8770
8771 if (!TARGET_64BIT_MS_ABI)
8772 return 0;
8773 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8774 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8775 nregs ++;
8776 return nregs;
8777 }
8778
8779 /* Given FROM and TO register numbers, say whether this elimination is
8780 allowed. If stack alignment is needed, we can only replace argument
8781 pointer with hard frame pointer, or replace frame pointer with stack
8782 pointer. Otherwise, frame pointer elimination is automatically
8783 handled and all other eliminations are valid. */
8784
8785 static bool
8786 ix86_can_eliminate (const int from, const int to)
8787 {
8788 if (stack_realign_fp)
8789 return ((from == ARG_POINTER_REGNUM
8790 && to == HARD_FRAME_POINTER_REGNUM)
8791 || (from == FRAME_POINTER_REGNUM
8792 && to == STACK_POINTER_REGNUM));
8793 else
8794 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8795 }
8796
8797 /* Return the offset between two registers, one to be eliminated, and the other
8798 its replacement, at the start of a routine. */
8799
8800 HOST_WIDE_INT
8801 ix86_initial_elimination_offset (int from, int to)
8802 {
8803 struct ix86_frame frame;
8804 ix86_compute_frame_layout (&frame);
8805
8806 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8807 return frame.hard_frame_pointer_offset;
8808 else if (from == FRAME_POINTER_REGNUM
8809 && to == HARD_FRAME_POINTER_REGNUM)
8810 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8811 else
8812 {
8813 gcc_assert (to == STACK_POINTER_REGNUM);
8814
8815 if (from == ARG_POINTER_REGNUM)
8816 return frame.stack_pointer_offset;
8817
8818 gcc_assert (from == FRAME_POINTER_REGNUM);
8819 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8820 }
8821 }
8822
8823 /* In a dynamically-aligned function, we can't know the offset from
8824 stack pointer to frame pointer, so we must ensure that setjmp
8825 eliminates fp against the hard fp (%ebp) rather than trying to
8826 index from %esp up to the top of the frame across a gap that is
8827 of unknown (at compile-time) size. */
8828 static rtx
8829 ix86_builtin_setjmp_frame_value (void)
8830 {
8831 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8832 }
8833
8834 /* When using -fsplit-stack, the allocation routines set a field in
8835 the TCB to the bottom of the stack plus this much space, measured
8836 in bytes. */
8837
8838 #define SPLIT_STACK_AVAILABLE 256
8839
8840 /* Fill the ix86_frame structure FRAME for the function currently being compiled. */
8841
8842 static void
8843 ix86_compute_frame_layout (struct ix86_frame *frame)
8844 {
8845 unsigned int stack_alignment_needed;
8846 HOST_WIDE_INT offset;
8847 unsigned int preferred_alignment;
8848 HOST_WIDE_INT size = get_frame_size ();
8849 HOST_WIDE_INT to_allocate;
8850
8851 frame->nregs = ix86_nsaved_regs ();
8852 frame->nsseregs = ix86_nsaved_sseregs ();
8853
8854 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8855 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8856
8857 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8858 for function prologues and leaf functions. */
8859 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8860 && (!current_function_is_leaf || cfun->calls_alloca != 0
8861 || ix86_current_function_calls_tls_descriptor))
8862 {
8863 preferred_alignment = 16;
8864 stack_alignment_needed = 16;
8865 crtl->preferred_stack_boundary = 128;
8866 crtl->stack_alignment_needed = 128;
8867 }
8868
8869 gcc_assert (!size || stack_alignment_needed);
8870 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8871 gcc_assert (preferred_alignment <= stack_alignment_needed);
8872
8873 /* For SEH we have to limit the amount of code movement into the prologue.
8874 At present we do this via a BLOCKAGE, at which point there's very little
8875 scheduling that can be done, which means that there's very little point
8876 in doing anything except PUSHs. */
8877 if (TARGET_SEH)
8878 cfun->machine->use_fast_prologue_epilogue = false;
8879
8880 /* During reload iterations the number of registers saved can change.
8881 Recompute the value as needed. Do not recompute when the number of
8882 registers didn't change, as reload calls this function multiple times
8883 and does not expect the decision to change within a single iteration. */
8884 else if (!optimize_function_for_size_p (cfun)
8885 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8886 {
8887 int count = frame->nregs;
8888 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8889
8890 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8891
8892 /* The fast prologue uses move instead of push to save registers. This
8893 is significantly longer, but also executes faster as modern hardware
8894 can execute the moves in parallel, but can't do that for push/pop.
8895 
8896 Be careful about choosing which prologue to emit: when the function takes
8897 many instructions to execute we may as well use the slow version, as we
8898 also do when the function is known to be outside a hot spot (this is known
8899 only with profile feedback). Weight the size of the function by the number
8900 of registers to save, as it is cheap to use one or two push instructions
8901 but very slow to use many of them. */
8902 if (count)
8903 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8904 if (node->frequency < NODE_FREQUENCY_NORMAL
8905 || (flag_branch_probabilities
8906 && node->frequency < NODE_FREQUENCY_HOT))
8907 cfun->machine->use_fast_prologue_epilogue = false;
8908 else
8909 cfun->machine->use_fast_prologue_epilogue
8910 = !expensive_function_p (count);
8911 }
8912
8913 frame->save_regs_using_mov
8914 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8915 /* If static stack checking is enabled and done with probes,
8916 the registers need to be saved before allocating the frame. */
8917 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8918
8919 /* Skip return address. */
8920 offset = UNITS_PER_WORD;
8921
8922 /* Skip pushed static chain. */
8923 if (ix86_static_chain_on_stack)
8924 offset += UNITS_PER_WORD;
8925
8926 /* Skip saved base pointer. */
8927 if (frame_pointer_needed)
8928 offset += UNITS_PER_WORD;
8929 frame->hfp_save_offset = offset;
8930
8931 /* The traditional frame pointer location is at the top of the frame. */
8932 frame->hard_frame_pointer_offset = offset;
8933
8934 /* Register save area */
8935 offset += frame->nregs * UNITS_PER_WORD;
8936 frame->reg_save_offset = offset;
8937
8938 /* Align and set SSE register save area. */
8939 if (frame->nsseregs)
8940 {
8941 /* The only ABI that has saved SSE registers (Win64) also has a
8942 16-byte aligned default stack, and thus we don't need to be
8943 within the re-aligned local stack frame to save them. */
8944 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8945 offset = (offset + 16 - 1) & -16;
8946 offset += frame->nsseregs * 16;
8947 }
8948 frame->sse_reg_save_offset = offset;
8949
8950 /* The re-aligned stack starts here. Values before this point are not
8951 directly comparable with values below this point. In order to make
8952 sure that no value happens to be the same before and after, force
8953 the alignment computation below to add a non-zero value. */
8954 if (stack_realign_fp)
8955 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8956
8957 /* Va-arg area */
8958 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8959 offset += frame->va_arg_size;
8960
8961 /* Align start of frame for local function. */
8962 if (stack_realign_fp
8963 || offset != frame->sse_reg_save_offset
8964 || size != 0
8965 || !current_function_is_leaf
8966 || cfun->calls_alloca
8967 || ix86_current_function_calls_tls_descriptor)
8968 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8969
8970 /* Frame pointer points here. */
8971 frame->frame_pointer_offset = offset;
8972
8973 offset += size;
8974
8975 /* Add outgoing arguments area. Can be skipped if we eliminated
8976 all the function calls as dead code.
8977 Skipping is however impossible when the function calls alloca. The alloca
8978 expander assumes that the last crtl->outgoing_args_size bytes
8979 of the stack frame are unused. */
8980 if (ACCUMULATE_OUTGOING_ARGS
8981 && (!current_function_is_leaf || cfun->calls_alloca
8982 || ix86_current_function_calls_tls_descriptor))
8983 {
8984 offset += crtl->outgoing_args_size;
8985 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8986 }
8987 else
8988 frame->outgoing_arguments_size = 0;
8989
8990 /* Align stack boundary. Only needed if we're calling another function
8991 or using alloca. */
8992 if (!current_function_is_leaf || cfun->calls_alloca
8993 || ix86_current_function_calls_tls_descriptor)
8994 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8995
8996 /* We've reached end of stack frame. */
8997 frame->stack_pointer_offset = offset;
8998
8999 /* Size prologue needs to allocate. */
9000 to_allocate = offset - frame->sse_reg_save_offset;
9001
9002 if ((!to_allocate && frame->nregs <= 1)
9003 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9004 frame->save_regs_using_mov = false;
9005
9006 if (ix86_using_red_zone ()
9007 && current_function_sp_is_unchanging
9008 && current_function_is_leaf
9009 && !ix86_current_function_calls_tls_descriptor)
9010 {
9011 frame->red_zone_size = to_allocate;
9012 if (frame->save_regs_using_mov)
9013 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9014 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9015 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9016 }
9017 else
9018 frame->red_zone_size = 0;
9019 frame->stack_pointer_offset -= frame->red_zone_size;
9020
9021 /* The SEH frame pointer location is near the bottom of the frame.
9022 This is enforced by the fact that the difference between the
9023 stack pointer and the frame pointer is limited to 240 bytes in
9024 the unwind data structure. */
9025 if (TARGET_SEH)
9026 {
9027 HOST_WIDE_INT diff;
9028
9029 /* If we can leave the frame pointer where it is, do so. */
9030 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9031 if (diff > 240 || (diff & 15) != 0)
9032 {
9033 /* Ideally we'd determine what portion of the local stack frame
9034 (within the constraint of the lowest 240) is most heavily used.
9035 But without that complication, simply bias the frame pointer
9036 by 128 bytes so as to maximize the amount of the local stack
9037 frame that is addressable with 8-bit offsets. */
9038 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9039 }
9040 }
9041 }
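
/* A rough sketch of the layout just computed, from higher toward lower
   addresses; the recorded offsets are distances below the CFA (the
   incoming stack pointer plus one word):

     return address
     pushed static chain, if any
     saved frame pointer, if any       <- hard_frame_pointer_offset
     integer register save area        <- reg_save_offset
     SSE register save area            <- sse_reg_save_offset (16-byte aligned)
     va_arg register save area         <- frame_pointer_offset
     local variables
     outgoing argument area            <- stack_pointer_offset

   This is only a sketch: alignment padding, the red zone, DRAP and the
   SEH adjustment handled above can all change the exact picture.  */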
9042
9043 /* This is semi-inlined memory_address_length, but simplified
9044 since we know that we're always dealing with reg+offset, and
9045 to avoid having to create and discard all that rtl. */
9046
9047 static inline int
9048 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9049 {
9050 int len = 4;
9051
9052 if (offset == 0)
9053 {
9054 /* EBP and R13 cannot be encoded without an offset. */
9055 len = (regno == BP_REG || regno == R13_REG);
9056 }
9057 else if (IN_RANGE (offset, -128, 127))
9058 len = 1;
9059
9060 /* ESP and R12 must be encoded with a SIB byte. */
9061 if (regno == SP_REG || regno == R12_REG)
9062 len++;
9063
9064 return len;
9065 }
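
/* A few illustrative cases, using 32-bit register names for brevity and
   assuming the usual ModRM/SIB encoding rules; the numbers are the extra
   displacement/SIB bytes counted above:

     (%eax)        -> 0
     (%ebp)        -> 1   EBP/R13 always need a displacement byte
     -8(%ecx)      -> 1   disp8
     4096(%ecx)    -> 4   disp32
     -8(%esp)      -> 2   disp8 plus the mandatory SIB byte  */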
9066
9067 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9068 The valid base registers are taken from CFUN->MACHINE->FS. */
9069
9070 static rtx
9071 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9072 {
9073 const struct machine_function *m = cfun->machine;
9074 rtx base_reg = NULL;
9075 HOST_WIDE_INT base_offset = 0;
9076
9077 if (m->use_fast_prologue_epilogue)
9078 {
9079 /* Choose the base register most likely to allow the most scheduling
9080 opportunities. Generally FP is valid throughout the function,
9081 while DRAP must be reloaded within the epilogue. But choose either
9082 over the SP due to increased encoding size. */
9083
9084 if (m->fs.fp_valid)
9085 {
9086 base_reg = hard_frame_pointer_rtx;
9087 base_offset = m->fs.fp_offset - cfa_offset;
9088 }
9089 else if (m->fs.drap_valid)
9090 {
9091 base_reg = crtl->drap_reg;
9092 base_offset = 0 - cfa_offset;
9093 }
9094 else if (m->fs.sp_valid)
9095 {
9096 base_reg = stack_pointer_rtx;
9097 base_offset = m->fs.sp_offset - cfa_offset;
9098 }
9099 }
9100 else
9101 {
9102 HOST_WIDE_INT toffset;
9103 int len = 16, tlen;
9104
9105 /* Choose the base register with the smallest address encoding.
9106 With a tie, choose FP > DRAP > SP. */
9107 if (m->fs.sp_valid)
9108 {
9109 base_reg = stack_pointer_rtx;
9110 base_offset = m->fs.sp_offset - cfa_offset;
9111 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9112 }
9113 if (m->fs.drap_valid)
9114 {
9115 toffset = 0 - cfa_offset;
9116 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9117 if (tlen <= len)
9118 {
9119 base_reg = crtl->drap_reg;
9120 base_offset = toffset;
9121 len = tlen;
9122 }
9123 }
9124 if (m->fs.fp_valid)
9125 {
9126 toffset = m->fs.fp_offset - cfa_offset;
9127 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9128 if (tlen <= len)
9129 {
9130 base_reg = hard_frame_pointer_rtx;
9131 base_offset = toffset;
9132 len = tlen;
9133 }
9134 }
9135 }
9136 gcc_assert (base_reg != NULL);
9137
9138 return plus_constant (base_reg, base_offset);
9139 }
9140
9141 /* Emit code to save registers in the prologue. */
9142
9143 static void
9144 ix86_emit_save_regs (void)
9145 {
9146 unsigned int regno;
9147 rtx insn;
9148
9149 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9150 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9151 {
9152 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9153 RTX_FRAME_RELATED_P (insn) = 1;
9154 }
9155 }
9156
9157 /* Emit a single register save at CFA - CFA_OFFSET. */
9158
9159 static void
9160 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9161 HOST_WIDE_INT cfa_offset)
9162 {
9163 struct machine_function *m = cfun->machine;
9164 rtx reg = gen_rtx_REG (mode, regno);
9165 rtx mem, addr, base, insn;
9166
9167 addr = choose_baseaddr (cfa_offset);
9168 mem = gen_frame_mem (mode, addr);
9169
9170 /* For SSE saves, we need to indicate the 128-bit alignment. */
9171 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9172
9173 insn = emit_move_insn (mem, reg);
9174 RTX_FRAME_RELATED_P (insn) = 1;
9175
9176 base = addr;
9177 if (GET_CODE (base) == PLUS)
9178 base = XEXP (base, 0);
9179 gcc_checking_assert (REG_P (base));
9180
9181 /* When saving registers into a re-aligned local stack frame, avoid
9182 any tricky guessing by dwarf2out. */
9183 if (m->fs.realigned)
9184 {
9185 gcc_checking_assert (stack_realign_drap);
9186
9187 if (regno == REGNO (crtl->drap_reg))
9188 {
9189 /* A bit of a hack. We force the DRAP register to be saved in
9190 the re-aligned stack frame, which provides us with a copy
9191 of the CFA that will last past the prologue. Install it. */
9192 gcc_checking_assert (cfun->machine->fs.fp_valid);
9193 addr = plus_constant (hard_frame_pointer_rtx,
9194 cfun->machine->fs.fp_offset - cfa_offset);
9195 mem = gen_rtx_MEM (mode, addr);
9196 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9197 }
9198 else
9199 {
9200 /* The frame pointer is a stable reference within the
9201 aligned frame. Use it. */
9202 gcc_checking_assert (cfun->machine->fs.fp_valid);
9203 addr = plus_constant (hard_frame_pointer_rtx,
9204 cfun->machine->fs.fp_offset - cfa_offset);
9205 mem = gen_rtx_MEM (mode, addr);
9206 add_reg_note (insn, REG_CFA_EXPRESSION,
9207 gen_rtx_SET (VOIDmode, mem, reg));
9208 }
9209 }
9210
9211 /* The memory may not be relative to the current CFA register,
9212 which means that we may need to generate a new pattern for
9213 use by the unwind info. */
9214 else if (base != m->fs.cfa_reg)
9215 {
9216 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9217 mem = gen_rtx_MEM (mode, addr);
9218 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9219 }
9220 }
9221
9222 /* Emit code to save registers using MOV insns.
9223 First register is stored at CFA - CFA_OFFSET. */
9224 static void
9225 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9226 {
9227 unsigned int regno;
9228
9229 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9230 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9231 {
9232 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9233 cfa_offset -= UNITS_PER_WORD;
9234 }
9235 }
9236
9237 /* Emit code to save SSE registers using MOV insns.
9238 First register is stored at CFA - CFA_OFFSET. */
9239 static void
9240 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9241 {
9242 unsigned int regno;
9243
9244 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9245 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9246 {
9247 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9248 cfa_offset -= 16;
9249 }
9250 }
9251
9252 static GTY(()) rtx queued_cfa_restores;
9253
9254 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9255 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9256 Don't add the note if the previously saved value will be left untouched
9257 within stack red-zone till return, as unwinders can find the same value
9258 in the register and on the stack. */
9259
9260 static void
9261 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9262 {
9263 if (!crtl->shrink_wrapped
9264 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9265 return;
9266
9267 if (insn)
9268 {
9269 add_reg_note (insn, REG_CFA_RESTORE, reg);
9270 RTX_FRAME_RELATED_P (insn) = 1;
9271 }
9272 else
9273 queued_cfa_restores
9274 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9275 }
9276
9277 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9278
9279 static void
9280 ix86_add_queued_cfa_restore_notes (rtx insn)
9281 {
9282 rtx last;
9283 if (!queued_cfa_restores)
9284 return;
9285 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9286 ;
9287 XEXP (last, 1) = REG_NOTES (insn);
9288 REG_NOTES (insn) = queued_cfa_restores;
9289 queued_cfa_restores = NULL_RTX;
9290 RTX_FRAME_RELATED_P (insn) = 1;
9291 }
9292
9293 /* Expand prologue or epilogue stack adjustment.
9294 The pattern exists to put a dependency on all ebp-based memory accesses.
9295 STYLE should be negative if instructions should be marked as frame related,
9296 zero if %r11 register is live and cannot be freely used and positive
9297 otherwise. */
9298
9299 static void
9300 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9301 int style, bool set_cfa)
9302 {
9303 struct machine_function *m = cfun->machine;
9304 rtx insn;
9305 bool add_frame_related_expr = false;
9306
9307 if (Pmode == SImode)
9308 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9309 else if (x86_64_immediate_operand (offset, DImode))
9310 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9311 else
9312 {
9313 rtx tmp;
9314 /* r11 is used by indirect sibcall return as well, set before the
9315 epilogue and used after the epilogue. */
9316 if (style)
9317 tmp = gen_rtx_REG (DImode, R11_REG);
9318 else
9319 {
9320 gcc_assert (src != hard_frame_pointer_rtx
9321 && dest != hard_frame_pointer_rtx);
9322 tmp = hard_frame_pointer_rtx;
9323 }
9324 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9325 if (style < 0)
9326 add_frame_related_expr = true;
9327
9328 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9329 }
9330
9331 insn = emit_insn (insn);
9332 if (style >= 0)
9333 ix86_add_queued_cfa_restore_notes (insn);
9334
9335 if (set_cfa)
9336 {
9337 rtx r;
9338
9339 gcc_assert (m->fs.cfa_reg == src);
9340 m->fs.cfa_offset += INTVAL (offset);
9341 m->fs.cfa_reg = dest;
9342
9343 r = gen_rtx_PLUS (Pmode, src, offset);
9344 r = gen_rtx_SET (VOIDmode, dest, r);
9345 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9346 RTX_FRAME_RELATED_P (insn) = 1;
9347 }
9348 else if (style < 0)
9349 {
9350 RTX_FRAME_RELATED_P (insn) = 1;
9351 if (add_frame_related_expr)
9352 {
9353 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9354 r = gen_rtx_SET (VOIDmode, dest, r);
9355 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9356 }
9357 }
9358
9359 if (dest == stack_pointer_rtx)
9360 {
9361 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9362 bool valid = m->fs.sp_valid;
9363
9364 if (src == hard_frame_pointer_rtx)
9365 {
9366 valid = m->fs.fp_valid;
9367 ooffset = m->fs.fp_offset;
9368 }
9369 else if (src == crtl->drap_reg)
9370 {
9371 valid = m->fs.drap_valid;
9372 ooffset = 0;
9373 }
9374 else
9375 {
9376 /* Otherwise there are two possibilities: SP itself, which we set
9377 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9378 taken care of by hand along the eh_return path. */
9379 gcc_checking_assert (src == stack_pointer_rtx
9380 || offset == const0_rtx);
9381 }
9382
9383 m->fs.sp_offset = ooffset - INTVAL (offset);
9384 m->fs.sp_valid = valid;
9385 }
9386 }
9387
9388 /* Find an available register to be used as the dynamic realign argument
9389 pointer register. Such a register will be written in the prologue and
9390 used at the beginning of the body, so it must not be
9391 1. parameter passing register.
9392 2. GOT pointer.
9393 We reuse static-chain register if it is available. Otherwise, we
9394 use DI for i386 and R13 for x86-64. We chose R13 since it has
9395 shorter encoding.
9396
9397 Return: the regno of chosen register. */
9398
9399 static unsigned int
9400 find_drap_reg (void)
9401 {
9402 tree decl = cfun->decl;
9403
9404 if (TARGET_64BIT)
9405 {
9406 /* Use R13 for a nested function or a function that needs a static
9407 chain. Since a function with a tail call may use any caller-saved
9408 register in the epilogue, DRAP must not use a caller-saved
9409 register in that case. */
9410 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9411 return R13_REG;
9412
9413 return R10_REG;
9414 }
9415 else
9416 {
9417 /* Use DI for a nested function or a function that needs a static
9418 chain. Since a function with a tail call may use any caller-saved
9419 register in the epilogue, DRAP must not use a caller-saved
9420 register in that case. */
9421 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9422 return DI_REG;
9423
9424 /* Reuse static chain register if it isn't used for parameter
9425 passing. */
9426 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9427 {
9428 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9429 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9430 return CX_REG;
9431 }
9432 return DI_REG;
9433 }
9434 }
9435
9436 /* Return minimum incoming stack alignment. */
9437
9438 static unsigned int
9439 ix86_minimum_incoming_stack_boundary (bool sibcall)
9440 {
9441 unsigned int incoming_stack_boundary;
9442
9443 /* Prefer the one specified at command line. */
9444 if (ix86_user_incoming_stack_boundary)
9445 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9446 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9447 if -mstackrealign is used, this isn't a sibcall check, and the
9448 estimated stack alignment is 128 bits. */
9449 else if (!sibcall
9450 && !TARGET_64BIT
9451 && ix86_force_align_arg_pointer
9452 && crtl->stack_alignment_estimated == 128)
9453 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9454 else
9455 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9456
9457 /* Incoming stack alignment can be changed on individual functions
9458 via force_align_arg_pointer attribute. We use the smallest
9459 incoming stack boundary. */
9460 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9461 && lookup_attribute (ix86_force_align_arg_pointer_string,
9462 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9463 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9464
9465 /* The incoming stack frame has to be aligned at least at
9466 parm_stack_boundary. */
9467 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9468 incoming_stack_boundary = crtl->parm_stack_boundary;
9469
9470 /* The stack at the entry of main is aligned by the runtime. We use the
9471 smallest incoming stack boundary. */
9472 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9473 && DECL_NAME (current_function_decl)
9474 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9475 && DECL_FILE_SCOPE_P (current_function_decl))
9476 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9477
9478 return incoming_stack_boundary;
9479 }
9480
9481 /* Update incoming stack boundary and estimated stack alignment. */
9482
9483 static void
9484 ix86_update_stack_boundary (void)
9485 {
9486 ix86_incoming_stack_boundary
9487 = ix86_minimum_incoming_stack_boundary (false);
9488
9489 /* x86_64 varargs need 16-byte stack alignment for the register save
9490 area. */
9491 if (TARGET_64BIT
9492 && cfun->stdarg
9493 && crtl->stack_alignment_estimated < 128)
9494 crtl->stack_alignment_estimated = 128;
9495 }
9496
9497 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9498 needed or an rtx for DRAP otherwise. */
9499
9500 static rtx
9501 ix86_get_drap_rtx (void)
9502 {
9503 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9504 crtl->need_drap = true;
9505
9506 if (stack_realign_drap)
9507 {
9508 /* Assign DRAP to vDRAP and return vDRAP. */
9509 unsigned int regno = find_drap_reg ();
9510 rtx drap_vreg;
9511 rtx arg_ptr;
9512 rtx seq, insn;
9513
9514 arg_ptr = gen_rtx_REG (Pmode, regno);
9515 crtl->drap_reg = arg_ptr;
9516
9517 start_sequence ();
9518 drap_vreg = copy_to_reg (arg_ptr);
9519 seq = get_insns ();
9520 end_sequence ();
9521
9522 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9523 if (!optimize)
9524 {
9525 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9526 RTX_FRAME_RELATED_P (insn) = 1;
9527 }
9528 return drap_vreg;
9529 }
9530 else
9531 return NULL;
9532 }
9533
9534 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9535
9536 static rtx
9537 ix86_internal_arg_pointer (void)
9538 {
9539 return virtual_incoming_args_rtx;
9540 }
9541
9542 struct scratch_reg {
9543 rtx reg;
9544 bool saved;
9545 };
9546
9547 /* Return a short-lived scratch register for use on function entry.
9548 In 32-bit mode, it is valid only after the registers are saved
9549 in the prologue. This register must be released by means of
9550 release_scratch_register_on_entry once it is dead. */
9551
9552 static void
9553 get_scratch_register_on_entry (struct scratch_reg *sr)
9554 {
9555 int regno;
9556
9557 sr->saved = false;
9558
9559 if (TARGET_64BIT)
9560 {
9561 /* We always use R11 in 64-bit mode. */
9562 regno = R11_REG;
9563 }
9564 else
9565 {
9566 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9567 bool fastcall_p
9568 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9569 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9570 int regparm = ix86_function_regparm (fntype, decl);
9571 int drap_regno
9572 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9573
9574 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9575 for the static chain register. */
9576 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9577 && drap_regno != AX_REG)
9578 regno = AX_REG;
9579 else if (regparm < 2 && drap_regno != DX_REG)
9580 regno = DX_REG;
9581 /* ecx is the static chain register. */
9582 else if (regparm < 3 && !fastcall_p && !static_chain_p
9583 && drap_regno != CX_REG)
9584 regno = CX_REG;
9585 else if (ix86_save_reg (BX_REG, true))
9586 regno = BX_REG;
9587 /* esi is the static chain register. */
9588 else if (!(regparm == 3 && static_chain_p)
9589 && ix86_save_reg (SI_REG, true))
9590 regno = SI_REG;
9591 else if (ix86_save_reg (DI_REG, true))
9592 regno = DI_REG;
9593 else
9594 {
9595 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9596 sr->saved = true;
9597 }
9598 }
9599
9600 sr->reg = gen_rtx_REG (Pmode, regno);
9601 if (sr->saved)
9602 {
9603 rtx insn = emit_insn (gen_push (sr->reg));
9604 RTX_FRAME_RELATED_P (insn) = 1;
9605 }
9606 }
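
/* Roughly, the 32-bit preference order implemented above is
   eax -> edx -> ecx (skipping any of those that hold incoming arguments,
   the static chain or the DRAP), then ebx, esi or edi when the prologue
   saves them anyway, and finally a fallback that pushes and pops a
   scratch register around its use when everything else is taken.  */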
9607
9608 /* Release a scratch register obtained from the preceding function. */
9609
9610 static void
9611 release_scratch_register_on_entry (struct scratch_reg *sr)
9612 {
9613 if (sr->saved)
9614 {
9615 rtx x, insn = emit_insn (gen_pop (sr->reg));
9616
9617 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9618 RTX_FRAME_RELATED_P (insn) = 1;
9619 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9620 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9621 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9622 }
9623 }
9624
9625 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
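
/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   i.e. one probe per page; the exact value is a target parameter, so the
   figure is only illustrative.  */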
9626
9627 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9628
9629 static void
9630 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9631 {
9632 /* We skip the probe for the first interval + a small dope of 4 words and
9633 probe that many bytes past the specified size to maintain a protection
9634 area at the bottom of the stack. */
9635 const int dope = 4 * UNITS_PER_WORD;
9636 rtx size_rtx = GEN_INT (size), last;
9637
9638 /* See if we have a constant small number of probes to generate. If so,
9639 that's the easy case. The run-time loop is made up of 11 insns in the
9640 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9641 for n # of intervals. */
9642 if (size <= 5 * PROBE_INTERVAL)
9643 {
9644 HOST_WIDE_INT i, adjust;
9645 bool first_probe = true;
9646
9647 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9648 values of N from 1 until it exceeds SIZE. If only one probe is
9649 needed, this will not generate any code. Then adjust and probe
9650 to PROBE_INTERVAL + SIZE. */
9651 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9652 {
9653 if (first_probe)
9654 {
9655 adjust = 2 * PROBE_INTERVAL + dope;
9656 first_probe = false;
9657 }
9658 else
9659 adjust = PROBE_INTERVAL;
9660
9661 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9662 plus_constant (stack_pointer_rtx, -adjust)));
9663 emit_stack_probe (stack_pointer_rtx);
9664 }
9665
9666 if (first_probe)
9667 adjust = size + PROBE_INTERVAL + dope;
9668 else
9669 adjust = size + PROBE_INTERVAL - i;
9670
9671 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9672 plus_constant (stack_pointer_rtx, -adjust)));
9673 emit_stack_probe (stack_pointer_rtx);
9674
9675 /* Adjust back to account for the additional first interval. */
9676 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9677 plus_constant (stack_pointer_rtx,
9678 PROBE_INTERVAL + dope)));
9679 }
9680
9681 /* Otherwise, do the same as above, but in a loop. Note that we must be
9682 extra careful with variables wrapping around because we might be at
9683 the very top (or the very bottom) of the address space and we have
9684 to be able to handle this case properly; in particular, we use an
9685 equality test for the loop condition. */
9686 else
9687 {
9688 HOST_WIDE_INT rounded_size;
9689 struct scratch_reg sr;
9690
9691 get_scratch_register_on_entry (&sr);
9692
9693
9694 /* Step 1: round SIZE to the previous multiple of the interval. */
9695
9696 rounded_size = size & -PROBE_INTERVAL;
9697
9698
9699 /* Step 2: compute initial and final value of the loop counter. */
9700
9701 /* SP = SP_0 + PROBE_INTERVAL. */
9702 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9703 plus_constant (stack_pointer_rtx,
9704 - (PROBE_INTERVAL + dope))));
9705
9706 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9707 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9708 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9709 gen_rtx_PLUS (Pmode, sr.reg,
9710 stack_pointer_rtx)));
9711
9712
9713 /* Step 3: the loop
9714
9715 while (SP != LAST_ADDR)
9716 {
9717 SP = SP + PROBE_INTERVAL
9718 probe at SP
9719 }
9720
9721 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9722 values of N from 1 until it is equal to ROUNDED_SIZE. */
9723
9724 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9725
9726
9727 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9728 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9729
9730 if (size != rounded_size)
9731 {
9732 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9733 plus_constant (stack_pointer_rtx,
9734 rounded_size - size)));
9735 emit_stack_probe (stack_pointer_rtx);
9736 }
9737
9738 /* Adjust back to account for the additional first interval. */
9739 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9740 plus_constant (stack_pointer_rtx,
9741 PROBE_INTERVAL + dope)));
9742
9743 release_scratch_register_on_entry (&sr);
9744 }
9745
9746 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9747
9748 /* Even if the stack pointer isn't the CFA register, we need to correctly
9749 describe the adjustments made to it, in particular differentiate the
9750 frame-related ones from the frame-unrelated ones. */
9751 if (size > 0)
9752 {
9753 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9754 XVECEXP (expr, 0, 0)
9755 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9756 plus_constant (stack_pointer_rtx, -size));
9757 XVECEXP (expr, 0, 1)
9758 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9759 plus_constant (stack_pointer_rtx,
9760 PROBE_INTERVAL + dope + size));
9761 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9762 RTX_FRAME_RELATED_P (last) = 1;
9763
9764 cfun->machine->fs.sp_offset += size;
9765 }
9766
9767 /* Make sure nothing is scheduled before we are done. */
9768 emit_insn (gen_blockage ());
9769 }
9770
9771 /* Adjust the stack pointer up to REG while probing it. */
9772
9773 const char *
9774 output_adjust_stack_and_probe (rtx reg)
9775 {
9776 static int labelno = 0;
9777 char loop_lab[32], end_lab[32];
9778 rtx xops[2];
9779
9780 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9781 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9782
9783 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9784
9785 /* Jump to END_LAB if SP == LAST_ADDR. */
9786 xops[0] = stack_pointer_rtx;
9787 xops[1] = reg;
9788 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9789 fputs ("\tje\t", asm_out_file);
9790 assemble_name_raw (asm_out_file, end_lab);
9791 fputc ('\n', asm_out_file);
9792
9793 /* SP = SP + PROBE_INTERVAL. */
9794 xops[1] = GEN_INT (PROBE_INTERVAL);
9795 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9796
9797 /* Probe at SP. */
9798 xops[1] = const0_rtx;
9799 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9800
9801 fprintf (asm_out_file, "\tjmp\t");
9802 assemble_name_raw (asm_out_file, loop_lab);
9803 fputc ('\n', asm_out_file);
9804
9805 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9806
9807 return "";
9808 }
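
/* For reference, with a 4096-byte probe interval and %eax holding LAST_ADDR,
   the loop above comes out roughly as

   .LPSRL0:
	cmpl	%eax, %esp
	je	.LPSRE0
	subl	$4096, %esp
	orl	$0, (%esp)
	jmp	.LPSRL0
   .LPSRE0:

   The register, the label spelling and the interval are illustrative only.  */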
9809
9810 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9811 inclusive. These are offsets from the current stack pointer. */
9812
9813 static void
9814 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9815 {
9816 /* See if we have a constant small number of probes to generate. If so,
9817 that's the easy case. The run-time loop is made up of 7 insns in the
9818 generic case while the compile-time loop is made up of n insns for n #
9819 of intervals. */
9820 if (size <= 7 * PROBE_INTERVAL)
9821 {
9822 HOST_WIDE_INT i;
9823
9824 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9825 it exceeds SIZE. If only one probe is needed, this will not
9826 generate any code. Then probe at FIRST + SIZE. */
9827 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9828 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9829
9830 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9831 }
9832
9833 /* Otherwise, do the same as above, but in a loop. Note that we must be
9834 extra careful with variables wrapping around because we might be at
9835 the very top (or the very bottom) of the address space and we have
9836 to be able to handle this case properly; in particular, we use an
9837 equality test for the loop condition. */
9838 else
9839 {
9840 HOST_WIDE_INT rounded_size, last;
9841 struct scratch_reg sr;
9842
9843 get_scratch_register_on_entry (&sr);
9844
9845
9846 /* Step 1: round SIZE to the previous multiple of the interval. */
9847
9848 rounded_size = size & -PROBE_INTERVAL;
9849
9850
9851 /* Step 2: compute initial and final value of the loop counter. */
9852
9853 /* TEST_OFFSET = FIRST. */
9854 emit_move_insn (sr.reg, GEN_INT (-first));
9855
9856 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9857 last = first + rounded_size;
9858
9859
9860 /* Step 3: the loop
9861
9862 while (TEST_ADDR != LAST_ADDR)
9863 {
9864 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9865 probe at TEST_ADDR
9866 }
9867
9868 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9869 until it is equal to ROUNDED_SIZE. */
9870
9871 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9872
9873
9874 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9875 that SIZE is equal to ROUNDED_SIZE. */
9876
9877 if (size != rounded_size)
9878 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9879 stack_pointer_rtx,
9880 sr.reg),
9881 rounded_size - size));
9882
9883 release_scratch_register_on_entry (&sr);
9884 }
9885
9886 /* Make sure nothing is scheduled before we are done. */
9887 emit_insn (gen_blockage ());
9888 }
9889
9890 /* Probe a range of stack addresses from REG to END, inclusive. These are
9891 offsets from the current stack pointer. */
9892
9893 const char *
9894 output_probe_stack_range (rtx reg, rtx end)
9895 {
9896 static int labelno = 0;
9897 char loop_lab[32], end_lab[32];
9898 rtx xops[3];
9899
9900 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9901 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9902
9903 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9904
9905 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9906 xops[0] = reg;
9907 xops[1] = end;
9908 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9909 fputs ("\tje\t", asm_out_file);
9910 assemble_name_raw (asm_out_file, end_lab);
9911 fputc ('\n', asm_out_file);
9912
9913 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9914 xops[1] = GEN_INT (PROBE_INTERVAL);
9915 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9916
9917 /* Probe at TEST_ADDR. */
9918 xops[0] = stack_pointer_rtx;
9919 xops[1] = reg;
9920 xops[2] = const0_rtx;
9921 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9922
9923 fprintf (asm_out_file, "\tjmp\t");
9924 assemble_name_raw (asm_out_file, loop_lab);
9925 fputc ('\n', asm_out_file);
9926
9927 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9928
9929 return "";
9930 }
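
/* The emitted loop is analogous to the one sketched after
   output_adjust_stack_and_probe, except that the test-offset register is
   stepped instead of the stack pointer and the probe is an

	orl	$0, (%esp,%eax)

   style store relative to the unchanged stack pointer (again with
   illustrative register names).  */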
9931
9932 /* Finalize the stack_realign_needed flag, which guides generation of the
9933 prologue/epilogue in the correct form. */
9934 static void
9935 ix86_finalize_stack_realign_flags (void)
9936 {
9937 /* Check if stack realignment is really needed after reload, and
9938 store the result in cfun. */
9939 unsigned int incoming_stack_boundary
9940 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9941 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9942 unsigned int stack_realign = (incoming_stack_boundary
9943 < (current_function_is_leaf
9944 ? crtl->max_used_stack_slot_alignment
9945 : crtl->stack_alignment_needed));
9946
9947 if (crtl->stack_realign_finalized)
9948 {
9949 /* After stack_realign_needed is finalized, we can no longer
9950 change it. */
9951 gcc_assert (crtl->stack_realign_needed == stack_realign);
9952 return;
9953 }
9954
9955 /* If the only reason for frame_pointer_needed is that we conservatively
9956 assumed stack realignment might be needed, but in the end nothing that
9957 needed the stack alignment had been spilled, clear frame_pointer_needed
9958 and say we don't need stack realignment. */
9959 if (stack_realign
9960 && !crtl->need_drap
9961 && frame_pointer_needed
9962 && current_function_is_leaf
9963 && flag_omit_frame_pointer
9964 && current_function_sp_is_unchanging
9965 && !ix86_current_function_calls_tls_descriptor
9966 && !crtl->accesses_prior_frames
9967 && !cfun->calls_alloca
9968 && !crtl->calls_eh_return
9969 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9970 && !ix86_frame_pointer_required ()
9971 && get_frame_size () == 0
9972 && ix86_nsaved_sseregs () == 0
9973 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9974 {
9975 HARD_REG_SET set_up_by_prologue, prologue_used;
9976 basic_block bb;
9977
9978 CLEAR_HARD_REG_SET (prologue_used);
9979 CLEAR_HARD_REG_SET (set_up_by_prologue);
9980 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9981 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9982 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9983 HARD_FRAME_POINTER_REGNUM);
9984 FOR_EACH_BB (bb)
9985 {
9986 rtx insn;
9987 FOR_BB_INSNS (bb, insn)
9988 if (NONDEBUG_INSN_P (insn)
9989 && requires_stack_frame_p (insn, prologue_used,
9990 set_up_by_prologue))
9991 {
9992 crtl->stack_realign_needed = stack_realign;
9993 crtl->stack_realign_finalized = true;
9994 return;
9995 }
9996 }
9997
9998 frame_pointer_needed = false;
9999 stack_realign = false;
10000 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10001 crtl->stack_alignment_needed = incoming_stack_boundary;
10002 crtl->stack_alignment_estimated = incoming_stack_boundary;
10003 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10004 crtl->preferred_stack_boundary = incoming_stack_boundary;
10005 df_finish_pass (true);
10006 df_scan_alloc (NULL);
10007 df_scan_blocks ();
10008 df_compute_regs_ever_live (true);
10009 df_analyze ();
10010 }
10011
10012 crtl->stack_realign_needed = stack_realign;
10013 crtl->stack_realign_finalized = true;
10014 }
10015
10016 /* Expand the prologue into a bunch of separate insns. */
10017
10018 void
10019 ix86_expand_prologue (void)
10020 {
10021 struct machine_function *m = cfun->machine;
10022 rtx insn, t;
10023 bool pic_reg_used;
10024 struct ix86_frame frame;
10025 HOST_WIDE_INT allocate;
10026 bool int_registers_saved;
10027
10028 ix86_finalize_stack_realign_flags ();
10029
10030 /* DRAP should not coexist with stack_realign_fp */
10031 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10032
10033 memset (&m->fs, 0, sizeof (m->fs));
10034
10035 /* Initialize CFA state for before the prologue. */
10036 m->fs.cfa_reg = stack_pointer_rtx;
10037 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10038
10039 /* Track SP offset to the CFA. We continue tracking this after we've
10040 swapped the CFA register away from SP. In the case of re-alignment
10041 this is fudged; we're interested in offsets within the local frame. */
10042 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10043 m->fs.sp_valid = true;
10044
10045 ix86_compute_frame_layout (&frame);
10046
10047 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10048 {
10049 /* We should have already generated an error for any use of
10050 ms_hook on a nested function. */
10051 gcc_checking_assert (!ix86_static_chain_on_stack);
10052
10053 /* Check if profiling is active and we shall use the profiling-before-
10054 prologue variant. If so, sorry. */
10055 if (crtl->profile && flag_fentry != 0)
10056 sorry ("ms_hook_prologue attribute isn%'t compatible "
10057 "with -mfentry for 32-bit");
10058
10059 /* In ix86_asm_output_function_label we emitted:
10060 8b ff movl.s %edi,%edi
10061 55 push %ebp
10062 8b ec movl.s %esp,%ebp
10063
10064 This matches the hookable function prologue in Win32 API
10065 functions in Microsoft Windows XP Service Pack 2 and newer.
10066 Wine uses this to enable Windows apps to hook the Win32 API
10067 functions provided by Wine.
10068
10069 What that means is that we've already set up the frame pointer. */
10070
10071 if (frame_pointer_needed
10072 && !(crtl->drap_reg && crtl->stack_realign_needed))
10073 {
10074 rtx push, mov;
10075
10076 /* We've decided to use the frame pointer already set up.
10077 Describe this to the unwinder by pretending that both
10078 push and mov insns happen right here.
10079
10080 Putting the unwind info here at the end of the ms_hook
10081 is done so that we can make absolutely certain we get
10082 the required byte sequence at the start of the function,
10083 rather than relying on an assembler that can produce
10084 the exact encoding required.
10085
10086 However it does mean (in the unpatched case) that we have
10087 a 1 insn window where the asynchronous unwind info is
10088 incorrect. However, if we placed the unwind info at
10089 its correct location we would have incorrect unwind info
10090 in the patched case. Which is probably all moot since
10091 I don't expect Wine generates dwarf2 unwind info for the
10092 system libraries that use this feature. */
10093
10094 insn = emit_insn (gen_blockage ());
10095
10096 push = gen_push (hard_frame_pointer_rtx);
10097 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10098 stack_pointer_rtx);
10099 RTX_FRAME_RELATED_P (push) = 1;
10100 RTX_FRAME_RELATED_P (mov) = 1;
10101
10102 RTX_FRAME_RELATED_P (insn) = 1;
10103 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10104 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10105
10106 /* Note that gen_push incremented m->fs.cfa_offset, even
10107 though we didn't emit the push insn here. */
10108 m->fs.cfa_reg = hard_frame_pointer_rtx;
10109 m->fs.fp_offset = m->fs.cfa_offset;
10110 m->fs.fp_valid = true;
10111 }
10112 else
10113 {
10114 /* The frame pointer is not needed so pop %ebp again.
10115 This leaves us with a pristine state. */
10116 emit_insn (gen_pop (hard_frame_pointer_rtx));
10117 }
10118 }
10119
10120 /* The first insn of a function that accepts its static chain on the
10121 stack is to push the register that would be filled in by a direct
10122 call. This insn will be skipped by the trampoline. */
10123 else if (ix86_static_chain_on_stack)
10124 {
10125 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10126 emit_insn (gen_blockage ());
10127
10128 /* We don't want to interpret this push insn as a register save,
10129 only as a stack adjustment. The real copy of the register as
10130 a save will be done later, if needed. */
10131 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10132 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10133 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10134 RTX_FRAME_RELATED_P (insn) = 1;
10135 }
10136
10137 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10138 DRAP is needed and stack realignment is really needed after reload. */
10139 if (stack_realign_drap)
10140 {
10141 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10142
10143 /* Only need to push the parameter pointer reg if it is callee saved. */
10144 if (!call_used_regs[REGNO (crtl->drap_reg)])
10145 {
10146 /* Push arg pointer reg */
10147 insn = emit_insn (gen_push (crtl->drap_reg));
10148 RTX_FRAME_RELATED_P (insn) = 1;
10149 }
10150
10151 /* Grab the argument pointer. */
10152 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10153 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10154 RTX_FRAME_RELATED_P (insn) = 1;
10155 m->fs.cfa_reg = crtl->drap_reg;
10156 m->fs.cfa_offset = 0;
10157
10158 /* Align the stack. */
10159 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10160 stack_pointer_rtx,
10161 GEN_INT (-align_bytes)));
10162 RTX_FRAME_RELATED_P (insn) = 1;
10163
10164 /* Replicate the return address on the stack so that the return
10165 address can be reached via the (argp - 1) slot. This is needed
10166 to implement macro RETURN_ADDR_RTX and intrinsic function
10167 expand_builtin_return_addr etc. */
10168 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10169 t = gen_frame_mem (word_mode, t);
10170 insn = emit_insn (gen_push (t));
10171 RTX_FRAME_RELATED_P (insn) = 1;
10172
10173 /* For the purposes of frame and register save area addressing,
10174 we've started over with a new frame. */
10175 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10176 m->fs.realigned = true;
10177 }
10178
10179 if (frame_pointer_needed && !m->fs.fp_valid)
10180 {
10181 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10182 slower on all targets. Also sdb doesn't like it. */
10183 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10184 RTX_FRAME_RELATED_P (insn) = 1;
10185
10186 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10187 {
10188 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10189 RTX_FRAME_RELATED_P (insn) = 1;
10190
10191 if (m->fs.cfa_reg == stack_pointer_rtx)
10192 m->fs.cfa_reg = hard_frame_pointer_rtx;
10193 m->fs.fp_offset = m->fs.sp_offset;
10194 m->fs.fp_valid = true;
10195 }
10196 }
10197
10198 int_registers_saved = (frame.nregs == 0);
10199
10200 if (!int_registers_saved)
10201 {
10202 /* If saving registers via PUSH, do so now. */
10203 if (!frame.save_regs_using_mov)
10204 {
10205 ix86_emit_save_regs ();
10206 int_registers_saved = true;
10207 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10208 }
10209
10210 /* When using red zone we may start register saving before allocating
10211 the stack frame saving one cycle of the prologue. However, avoid
10212 doing this if we have to probe the stack; at least on x86_64 the
10213 stack probe can turn into a call that clobbers a red zone location. */
10214 else if (ix86_using_red_zone ()
10215 && (! TARGET_STACK_PROBE
10216 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10217 {
10218 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10219 int_registers_saved = true;
10220 }
10221 }
10222
10223 if (stack_realign_fp)
10224 {
10225 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10226 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10227
10228 /* The computation of the size of the re-aligned stack frame means
10229 that we must allocate the size of the register save area before
10230 performing the actual alignment. Otherwise we cannot guarantee
10231 that there's enough storage above the realignment point. */
10232 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10233 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10234 GEN_INT (m->fs.sp_offset
10235 - frame.sse_reg_save_offset),
10236 -1, false);
10237
10238 /* Align the stack. */
10239 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10240 stack_pointer_rtx,
10241 GEN_INT (-align_bytes)));
10242
10243 /* For the purposes of register save area addressing, the stack
10244 pointer is no longer valid. As for the value of sp_offset,
10245 see ix86_compute_frame_layout, which we need to match in order
10246 to pass verification of stack_pointer_offset at the end. */
10247 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10248 m->fs.sp_valid = false;
10249 }
10250
10251 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10252
10253 if (flag_stack_usage_info)
10254 {
10255 /* We start to count from ARG_POINTER. */
10256 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10257
10258 /* If it was realigned, take into account the fake frame. */
10259 if (stack_realign_drap)
10260 {
10261 if (ix86_static_chain_on_stack)
10262 stack_size += UNITS_PER_WORD;
10263
10264 if (!call_used_regs[REGNO (crtl->drap_reg)])
10265 stack_size += UNITS_PER_WORD;
10266
10267 /* This over-estimates by 1 minimal-stack-alignment-unit but
10268 mitigates that by counting in the new return address slot. */
10269 current_function_dynamic_stack_size
10270 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10271 }
10272
10273 current_function_static_stack_size = stack_size;
10274 }
10275
10276 /* The stack has already been decremented by the instruction calling us
10277 so probe if the size is non-negative to preserve the protection area. */
10278 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10279 {
10280 /* We expect the registers to be saved when probes are used. */
10281 gcc_assert (int_registers_saved);
10282
10283 if (STACK_CHECK_MOVING_SP)
10284 {
10285 ix86_adjust_stack_and_probe (allocate);
10286 allocate = 0;
10287 }
10288 else
10289 {
10290 HOST_WIDE_INT size = allocate;
10291
10292 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10293 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10294
10295 if (TARGET_STACK_PROBE)
10296 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10297 else
10298 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10299 }
10300 }
10301
10302 if (allocate == 0)
10303 ;
10304 else if (!ix86_target_stack_probe ()
10305 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10306 {
10307 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10308 GEN_INT (-allocate), -1,
10309 m->fs.cfa_reg == stack_pointer_rtx);
10310 }
10311 else
10312 {
10313 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10314 rtx r10 = NULL;
10315 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10316
10317 bool eax_live = false;
10318 bool r10_live = false;
10319
10320 if (TARGET_64BIT)
10321 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10322 if (!TARGET_64BIT_MS_ABI)
10323 eax_live = ix86_eax_live_at_start_p ();
10324
10325 if (eax_live)
10326 {
10327 emit_insn (gen_push (eax));
10328 allocate -= UNITS_PER_WORD;
10329 }
10330 if (r10_live)
10331 {
10332 r10 = gen_rtx_REG (Pmode, R10_REG);
10333 emit_insn (gen_push (r10));
10334 allocate -= UNITS_PER_WORD;
10335 }
10336
10337 emit_move_insn (eax, GEN_INT (allocate));
10338 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10339
10340 /* Use the fact that AX still contains ALLOCATE. */
10341 adjust_stack_insn = (Pmode == DImode
10342 ? gen_pro_epilogue_adjust_stack_di_sub
10343 : gen_pro_epilogue_adjust_stack_si_sub);
10344
10345 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10346 stack_pointer_rtx, eax));
10347
10348 /* Note that SEH directives need to continue tracking the stack
10349 pointer even after the frame pointer has been set up. */
10350 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10351 {
10352 if (m->fs.cfa_reg == stack_pointer_rtx)
10353 m->fs.cfa_offset += allocate;
10354
10355 RTX_FRAME_RELATED_P (insn) = 1;
10356 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10357 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10358 plus_constant (stack_pointer_rtx,
10359 -allocate)));
10360 }
10361 m->fs.sp_offset += allocate;
10362
10363 if (r10_live && eax_live)
10364 {
10365 t = choose_baseaddr (m->fs.sp_offset - allocate);
10366 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10367 gen_frame_mem (word_mode, t));
10368 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10369 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10370 gen_frame_mem (word_mode, t));
10371 }
10372 else if (eax_live || r10_live)
10373 {
10374 t = choose_baseaddr (m->fs.sp_offset - allocate);
10375 emit_move_insn (gen_rtx_REG (word_mode,
10376 (eax_live ? AX_REG : R10_REG)),
10377 gen_frame_mem (word_mode, t));
10378 }
10379 }
10380 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10381
10382 /* If we haven't already set up the frame pointer, do so now. */
10383 if (frame_pointer_needed && !m->fs.fp_valid)
10384 {
10385 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10386 GEN_INT (frame.stack_pointer_offset
10387 - frame.hard_frame_pointer_offset));
10388 insn = emit_insn (insn);
10389 RTX_FRAME_RELATED_P (insn) = 1;
10390 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10391
10392 if (m->fs.cfa_reg == stack_pointer_rtx)
10393 m->fs.cfa_reg = hard_frame_pointer_rtx;
10394 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10395 m->fs.fp_valid = true;
10396 }
10397
10398 if (!int_registers_saved)
10399 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10400 if (frame.nsseregs)
10401 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10402
10403 pic_reg_used = false;
10404 if (pic_offset_table_rtx
10405 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10406 || crtl->profile))
10407 {
10408 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10409
10410 if (alt_pic_reg_used != INVALID_REGNUM)
10411 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10412
10413 pic_reg_used = true;
10414 }
10415
10416 if (pic_reg_used)
10417 {
10418 if (TARGET_64BIT)
10419 {
10420 if (ix86_cmodel == CM_LARGE_PIC)
10421 {
10422 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10423 rtx label = gen_label_rtx ();
10424 emit_label (label);
10425 LABEL_PRESERVE_P (label) = 1;
10426 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10427 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10428 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10429 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10430 pic_offset_table_rtx, tmp_reg));
10431 }
10432 else
10433 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10434 }
10435 else
10436 {
10437 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10438 RTX_FRAME_RELATED_P (insn) = 1;
10439 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10440 }
10441 }
10442
10443 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10444 when mcount needs it. A blockage to avoid call movement across the
10445 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10446 note. */
10447 if (crtl->profile && !flag_fentry && pic_reg_used)
10448 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10449
10450 if (crtl->drap_reg && !crtl->stack_realign_needed)
10451 {
10452 /* vDRAP is set up, but after reload it turns out that stack realignment
10453 isn't necessary. Here we emit the prologue to set up DRAP
10454 without the stack realignment adjustment. */
10455 t = choose_baseaddr (0);
10456 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10457 }
10458
10459 /* Prevent instructions from being scheduled into the register save push
10460 sequence when access to the redzone area is done through the frame pointer.
10461 The offset between the frame pointer and the stack pointer is calculated
10462 relative to the value of the stack pointer at the end of the function
10463 prologue, and moving instructions that access the redzone area via the
10464 frame pointer into the push sequence violates this assumption. */
10465 if (frame_pointer_needed && frame.red_zone_size)
10466 emit_insn (gen_memory_blockage ());
10467
10468 /* Emit cld instruction if stringops are used in the function. */
10469 if (TARGET_CLD && ix86_current_function_needs_cld)
10470 emit_insn (gen_cld ());
10471
10472 /* SEH requires that the prologue end within 256 bytes of the start of
10473 the function. Prevent instruction schedules that would extend that.
10474 Further, prevent alloca modifications to the stack pointer from being
10475 combined with prologue modifications. */
10476 if (TARGET_SEH)
10477 emit_insn (gen_prologue_use (stack_pointer_rtx));
10478 }
10479
10480 /* Emit code to restore REG using a POP insn. */
10481
10482 static void
10483 ix86_emit_restore_reg_using_pop (rtx reg)
10484 {
10485 struct machine_function *m = cfun->machine;
10486 rtx insn = emit_insn (gen_pop (reg));
10487
10488 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10489 m->fs.sp_offset -= UNITS_PER_WORD;
10490
10491 if (m->fs.cfa_reg == crtl->drap_reg
10492 && REGNO (reg) == REGNO (crtl->drap_reg))
10493 {
10494 /* Previously we'd represented the CFA as an expression
10495 like *(%ebp - 8). We've just popped that value from
10496 the stack, which means we need to reset the CFA to
10497 the drap register. This will remain until we restore
10498 the stack pointer. */
10499 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10500 RTX_FRAME_RELATED_P (insn) = 1;
10501
10502 /* This means that the DRAP register is valid for addressing too. */
10503 m->fs.drap_valid = true;
10504 return;
10505 }
10506
10507 if (m->fs.cfa_reg == stack_pointer_rtx)
10508 {
10509 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10510 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10511 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10512 RTX_FRAME_RELATED_P (insn) = 1;
10513
10514 m->fs.cfa_offset -= UNITS_PER_WORD;
10515 }
10516
10517 /* When the frame pointer is the CFA, and we pop it, we are
10518 swapping back to the stack pointer as the CFA. This happens
10519 for stack frames that don't allocate other data, so we assume
10520 the stack pointer is now pointing at the return address, i.e.
10521 the function entry state, which makes the offset be 1 word. */
10522 if (reg == hard_frame_pointer_rtx)
10523 {
10524 m->fs.fp_valid = false;
10525 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10526 {
10527 m->fs.cfa_reg = stack_pointer_rtx;
10528 m->fs.cfa_offset -= UNITS_PER_WORD;
10529
10530 add_reg_note (insn, REG_CFA_DEF_CFA,
10531 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10532 GEN_INT (m->fs.cfa_offset)));
10533 RTX_FRAME_RELATED_P (insn) = 1;
10534 }
10535 }
10536 }
10537
10538 /* Emit code to restore saved registers using POP insns. */
10539
10540 static void
10541 ix86_emit_restore_regs_using_pop (void)
10542 {
10543 unsigned int regno;
10544
10545 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10546 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10547 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10548 }
10549
10550 /* Emit code and notes for the LEAVE instruction. */
10551
10552 static void
10553 ix86_emit_leave (void)
10554 {
10555 struct machine_function *m = cfun->machine;
10556 rtx insn = emit_insn (ix86_gen_leave ());
10557
10558 ix86_add_queued_cfa_restore_notes (insn);
10559
10560 gcc_assert (m->fs.fp_valid);
10561 m->fs.sp_valid = true;
10562 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10563 m->fs.fp_valid = false;
10564
10565 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10566 {
10567 m->fs.cfa_reg = stack_pointer_rtx;
10568 m->fs.cfa_offset = m->fs.sp_offset;
10569
10570 add_reg_note (insn, REG_CFA_DEF_CFA,
10571 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10572 RTX_FRAME_RELATED_P (insn) = 1;
10573 }
10574 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10575 m->fs.fp_offset);
10576 }
10577
10578 /* Emit code to restore saved registers using MOV insns.
10579 First register is restored from CFA - CFA_OFFSET. */
10580 static void
10581 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10582 bool maybe_eh_return)
10583 {
10584 struct machine_function *m = cfun->machine;
10585 unsigned int regno;
10586
10587 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10588 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10589 {
10590 rtx reg = gen_rtx_REG (word_mode, regno);
10591 rtx insn, mem;
10592
10593 mem = choose_baseaddr (cfa_offset);
10594 mem = gen_frame_mem (word_mode, mem);
10595 insn = emit_move_insn (reg, mem);
10596
10597 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10598 {
10599 /* Previously we'd represented the CFA as an expression
10600 like *(%ebp - 8). We've just popped that value from
10601 the stack, which means we need to reset the CFA to
10602 the drap register. This will remain until we restore
10603 the stack pointer. */
10604 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10605 RTX_FRAME_RELATED_P (insn) = 1;
10606
10607 /* This means that the DRAP register is valid for addressing. */
10608 m->fs.drap_valid = true;
10609 }
10610 else
10611 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10612
10613 cfa_offset -= UNITS_PER_WORD;
10614 }
10615 }
10616
10617 /* Emit code to restore saved registers using MOV insns.
10618 First register is restored from CFA - CFA_OFFSET. */
10619 static void
10620 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10621 bool maybe_eh_return)
10622 {
10623 unsigned int regno;
10624
10625 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10626 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10627 {
10628 rtx reg = gen_rtx_REG (V4SFmode, regno);
10629 rtx mem;
10630
10631 mem = choose_baseaddr (cfa_offset);
10632 mem = gen_rtx_MEM (V4SFmode, mem);
10633 set_mem_align (mem, 128);
10634 emit_move_insn (reg, mem);
10635
10636 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10637
10638 cfa_offset -= 16;
10639 }
10640 }
10641
10642 /* Emit vzeroupper if needed. */
10643
10644 void
10645 ix86_maybe_emit_epilogue_vzeroupper (void)
10646 {
10647 if (TARGET_VZEROUPPER
10648 && !TREE_THIS_VOLATILE (cfun->decl)
10649 && !cfun->machine->caller_return_avx256_p)
10650 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10651 }
10652
10653 /* Restore function stack, frame, and registers. */
10654
10655 void
10656 ix86_expand_epilogue (int style)
10657 {
10658 struct machine_function *m = cfun->machine;
10659 struct machine_frame_state frame_state_save = m->fs;
10660 struct ix86_frame frame;
10661 bool restore_regs_via_mov;
10662 bool using_drap;
10663
10664 ix86_finalize_stack_realign_flags ();
10665 ix86_compute_frame_layout (&frame);
10666
10667 m->fs.sp_valid = (!frame_pointer_needed
10668 || (current_function_sp_is_unchanging
10669 && !stack_realign_fp));
10670 gcc_assert (!m->fs.sp_valid
10671 || m->fs.sp_offset == frame.stack_pointer_offset);
10672
10673 /* The FP must be valid if the frame pointer is present. */
10674 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10675 gcc_assert (!m->fs.fp_valid
10676 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10677
10678 /* We must have *some* valid pointer to the stack frame. */
10679 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10680
10681 /* The DRAP is never valid at this point. */
10682 gcc_assert (!m->fs.drap_valid);
10683
10684 /* See the comment about red zone and frame
10685 pointer usage in ix86_expand_prologue. */
10686 if (frame_pointer_needed && frame.red_zone_size)
10687 emit_insn (gen_memory_blockage ());
10688
10689 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10690 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10691
10692 /* Determine the CFA offset of the end of the red-zone. */
10693 m->fs.red_zone_offset = 0;
10694 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10695 {
10696 /* The red-zone begins below the return address. */
10697 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
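      /* Editor's note (illustrative, not part of the original sources):
	 RED_ZONE_SIZE is 128 in the 64-bit SysV ABI, so with 8-byte words
	 this offset is typically 128 + 8 = 136 bytes. */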
10698
10699 /* When the register save area is in the aligned portion of
10700 the stack, determine the maximum runtime displacement that
10701 matches up with the aligned frame. */
10702 if (stack_realign_drap)
10703 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10704 + UNITS_PER_WORD);
10705 }
10706
10707 /* Special care must be taken for the normal return case of a function
10708 using eh_return: the eax and edx registers are marked as saved, but
10709 not restored along this path. Adjust the save location to match. */
10710 if (crtl->calls_eh_return && style != 2)
10711 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10712
10713 /* EH_RETURN requires the use of moves to function properly. */
10714 if (crtl->calls_eh_return)
10715 restore_regs_via_mov = true;
10716 /* SEH requires the use of pops to identify the epilogue. */
10717 else if (TARGET_SEH)
10718 restore_regs_via_mov = false;
10719 /* If we're only restoring one register and sp is not valid, then
10720 use a move instruction to restore the register, since it's
10721 less work than reloading sp and popping the register. */
10722 else if (!m->fs.sp_valid && frame.nregs <= 1)
10723 restore_regs_via_mov = true;
10724 else if (TARGET_EPILOGUE_USING_MOVE
10725 && cfun->machine->use_fast_prologue_epilogue
10726 && (frame.nregs > 1
10727 || m->fs.sp_offset != frame.reg_save_offset))
10728 restore_regs_via_mov = true;
10729 else if (frame_pointer_needed
10730 && !frame.nregs
10731 && m->fs.sp_offset != frame.reg_save_offset)
10732 restore_regs_via_mov = true;
10733 else if (frame_pointer_needed
10734 && TARGET_USE_LEAVE
10735 && cfun->machine->use_fast_prologue_epilogue
10736 && frame.nregs == 1)
10737 restore_regs_via_mov = true;
10738 else
10739 restore_regs_via_mov = false;
10740
10741 if (restore_regs_via_mov || frame.nsseregs)
10742 {
10743 /* Ensure that the entire register save area is addressable via
10744 the stack pointer, if we will restore via sp. */
10745 if (TARGET_64BIT
10746 && m->fs.sp_offset > 0x7fffffff
10747 && !(m->fs.fp_valid || m->fs.drap_valid)
10748 && (frame.nsseregs + frame.nregs) != 0)
10749 {
10750 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10751 GEN_INT (m->fs.sp_offset
10752 - frame.sse_reg_save_offset),
10753 style,
10754 m->fs.cfa_reg == stack_pointer_rtx);
10755 }
10756 }
10757
10758 /* If there are any SSE registers to restore, then we have to do it
10759 via moves, since there's obviously no pop for SSE regs. */
10760 if (frame.nsseregs)
10761 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10762 style == 2);
10763
10764 if (restore_regs_via_mov)
10765 {
10766 rtx t;
10767
10768 if (frame.nregs)
10769 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10770
10771 /* eh_return epilogues need %ecx added to the stack pointer. */
10772 if (style == 2)
10773 {
10774 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10775
10776 /* Stack align doesn't work with eh_return. */
10777 gcc_assert (!stack_realign_drap);
10778 /* Neither do regparm nested functions. */
10779 gcc_assert (!ix86_static_chain_on_stack);
10780
10781 if (frame_pointer_needed)
10782 {
10783 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10784 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10785 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10786
10787 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10788 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10789
10790 /* Note that we use SA as a temporary CFA, as the return
10791 address is at the proper place relative to it. We
10792 pretend this happens at the FP restore insn because
10793 prior to this insn the FP would be stored at the wrong
10794 offset relative to SA, and after this insn we have no
10795 other reasonable register to use for the CFA. We don't
10796 bother resetting the CFA to the SP for the duration of
10797 the return insn. */
10798 add_reg_note (insn, REG_CFA_DEF_CFA,
10799 plus_constant (sa, UNITS_PER_WORD));
10800 ix86_add_queued_cfa_restore_notes (insn);
10801 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10802 RTX_FRAME_RELATED_P (insn) = 1;
10803
10804 m->fs.cfa_reg = sa;
10805 m->fs.cfa_offset = UNITS_PER_WORD;
10806 m->fs.fp_valid = false;
10807
10808 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10809 const0_rtx, style, false);
10810 }
10811 else
10812 {
10813 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10814 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10815 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10816 ix86_add_queued_cfa_restore_notes (insn);
10817
10818 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10819 if (m->fs.cfa_offset != UNITS_PER_WORD)
10820 {
10821 m->fs.cfa_offset = UNITS_PER_WORD;
10822 add_reg_note (insn, REG_CFA_DEF_CFA,
10823 plus_constant (stack_pointer_rtx,
10824 UNITS_PER_WORD));
10825 RTX_FRAME_RELATED_P (insn) = 1;
10826 }
10827 }
10828 m->fs.sp_offset = UNITS_PER_WORD;
10829 m->fs.sp_valid = true;
10830 }
10831 }
10832 else
10833 {
10834 /* SEH requires that the function end with (1) a stack adjustment
10835 if necessary, (2) a sequence of pops, and (3) a return or
10836 jump instruction. Prevent insns from the function body from
10837 being scheduled into this sequence. */
10838 if (TARGET_SEH)
10839 {
10840 /* Prevent a catch region from being adjacent to the standard
10841 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10842 several other flags that would be interesting to test are
10843 set up yet. */
10844 if (flag_non_call_exceptions)
10845 emit_insn (gen_nops (const1_rtx));
10846 else
10847 emit_insn (gen_blockage ());
10848 }
10849
10850 /* First step is to deallocate the stack frame so that we can
10851 pop the registers. */
10852 if (!m->fs.sp_valid)
10853 {
10854 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10855 GEN_INT (m->fs.fp_offset
10856 - frame.reg_save_offset),
10857 style, false);
10858 }
10859 else if (m->fs.sp_offset != frame.reg_save_offset)
10860 {
10861 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10862 GEN_INT (m->fs.sp_offset
10863 - frame.reg_save_offset),
10864 style,
10865 m->fs.cfa_reg == stack_pointer_rtx);
10866 }
10867
10868 ix86_emit_restore_regs_using_pop ();
10869 }
10870
10871 /* If we used a frame pointer and haven't already got rid of it,
10872 then do so now. */
10873 if (m->fs.fp_valid)
10874 {
10875 /* If the stack pointer is valid and pointing at the frame
10876 pointer store address, then we only need a pop. */
10877 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10878 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10879 /* Leave results in shorter dependency chains on CPUs that are
10880 able to grok it fast. */
10881 else if (TARGET_USE_LEAVE
10882 || optimize_function_for_size_p (cfun)
10883 || !cfun->machine->use_fast_prologue_epilogue)
10884 ix86_emit_leave ();
10885 else
10886 {
10887 pro_epilogue_adjust_stack (stack_pointer_rtx,
10888 hard_frame_pointer_rtx,
10889 const0_rtx, style, !using_drap);
10890 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10891 }
10892 }
10893
10894 if (using_drap)
10895 {
10896 int param_ptr_offset = UNITS_PER_WORD;
10897 rtx insn;
10898
10899 gcc_assert (stack_realign_drap);
10900
10901 if (ix86_static_chain_on_stack)
10902 param_ptr_offset += UNITS_PER_WORD;
10903 if (!call_used_regs[REGNO (crtl->drap_reg)])
10904 param_ptr_offset += UNITS_PER_WORD;
10905
10906 insn = emit_insn (gen_rtx_SET
10907 (VOIDmode, stack_pointer_rtx,
10908 gen_rtx_PLUS (Pmode,
10909 crtl->drap_reg,
10910 GEN_INT (-param_ptr_offset))));
10911 m->fs.cfa_reg = stack_pointer_rtx;
10912 m->fs.cfa_offset = param_ptr_offset;
10913 m->fs.sp_offset = param_ptr_offset;
10914 m->fs.realigned = false;
10915
10916 add_reg_note (insn, REG_CFA_DEF_CFA,
10917 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10918 GEN_INT (param_ptr_offset)));
10919 RTX_FRAME_RELATED_P (insn) = 1;
10920
10921 if (!call_used_regs[REGNO (crtl->drap_reg)])
10922 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10923 }
10924
10925 /* At this point the stack pointer must be valid, and we must have
10926 restored all of the registers. We may not have deallocated the
10927 entire stack frame. We've delayed this until now because it may
10928 be possible to merge the local stack deallocation with the
10929 deallocation forced by ix86_static_chain_on_stack. */
10930 gcc_assert (m->fs.sp_valid);
10931 gcc_assert (!m->fs.fp_valid);
10932 gcc_assert (!m->fs.realigned);
10933 if (m->fs.sp_offset != UNITS_PER_WORD)
10934 {
10935 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10936 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10937 style, true);
10938 }
10939 else
10940 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10941
10942 /* Sibcall epilogues don't want a return instruction. */
10943 if (style == 0)
10944 {
10945 m->fs = frame_state_save;
10946 return;
10947 }
10948
10949 /* Emit vzeroupper if needed. */
10950 ix86_maybe_emit_epilogue_vzeroupper ();
10951
10952 if (crtl->args.pops_args && crtl->args.size)
10953 {
10954 rtx popc = GEN_INT (crtl->args.pops_args);
10955
10956 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10957 address, do an explicit add, and jump indirectly to the caller. */
10958
10959 if (crtl->args.pops_args >= 65536)
10960 {
10961 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10962 rtx insn;
10963
10964 /* There is no "pascal" calling convention in any 64bit ABI. */
10965 gcc_assert (!TARGET_64BIT);
10966
10967 insn = emit_insn (gen_pop (ecx));
10968 m->fs.cfa_offset -= UNITS_PER_WORD;
10969 m->fs.sp_offset -= UNITS_PER_WORD;
10970
10971 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10972 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10973 add_reg_note (insn, REG_CFA_REGISTER,
10974 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10975 RTX_FRAME_RELATED_P (insn) = 1;
10976
10977 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10978 popc, -1, true);
10979 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10980 }
10981 else
10982 emit_jump_insn (gen_simple_return_pop_internal (popc));
10983 }
10984 else
10985 emit_jump_insn (gen_simple_return_internal ());
10986
10987 /* Restore the state back to the state from the prologue,
10988 so that it's correct for the next epilogue. */
10989 m->fs = frame_state_save;
10990 }
10991
10992 /* Reset from the function's potential modifications. */
10993
10994 static void
10995 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10996 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10997 {
10998 if (pic_offset_table_rtx)
10999 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11000 #if TARGET_MACHO
11001 /* Mach-O doesn't support labels at the end of objects, so if
11002 it looks like we might want one, insert a NOP. */
11003 {
11004 rtx insn = get_last_insn ();
11005 rtx deleted_debug_label = NULL_RTX;
11006 while (insn
11007 && NOTE_P (insn)
11008 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11009 {
11010 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11011 notes; instead set their CODE_LABEL_NUMBER to -1,
11012 otherwise there would be code generation differences
11013 between -g and -g0. */
11014 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11015 deleted_debug_label = insn;
11016 insn = PREV_INSN (insn);
11017 }
11018 if (insn
11019 && (LABEL_P (insn)
11020 || (NOTE_P (insn)
11021 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11022 fputs ("\tnop\n", file);
11023 else if (deleted_debug_label)
11024 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11025 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11026 CODE_LABEL_NUMBER (insn) = -1;
11027 }
11028 #endif
11029
11030 }
11031
11032 /* Return a scratch register to use in the split stack prologue. The
11033 split stack prologue is used for -fsplit-stack. It is the first
11034 instructions in the function, even before the regular prologue.
11035 The scratch register can be any caller-saved register which is not
11036 used for parameters or for the static chain. */
11037
11038 static unsigned int
11039 split_stack_prologue_scratch_regno (void)
11040 {
11041 if (TARGET_64BIT)
11042 return R11_REG;
11043 else
11044 {
11045 bool is_fastcall;
11046 int regparm;
11047
11048 is_fastcall = (lookup_attribute ("fastcall",
11049 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11050 != NULL);
11051 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11052
11053 if (is_fastcall)
11054 {
11055 if (DECL_STATIC_CHAIN (cfun->decl))
11056 {
11057 sorry ("-fsplit-stack does not support fastcall with "
11058 "nested function");
11059 return INVALID_REGNUM;
11060 }
11061 return AX_REG;
11062 }
11063 else if (regparm < 3)
11064 {
11065 if (!DECL_STATIC_CHAIN (cfun->decl))
11066 return CX_REG;
11067 else
11068 {
11069 if (regparm >= 2)
11070 {
11071 sorry ("-fsplit-stack does not support 2 register "
11072 "parameters for a nested function");
11073 return INVALID_REGNUM;
11074 }
11075 return DX_REG;
11076 }
11077 }
11078 else
11079 {
11080 /* FIXME: We could make this work by pushing a register
11081 around the addition and comparison. */
11082 sorry ("-fsplit-stack does not support 3 register parameters");
11083 return INVALID_REGNUM;
11084 }
11085 }
11086 }
11087
11088 /* A SYMBOL_REF for the function which allocates new stack space for
11089 -fsplit-stack. */
11090
11091 static GTY(()) rtx split_stack_fn;
11092
11093 /* A SYMBOL_REF for the more stack function when using the large
11094 model. */
11095
11096 static GTY(()) rtx split_stack_fn_large;
11097
11098 /* Handle -fsplit-stack. These are the first instructions in the
11099 function, even before the regular prologue. */
11100
11101 void
11102 ix86_expand_split_stack_prologue (void)
11103 {
11104 struct ix86_frame frame;
11105 HOST_WIDE_INT allocate;
11106 unsigned HOST_WIDE_INT args_size;
11107 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11108 rtx scratch_reg = NULL_RTX;
11109 rtx varargs_label = NULL_RTX;
11110 rtx fn;
11111
11112 gcc_assert (flag_split_stack && reload_completed);
11113
11114 ix86_finalize_stack_realign_flags ();
11115 ix86_compute_frame_layout (&frame);
11116 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11117
11118 /* This is the label we will branch to if we have enough stack
11119 space. We expect the basic block reordering pass to reverse this
11120 branch if optimizing, so that we branch in the unlikely case. */
11121 label = gen_label_rtx ();
11122
11123 /* We need to compare the stack pointer minus the frame size with
11124 the stack boundary in the TCB. The stack boundary always gives
11125 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11126 can compare directly. Otherwise we need to do an addition. */
11127
11128 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11129 UNSPEC_STACK_CHECK);
11130 limit = gen_rtx_CONST (Pmode, limit);
11131 limit = gen_rtx_MEM (Pmode, limit);
11132 if (allocate < SPLIT_STACK_AVAILABLE)
11133 current = stack_pointer_rtx;
11134 else
11135 {
11136 unsigned int scratch_regno;
11137 rtx offset;
11138
11139 /* We need a scratch register to hold the stack pointer minus
11140 the required frame size. Since this is the very start of the
11141 function, the scratch register can be any caller-saved
11142 register which is not used for parameters. */
11143 offset = GEN_INT (- allocate);
11144 scratch_regno = split_stack_prologue_scratch_regno ();
11145 if (scratch_regno == INVALID_REGNUM)
11146 return;
11147 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11148 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11149 {
11150 /* We don't use ix86_gen_add3 in this case because it will
11151 want to split to lea, but when not optimizing the insn
11152 will not be split after this point. */
11153 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11154 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11155 offset)));
11156 }
11157 else
11158 {
11159 emit_move_insn (scratch_reg, offset);
11160 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11161 stack_pointer_rtx));
11162 }
11163 current = scratch_reg;
11164 }
11165
11166 ix86_expand_branch (GEU, current, limit, label);
11167 jump_insn = get_last_insn ();
11168 JUMP_LABEL (jump_insn) = label;
11169
11170 /* Mark the jump as very likely to be taken. */
11171 add_reg_note (jump_insn, REG_BR_PROB,
11172 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
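  /* Editor's note: REG_BR_PROB_BASE is 10000, so the value recorded
     above is 9900, i.e. a 99% probability of taking the branch. */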
11173
11174 if (split_stack_fn == NULL_RTX)
11175 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11176 fn = split_stack_fn;
11177
11178 /* Get more stack space. We pass in the desired stack space and the
11179 size of the arguments to copy to the new stack. In 32-bit mode
11180 we push the parameters; __morestack will return on a new stack
11181 anyhow. In 64-bit mode we pass the parameters in r10 and
11182 r11. */
11183 allocate_rtx = GEN_INT (allocate);
11184 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11185 call_fusage = NULL_RTX;
11186 if (TARGET_64BIT)
11187 {
11188 rtx reg10, reg11;
11189
11190 reg10 = gen_rtx_REG (Pmode, R10_REG);
11191 reg11 = gen_rtx_REG (Pmode, R11_REG);
11192
11193 /* If this function uses a static chain, it will be in %r10.
11194 Preserve it across the call to __morestack. */
11195 if (DECL_STATIC_CHAIN (cfun->decl))
11196 {
11197 rtx rax;
11198
11199 rax = gen_rtx_REG (word_mode, AX_REG);
11200 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11201 use_reg (&call_fusage, rax);
11202 }
11203
11204 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11205 {
11206 HOST_WIDE_INT argval;
11207
11208 /* When using the large model we need to load the address
11209 into a register, and we've run out of registers. So we
11210 switch to a different calling convention, and we call a
11211 different function: __morestack_large. We pass the
11212 argument size in the upper 32 bits of r10 and pass the
11213 frame size in the lower 32 bits. */
11214 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11215 gcc_assert ((args_size & 0xffffffff) == args_size);
11216
11217 if (split_stack_fn_large == NULL_RTX)
11218 split_stack_fn_large =
11219 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11220
11221 if (ix86_cmodel == CM_LARGE_PIC)
11222 {
11223 rtx label, x;
11224
11225 label = gen_label_rtx ();
11226 emit_label (label);
11227 LABEL_PRESERVE_P (label) = 1;
11228 emit_insn (gen_set_rip_rex64 (reg10, label));
11229 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11230 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11231 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11232 UNSPEC_GOT);
11233 x = gen_rtx_CONST (Pmode, x);
11234 emit_move_insn (reg11, x);
11235 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11236 x = gen_const_mem (Pmode, x);
11237 emit_move_insn (reg11, x);
11238 }
11239 else
11240 emit_move_insn (reg11, split_stack_fn_large);
11241
11242 fn = reg11;
11243
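	  /* Editor's note (illustrative, not part of the original sources):
	     the two 16-bit shifts pack ARGS_SIZE into the upper 32 bits,
	     presumably to avoid an undefined shift by 32 on hosts where
	     HOST_WIDE_INT is only 32 bits wide.  For example,
	     args_size == 0x10 and allocate == 0x1000 yield
	     argval == 0x0000001000001000. */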
11244 argval = ((args_size << 16) << 16) + allocate;
11245 emit_move_insn (reg10, GEN_INT (argval));
11246 }
11247 else
11248 {
11249 emit_move_insn (reg10, allocate_rtx);
11250 emit_move_insn (reg11, GEN_INT (args_size));
11251 use_reg (&call_fusage, reg11);
11252 }
11253
11254 use_reg (&call_fusage, reg10);
11255 }
11256 else
11257 {
11258 emit_insn (gen_push (GEN_INT (args_size)));
11259 emit_insn (gen_push (allocate_rtx));
11260 }
11261 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11262 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11263 NULL_RTX, false);
11264 add_function_usage_to (call_insn, call_fusage);
11265
11266 /* In order to make call/return prediction work right, we now need
11267 to execute a return instruction. See
11268 libgcc/config/i386/morestack.S for the details on how this works.
11269
11270 For flow purposes gcc must not see this as a return
11271 instruction--we need control flow to continue at the subsequent
11272 label. Therefore, we use an unspec. */
11273 gcc_assert (crtl->args.pops_args < 65536);
11274 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11275
11276 /* If we are in 64-bit mode and this function uses a static chain,
11277 we saved %r10 in %rax before calling __morestack. */
11278 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11279 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11280 gen_rtx_REG (word_mode, AX_REG));
11281
11282 /* If this function calls va_start, we need to store a pointer to
11283 the arguments on the old stack, because they may not have been
11284 all copied to the new stack. At this point the old stack can be
11285 found at the frame pointer value used by __morestack, because
11286 __morestack has set that up before calling back to us. Here we
11287 store that pointer in a scratch register, and in
11288 ix86_expand_prologue we store the scratch register in a stack
11289 slot. */
11290 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11291 {
11292 unsigned int scratch_regno;
11293 rtx frame_reg;
11294 int words;
11295
11296 scratch_regno = split_stack_prologue_scratch_regno ();
11297 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11298 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11299
11300 /* 64-bit:
11301 fp -> old fp value
11302 return address within this function
11303 return address of caller of this function
11304 stack arguments
11305 So we add three words to get to the stack arguments.
11306
11307 32-bit:
11308 fp -> old fp value
11309 return address within this function
11310 first argument to __morestack
11311 second argument to __morestack
11312 return address of caller of this function
11313 stack arguments
11314 So we add five words to get to the stack arguments.
11315 */
11316 words = TARGET_64BIT ? 3 : 5;
11317 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11318 gen_rtx_PLUS (Pmode, frame_reg,
11319 GEN_INT (words * UNITS_PER_WORD))));
11320
11321 varargs_label = gen_label_rtx ();
11322 emit_jump_insn (gen_jump (varargs_label));
11323 JUMP_LABEL (get_last_insn ()) = varargs_label;
11324
11325 emit_barrier ();
11326 }
11327
11328 emit_label (label);
11329 LABEL_NUSES (label) = 1;
11330
11331 /* If this function calls va_start, we now have to set the scratch
11332 register for the case where we do not call __morestack. In this
11333 case we need to set it based on the stack pointer. */
11334 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11335 {
11336 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11337 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11338 GEN_INT (UNITS_PER_WORD))));
11339
11340 emit_label (varargs_label);
11341 LABEL_NUSES (varargs_label) = 1;
11342 }
11343 }
11344
11345 /* We may have to tell the dataflow pass that the split stack prologue
11346 is initializing a scratch register. */
11347
11348 static void
11349 ix86_live_on_entry (bitmap regs)
11350 {
11351 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11352 {
11353 gcc_assert (flag_split_stack);
11354 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11355 }
11356 }
11357 \f
11358 /* Determine if OP is a suitable SUBREG RTX for use in an address. */
11359
11360 static bool
11361 ix86_address_subreg_operand (rtx op)
11362 {
11363 enum machine_mode mode;
11364
11365 if (!REG_P (op))
11366 return false;
11367
11368 mode = GET_MODE (op);
11369
11370 if (GET_MODE_CLASS (mode) != MODE_INT)
11371 return false;
11372
11373 /* Don't allow SUBREGs that span more than a word. They can lead to spill
11374 failures when the register is one word out of a two word structure. */
11375 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11376 return false;
11377
11378 /* Allow only SUBREGs of non-eliminable hard registers. */
11379 return register_no_elim_operand (op, mode);
11380 }
11381
11382 /* Extract the parts of an RTL expression that is a valid memory address
11383 for an instruction. Return 0 if the structure of the address is
11384 grossly off. Return -1 if the address contains ASHIFT, so it is not
11385 strictly valid, but still used for computing the length of the lea instruction. */
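
/* Illustrative example (editor's sketch, not part of the original sources):
   for the operand 12(%ebx,%eax,4) the incoming RTL is roughly

     (plus:SI (plus:SI (mult:SI (reg:SI ax) (const_int 4))
                       (reg:SI bx))
              (const_int 12))

   and the routine below fills OUT with base = %ebx, index = %eax,
   scale = 4, disp = 12 and returns 1. */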
11386
11387 int
11388 ix86_decompose_address (rtx addr, struct ix86_address *out)
11389 {
11390 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11391 rtx base_reg, index_reg;
11392 HOST_WIDE_INT scale = 1;
11393 rtx scale_rtx = NULL_RTX;
11394 rtx tmp;
11395 int retval = 1;
11396 enum ix86_address_seg seg = SEG_DEFAULT;
11397
11398 /* Allow zero-extended SImode addresses;
11399 they will be emitted with the addr32 prefix. */
11400 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11401 {
11402 if (GET_CODE (addr) == ZERO_EXTEND
11403 && GET_MODE (XEXP (addr, 0)) == SImode)
11404 addr = XEXP (addr, 0);
11405 else if (GET_CODE (addr) == AND
11406 && const_32bit_mask (XEXP (addr, 1), DImode))
11407 {
11408 addr = XEXP (addr, 0);
11409
11410 /* Strip subreg. */
11411 if (GET_CODE (addr) == SUBREG
11412 && GET_MODE (SUBREG_REG (addr)) == SImode)
11413 addr = SUBREG_REG (addr);
11414 }
11415 }
11416
11417 if (REG_P (addr))
11418 base = addr;
11419 else if (GET_CODE (addr) == SUBREG)
11420 {
11421 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11422 base = addr;
11423 else
11424 return 0;
11425 }
11426 else if (GET_CODE (addr) == PLUS)
11427 {
11428 rtx addends[4], op;
11429 int n = 0, i;
11430
11431 op = addr;
11432 do
11433 {
11434 if (n >= 4)
11435 return 0;
11436 addends[n++] = XEXP (op, 1);
11437 op = XEXP (op, 0);
11438 }
11439 while (GET_CODE (op) == PLUS);
11440 if (n >= 4)
11441 return 0;
11442 addends[n] = op;
11443
11444 for (i = n; i >= 0; --i)
11445 {
11446 op = addends[i];
11447 switch (GET_CODE (op))
11448 {
11449 case MULT:
11450 if (index)
11451 return 0;
11452 index = XEXP (op, 0);
11453 scale_rtx = XEXP (op, 1);
11454 break;
11455
11456 case ASHIFT:
11457 if (index)
11458 return 0;
11459 index = XEXP (op, 0);
11460 tmp = XEXP (op, 1);
11461 if (!CONST_INT_P (tmp))
11462 return 0;
11463 scale = INTVAL (tmp);
11464 if ((unsigned HOST_WIDE_INT) scale > 3)
11465 return 0;
11466 scale = 1 << scale;
11467 break;
11468
11469 case UNSPEC:
11470 if (XINT (op, 1) == UNSPEC_TP
11471 && TARGET_TLS_DIRECT_SEG_REFS
11472 && seg == SEG_DEFAULT)
11473 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11474 else
11475 return 0;
11476 break;
11477
11478 case SUBREG:
11479 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11480 return 0;
11481 /* FALLTHRU */
11482
11483 case REG:
11484 if (!base)
11485 base = op;
11486 else if (!index)
11487 index = op;
11488 else
11489 return 0;
11490 break;
11491
11492 case CONST:
11493 case CONST_INT:
11494 case SYMBOL_REF:
11495 case LABEL_REF:
11496 if (disp)
11497 return 0;
11498 disp = op;
11499 break;
11500
11501 default:
11502 return 0;
11503 }
11504 }
11505 }
11506 else if (GET_CODE (addr) == MULT)
11507 {
11508 index = XEXP (addr, 0); /* index*scale */
11509 scale_rtx = XEXP (addr, 1);
11510 }
11511 else if (GET_CODE (addr) == ASHIFT)
11512 {
11513 /* We're called for lea too, which implements ashift on occasion. */
11514 index = XEXP (addr, 0);
11515 tmp = XEXP (addr, 1);
11516 if (!CONST_INT_P (tmp))
11517 return 0;
11518 scale = INTVAL (tmp);
11519 if ((unsigned HOST_WIDE_INT) scale > 3)
11520 return 0;
11521 scale = 1 << scale;
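	/* For example (editor's note), (ashift (reg) (const_int 3)) denotes
	   reg*8 and is recorded as an index with scale 8. */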
11522 retval = -1;
11523 }
11524 else
11525 disp = addr; /* displacement */
11526
11527 /* Since address override works only on the (reg32) part in fs:(reg32),
11528 we can't use it as a memory operand. */
11529 if (Pmode != word_mode && seg == SEG_FS && (base || index))
11530 return 0;
11531
11532 if (index)
11533 {
11534 if (REG_P (index))
11535 ;
11536 else if (GET_CODE (index) == SUBREG
11537 && ix86_address_subreg_operand (SUBREG_REG (index)))
11538 ;
11539 else
11540 return 0;
11541 }
11542
11543 /* Extract the integral value of scale. */
11544 if (scale_rtx)
11545 {
11546 if (!CONST_INT_P (scale_rtx))
11547 return 0;
11548 scale = INTVAL (scale_rtx);
11549 }
11550
11551 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11552 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11553
11554 /* Avoid useless 0 displacement. */
11555 if (disp == const0_rtx && (base || index))
11556 disp = NULL_RTX;
11557
11558 /* Allow the arg pointer and stack pointer as an index if there is no scaling. */
11559 if (base_reg && index_reg && scale == 1
11560 && (index_reg == arg_pointer_rtx
11561 || index_reg == frame_pointer_rtx
11562 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11563 {
11564 rtx tmp;
11565 tmp = base, base = index, index = tmp;
11566 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11567 }
11568
11569 /* Special case: %ebp cannot be encoded as a base without a displacement.
11570 Similarly %r13. */
11571 if (!disp
11572 && base_reg
11573 && (base_reg == hard_frame_pointer_rtx
11574 || base_reg == frame_pointer_rtx
11575 || base_reg == arg_pointer_rtx
11576 || (REG_P (base_reg)
11577 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11578 || REGNO (base_reg) == R13_REG))))
11579 disp = const0_rtx;
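  /* Editor's background note (not from the original sources): with
     mod == 00 in the ModR/M byte, a base encoding of 101b means
     "disp32, no base", so %ebp -- and %r13, which shares the low three
     encoding bits -- can only serve as a base together with an explicit
     displacement; hence the zero displacement forced above. */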
11580
11581 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11582 Avoid this by transforming to [%esi+0].
11583 Reload calls address legitimization without cfun defined, so we need
11584 to test cfun for being non-NULL. */
11585 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11586 && base_reg && !index_reg && !disp
11587 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11588 disp = const0_rtx;
11589
11590 /* Special case: encode reg+reg instead of reg*2. */
11591 if (!base && index && scale == 2)
11592 base = index, base_reg = index_reg, scale = 1;
11593
11594 /* Special case: scaling cannot be encoded without base or displacement. */
11595 if (!base && !disp && index && scale != 1)
11596 disp = const0_rtx;
11597
11598 out->base = base;
11599 out->index = index;
11600 out->disp = disp;
11601 out->scale = scale;
11602 out->seg = seg;
11603
11604 return retval;
11605 }
11606 \f
11607 /* Return cost of the memory address x.
11608 For i386, it is better to use a complex address than let gcc copy
11609 the address into a reg and make a new pseudo. But not if the address
11610 requires two regs - that would mean more pseudos with longer
11611 lifetimes. */
11612 static int
11613 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11614 {
11615 struct ix86_address parts;
11616 int cost = 1;
11617 int ok = ix86_decompose_address (x, &parts);
11618
11619 gcc_assert (ok);
11620
11621 if (parts.base && GET_CODE (parts.base) == SUBREG)
11622 parts.base = SUBREG_REG (parts.base);
11623 if (parts.index && GET_CODE (parts.index) == SUBREG)
11624 parts.index = SUBREG_REG (parts.index);
11625
11626 /* Attempt to minimize number of registers in the address. */
11627 if ((parts.base
11628 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11629 || (parts.index
11630 && (!REG_P (parts.index)
11631 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11632 cost++;
11633
11634 if (parts.base
11635 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11636 && parts.index
11637 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11638 && parts.base != parts.index)
11639 cost++;
11640
11641 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11642 since its predecode logic can't detect the length of instructions
11643 and it degenerates to vector decoding. Increase the cost of such
11644 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11645 to split such addresses or even refuse such addresses at all.
11646
11647 Following addressing modes are affected:
11648 [base+scale*index]
11649 [scale*index+disp]
11650 [base+index]
11651
11652 The first and last case may be avoidable by explicitly coding the zero in
11653 the memory address, but I don't have an AMD-K6 machine handy to check this
11654 theory. */
11655
11656 if (TARGET_K6
11657 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11658 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11659 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11660 cost += 10;
11661
11662 return cost;
11663 }
11664 \f
11665 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11666 this is used to form addresses to local data when -fPIC is in
11667 use. */
11668
11669 static bool
11670 darwin_local_data_pic (rtx disp)
11671 {
11672 return (GET_CODE (disp) == UNSPEC
11673 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11674 }
11675
11676 /* Determine if a given RTX is a valid constant. We already know this
11677 satisfies CONSTANT_P. */
11678
11679 static bool
11680 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11681 {
11682 switch (GET_CODE (x))
11683 {
11684 case CONST:
11685 x = XEXP (x, 0);
11686
11687 if (GET_CODE (x) == PLUS)
11688 {
11689 if (!CONST_INT_P (XEXP (x, 1)))
11690 return false;
11691 x = XEXP (x, 0);
11692 }
11693
11694 if (TARGET_MACHO && darwin_local_data_pic (x))
11695 return true;
11696
11697 /* Only some unspecs are valid as "constants". */
11698 if (GET_CODE (x) == UNSPEC)
11699 switch (XINT (x, 1))
11700 {
11701 case UNSPEC_GOT:
11702 case UNSPEC_GOTOFF:
11703 case UNSPEC_PLTOFF:
11704 return TARGET_64BIT;
11705 case UNSPEC_TPOFF:
11706 case UNSPEC_NTPOFF:
11707 x = XVECEXP (x, 0, 0);
11708 return (GET_CODE (x) == SYMBOL_REF
11709 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11710 case UNSPEC_DTPOFF:
11711 x = XVECEXP (x, 0, 0);
11712 return (GET_CODE (x) == SYMBOL_REF
11713 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11714 default:
11715 return false;
11716 }
11717
11718 /* We must have drilled down to a symbol. */
11719 if (GET_CODE (x) == LABEL_REF)
11720 return true;
11721 if (GET_CODE (x) != SYMBOL_REF)
11722 return false;
11723 /* FALLTHRU */
11724
11725 case SYMBOL_REF:
11726 /* TLS symbols are never valid. */
11727 if (SYMBOL_REF_TLS_MODEL (x))
11728 return false;
11729
11730 /* DLLIMPORT symbols are never valid. */
11731 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11732 && SYMBOL_REF_DLLIMPORT_P (x))
11733 return false;
11734
11735 #if TARGET_MACHO
11736 /* mdynamic-no-pic */
11737 if (MACHO_DYNAMIC_NO_PIC_P)
11738 return machopic_symbol_defined_p (x);
11739 #endif
11740 break;
11741
11742 case CONST_DOUBLE:
11743 if (GET_MODE (x) == TImode
11744 && x != CONST0_RTX (TImode)
11745 && !TARGET_64BIT)
11746 return false;
11747 break;
11748
11749 case CONST_VECTOR:
11750 if (!standard_sse_constant_p (x))
11751 return false;
11752
11753 default:
11754 break;
11755 }
11756
11757 /* Otherwise we handle everything else in the move patterns. */
11758 return true;
11759 }
11760
11761 /* Determine if it's legal to put X into the constant pool. This
11762 is not possible for the address of thread-local symbols, which
11763 is checked above. */
11764
11765 static bool
11766 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11767 {
11768 /* We can always put integral constants and vectors in memory. */
11769 switch (GET_CODE (x))
11770 {
11771 case CONST_INT:
11772 case CONST_DOUBLE:
11773 case CONST_VECTOR:
11774 return false;
11775
11776 default:
11777 break;
11778 }
11779 return !ix86_legitimate_constant_p (mode, x);
11780 }
11781
11782
11783 /* Nonzero if the constant value X is a legitimate general operand
11784 when generating PIC code. It is given that flag_pic is on and
11785 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11786
11787 bool
11788 legitimate_pic_operand_p (rtx x)
11789 {
11790 rtx inner;
11791
11792 switch (GET_CODE (x))
11793 {
11794 case CONST:
11795 inner = XEXP (x, 0);
11796 if (GET_CODE (inner) == PLUS
11797 && CONST_INT_P (XEXP (inner, 1)))
11798 inner = XEXP (inner, 0);
11799
11800 /* Only some unspecs are valid as "constants". */
11801 if (GET_CODE (inner) == UNSPEC)
11802 switch (XINT (inner, 1))
11803 {
11804 case UNSPEC_GOT:
11805 case UNSPEC_GOTOFF:
11806 case UNSPEC_PLTOFF:
11807 return TARGET_64BIT;
11808 case UNSPEC_TPOFF:
11809 x = XVECEXP (inner, 0, 0);
11810 return (GET_CODE (x) == SYMBOL_REF
11811 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11812 case UNSPEC_MACHOPIC_OFFSET:
11813 return legitimate_pic_address_disp_p (x);
11814 default:
11815 return false;
11816 }
11817 /* FALLTHRU */
11818
11819 case SYMBOL_REF:
11820 case LABEL_REF:
11821 return legitimate_pic_address_disp_p (x);
11822
11823 default:
11824 return true;
11825 }
11826 }
11827
11828 /* Determine if a given CONST RTX is a valid memory displacement
11829 in PIC mode. */
11830
11831 bool
11832 legitimate_pic_address_disp_p (rtx disp)
11833 {
11834 bool saw_plus;
11835
11836 /* In 64bit mode we can allow direct addresses of symbols and labels
11837 when they are not dynamic symbols. */
11838 if (TARGET_64BIT)
11839 {
11840 rtx op0 = disp, op1;
11841
11842 switch (GET_CODE (disp))
11843 {
11844 case LABEL_REF:
11845 return true;
11846
11847 case CONST:
11848 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11849 break;
11850 op0 = XEXP (XEXP (disp, 0), 0);
11851 op1 = XEXP (XEXP (disp, 0), 1);
11852 if (!CONST_INT_P (op1)
11853 || INTVAL (op1) >= 16*1024*1024
11854 || INTVAL (op1) < -16*1024*1024)
11855 break;
11856 if (GET_CODE (op0) == LABEL_REF)
11857 return true;
11858 if (GET_CODE (op0) == CONST
11859 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11860 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11861 return true;
11862 if (GET_CODE (op0) == UNSPEC
11863 && XINT (op0, 1) == UNSPEC_PCREL)
11864 return true;
11865 if (GET_CODE (op0) != SYMBOL_REF)
11866 break;
11867 /* FALLTHRU */
11868
11869 case SYMBOL_REF:
11870 /* TLS references should always be enclosed in UNSPEC. */
11871 if (SYMBOL_REF_TLS_MODEL (op0))
11872 return false;
11873 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11874 && ix86_cmodel != CM_LARGE_PIC)
11875 return true;
11876 break;
11877
11878 default:
11879 break;
11880 }
11881 }
11882 if (GET_CODE (disp) != CONST)
11883 return false;
11884 disp = XEXP (disp, 0);
11885
11886 if (TARGET_64BIT)
11887 {
11888 /* It is unsafe to allow PLUS expressions; that would limit the allowed
11889 distance of GOT tables. We should not need these anyway. */
11890 if (GET_CODE (disp) != UNSPEC
11891 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11892 && XINT (disp, 1) != UNSPEC_GOTOFF
11893 && XINT (disp, 1) != UNSPEC_PCREL
11894 && XINT (disp, 1) != UNSPEC_PLTOFF))
11895 return false;
11896
11897 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11898 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11899 return false;
11900 return true;
11901 }
11902
11903 saw_plus = false;
11904 if (GET_CODE (disp) == PLUS)
11905 {
11906 if (!CONST_INT_P (XEXP (disp, 1)))
11907 return false;
11908 disp = XEXP (disp, 0);
11909 saw_plus = true;
11910 }
11911
11912 if (TARGET_MACHO && darwin_local_data_pic (disp))
11913 return true;
11914
11915 if (GET_CODE (disp) != UNSPEC)
11916 return false;
11917
11918 switch (XINT (disp, 1))
11919 {
11920 case UNSPEC_GOT:
11921 if (saw_plus)
11922 return false;
11923 /* We need to check for both symbols and labels because VxWorks loads
11924 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11925 details. */
11926 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11927 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11928 case UNSPEC_GOTOFF:
11929 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11930 While the ABI also specifies a 32bit relocation, we don't produce it
11931 in the small PIC model at all. */
11932 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11933 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11934 && !TARGET_64BIT)
11935 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11936 return false;
11937 case UNSPEC_GOTTPOFF:
11938 case UNSPEC_GOTNTPOFF:
11939 case UNSPEC_INDNTPOFF:
11940 if (saw_plus)
11941 return false;
11942 disp = XVECEXP (disp, 0, 0);
11943 return (GET_CODE (disp) == SYMBOL_REF
11944 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11945 case UNSPEC_NTPOFF:
11946 disp = XVECEXP (disp, 0, 0);
11947 return (GET_CODE (disp) == SYMBOL_REF
11948 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11949 case UNSPEC_DTPOFF:
11950 disp = XVECEXP (disp, 0, 0);
11951 return (GET_CODE (disp) == SYMBOL_REF
11952 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11953 }
11954
11955 return false;
11956 }
11957
11958 /* Recognizes RTL expressions that are valid memory addresses for an
11959 instruction. The MODE argument is the machine mode for the MEM
11960 expression that wants to use this address.
11961
11962 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11963 convert common non-canonical forms to canonical form so that they will
11964 be recognized. */
11965
11966 static bool
11967 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11968 rtx addr, bool strict)
11969 {
11970 struct ix86_address parts;
11971 rtx base, index, disp;
11972 HOST_WIDE_INT scale;
11973
11974 /* Since a constant address in x32 is sign extended to 64bit,
11975 we have to reject addresses from 0x80000000 to 0xffffffff. */
11976 if (TARGET_X32
11977 && CONST_INT_P (addr)
11978 && INTVAL (addr) < 0)
11979 return false;
11980
11981 if (ix86_decompose_address (addr, &parts) <= 0)
11982 /* Decomposition failed. */
11983 return false;
11984
11985 base = parts.base;
11986 index = parts.index;
11987 disp = parts.disp;
11988 scale = parts.scale;
11989
11990 /* Validate base register. */
11991 if (base)
11992 {
11993 rtx reg;
11994
11995 if (REG_P (base))
11996 reg = base;
11997 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11998 reg = SUBREG_REG (base);
11999 else
12000 /* Base is not a register. */
12001 return false;
12002
12003 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12004 return false;
12005
12006 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12007 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12008 /* Base is not valid. */
12009 return false;
12010 }
12011
12012 /* Validate index register. */
12013 if (index)
12014 {
12015 rtx reg;
12016
12017 if (REG_P (index))
12018 reg = index;
12019 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12020 reg = SUBREG_REG (index);
12021 else
12022 /* Index is not a register. */
12023 return false;
12024
12025 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12026 return false;
12027
12028 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12029 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12030 /* Index is not valid. */
12031 return false;
12032 }
12033
12034 /* Index and base should have the same mode. */
12035 if (base && index
12036 && GET_MODE (base) != GET_MODE (index))
12037 return false;
12038
12039 /* Validate scale factor. */
12040 if (scale != 1)
12041 {
12042 if (!index)
12043 /* Scale without index. */
12044 return false;
12045
12046 if (scale != 2 && scale != 4 && scale != 8)
12047 /* Scale is not a valid multiplier. */
12048 return false;
12049 }
12050
12051 /* Validate displacement. */
12052 if (disp)
12053 {
12054 if (GET_CODE (disp) == CONST
12055 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12056 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12057 switch (XINT (XEXP (disp, 0), 1))
12058 {
12059 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12060 used. While the ABI also specifies 32bit relocations, we don't produce
12061 them at all and use IP relative addressing instead. */
12062 case UNSPEC_GOT:
12063 case UNSPEC_GOTOFF:
12064 gcc_assert (flag_pic);
12065 if (!TARGET_64BIT)
12066 goto is_legitimate_pic;
12067
12068 /* 64bit address unspec. */
12069 return false;
12070
12071 case UNSPEC_GOTPCREL:
12072 case UNSPEC_PCREL:
12073 gcc_assert (flag_pic);
12074 goto is_legitimate_pic;
12075
12076 case UNSPEC_GOTTPOFF:
12077 case UNSPEC_GOTNTPOFF:
12078 case UNSPEC_INDNTPOFF:
12079 case UNSPEC_NTPOFF:
12080 case UNSPEC_DTPOFF:
12081 break;
12082
12083 case UNSPEC_STACK_CHECK:
12084 gcc_assert (flag_split_stack);
12085 break;
12086
12087 default:
12088 /* Invalid address unspec. */
12089 return false;
12090 }
12091
12092 else if (SYMBOLIC_CONST (disp)
12093 && (flag_pic
12094 || (TARGET_MACHO
12095 #if TARGET_MACHO
12096 && MACHOPIC_INDIRECT
12097 && !machopic_operand_p (disp)
12098 #endif
12099 )))
12100 {
12101
12102 is_legitimate_pic:
12103 if (TARGET_64BIT && (index || base))
12104 {
12105 /* foo@dtpoff(%rX) is ok. */
12106 if (GET_CODE (disp) != CONST
12107 || GET_CODE (XEXP (disp, 0)) != PLUS
12108 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12109 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12110 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12111 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12112 /* Non-constant pic memory reference. */
12113 return false;
12114 }
12115 else if ((!TARGET_MACHO || flag_pic)
12116 && ! legitimate_pic_address_disp_p (disp))
12117 /* Displacement is an invalid pic construct. */
12118 return false;
12119 #if TARGET_MACHO
12120 else if (MACHO_DYNAMIC_NO_PIC_P
12121 && !ix86_legitimate_constant_p (Pmode, disp))
12122 /* displacement must be referenced via non_lazy_pointer */
12123 return false;
12124 #endif
12125
12126 /* This code used to verify that a symbolic pic displacement
12127 includes the pic_offset_table_rtx register.
12128
12129 While this is a good idea, unfortunately these constructs may
12130 be created by the "adds using lea" optimization for incorrect
12131 code like:
12132
12133 int a;
12134 int foo(int i)
12135 {
12136 return *(&a+i);
12137 }
12138
12139 This code is nonsensical, but results in addressing the
12140 GOT table with a pic_offset_table_rtx base. We can't
12141 just refuse it easily, since it gets matched by the
12142 "addsi3" pattern, which later gets split to lea when the
12143 output register differs from the input. While this
12144 could be handled by a separate addsi pattern for this case
12145 that never results in lea, disabling this test seems to be
12146 the easier and correct fix for the crash. */
12147 }
12148 else if (GET_CODE (disp) != LABEL_REF
12149 && !CONST_INT_P (disp)
12150 && (GET_CODE (disp) != CONST
12151 || !ix86_legitimate_constant_p (Pmode, disp))
12152 && (GET_CODE (disp) != SYMBOL_REF
12153 || !ix86_legitimate_constant_p (Pmode, disp)))
12154 /* Displacement is not constant. */
12155 return false;
12156 else if (TARGET_64BIT
12157 && !x86_64_immediate_operand (disp, VOIDmode))
12158 /* Displacement is out of range. */
12159 return false;
12160 }
12161
12162 /* Everything looks valid. */
12163 return true;
12164 }
12165
12166 /* Determine if a given RTX is a valid constant address. */
12167
12168 bool
12169 constant_address_p (rtx x)
12170 {
12171 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12172 }
12173 \f
12174 /* Return a unique alias set for the GOT. */
12175
12176 static alias_set_type
12177 ix86_GOT_alias_set (void)
12178 {
12179 static alias_set_type set = -1;
12180 if (set == -1)
12181 set = new_alias_set ();
12182 return set;
12183 }
12184
12185 /* Return a legitimate reference for ORIG (an address) using the
12186 register REG. If REG is 0, a new pseudo is generated.
12187
12188 There are two types of references that must be handled:
12189
12190 1. Global data references must load the address from the GOT, via
12191 the PIC reg. An insn is emitted to do this load, and the reg is
12192 returned.
12193
12194 2. Static data references, constant pool addresses, and code labels
12195 compute the address as an offset from the GOT, whose base is in
12196 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12197 differentiate them from global data objects. The returned
12198 address is the PIC reg + an unspec constant.
12199
12200 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12201 reg also appears in the address. */
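
/* As an informal illustration of the two cases (a sketch, not verbatim
   compiler output), 32bit PIC code typically materializes them as

       movl	foo@GOT(%ebx), %eax	   global data: address loaded from the GOT
       leal	bar@GOTOFF(%ebx), %eax	   static data: PIC reg plus offset

   where %ebx holds pic_offset_table_rtx and "foo"/"bar" are made-up
   symbol names.  */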
12202
12203 static rtx
12204 legitimize_pic_address (rtx orig, rtx reg)
12205 {
12206 rtx addr = orig;
12207 rtx new_rtx = orig;
12208 rtx base;
12209
12210 #if TARGET_MACHO
12211 if (TARGET_MACHO && !TARGET_64BIT)
12212 {
12213 if (reg == 0)
12214 reg = gen_reg_rtx (Pmode);
12215 /* Use the generic Mach-O PIC machinery. */
12216 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12217 }
12218 #endif
12219
12220 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12221 new_rtx = addr;
12222 else if (TARGET_64BIT
12223 && ix86_cmodel != CM_SMALL_PIC
12224 && gotoff_operand (addr, Pmode))
12225 {
12226 rtx tmpreg;
12227 /* This symbol may be referenced via a displacement from the PIC
12228 base address (@GOTOFF). */
12229
12230 if (reload_in_progress)
12231 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12232 if (GET_CODE (addr) == CONST)
12233 addr = XEXP (addr, 0);
12234 if (GET_CODE (addr) == PLUS)
12235 {
12236 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12237 UNSPEC_GOTOFF);
12238 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12239 }
12240 else
12241 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12242 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12243 if (!reg)
12244 tmpreg = gen_reg_rtx (Pmode);
12245 else
12246 tmpreg = reg;
12247 emit_move_insn (tmpreg, new_rtx);
12248
12249 if (reg != 0)
12250 {
12251 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12252 tmpreg, 1, OPTAB_DIRECT);
12253 new_rtx = reg;
12254 }
12255 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12256 }
12257 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12258 {
12259 /* This symbol may be referenced via a displacement from the PIC
12260 base address (@GOTOFF). */
12261
12262 if (reload_in_progress)
12263 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12264 if (GET_CODE (addr) == CONST)
12265 addr = XEXP (addr, 0);
12266 if (GET_CODE (addr) == PLUS)
12267 {
12268 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12269 UNSPEC_GOTOFF);
12270 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12271 }
12272 else
12273 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12274 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12275 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12276
12277 if (reg != 0)
12278 {
12279 emit_move_insn (reg, new_rtx);
12280 new_rtx = reg;
12281 }
12282 }
12283 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12284 /* We can't use @GOTOFF for text labels on VxWorks;
12285 see gotoff_operand. */
12286 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12287 {
12288 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12289 {
12290 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12291 return legitimize_dllimport_symbol (addr, true);
12292 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12293 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12294 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12295 {
12296 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12297 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12298 }
12299 }
12300
12301 /* For x64 PE-COFF there is no GOT table, so we use the address
12302 directly. */
12303 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12304 {
12305 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12306 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12307
12308 if (reg == 0)
12309 reg = gen_reg_rtx (Pmode);
12310 emit_move_insn (reg, new_rtx);
12311 new_rtx = reg;
12312 }
12313 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12314 {
12315 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12316 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12317 new_rtx = gen_const_mem (Pmode, new_rtx);
12318 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12319
12320 if (reg == 0)
12321 reg = gen_reg_rtx (Pmode);
12322 /* Use gen_movsi directly, otherwise the address is loaded
12323 into a register for CSE. We don't want to CSE these addresses;
12324 instead we CSE addresses loaded from the GOT table, so skip this. */
12325 emit_insn (gen_movsi (reg, new_rtx));
12326 new_rtx = reg;
12327 }
12328 else
12329 {
12330 /* This symbol must be referenced via a load from the
12331 Global Offset Table (@GOT). */
12332
12333 if (reload_in_progress)
12334 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12335 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12336 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12337 if (TARGET_64BIT)
12338 new_rtx = force_reg (Pmode, new_rtx);
12339 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12340 new_rtx = gen_const_mem (Pmode, new_rtx);
12341 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12342
12343 if (reg == 0)
12344 reg = gen_reg_rtx (Pmode);
12345 emit_move_insn (reg, new_rtx);
12346 new_rtx = reg;
12347 }
12348 }
12349 else
12350 {
12351 if (CONST_INT_P (addr)
12352 && !x86_64_immediate_operand (addr, VOIDmode))
12353 {
12354 if (reg)
12355 {
12356 emit_move_insn (reg, addr);
12357 new_rtx = reg;
12358 }
12359 else
12360 new_rtx = force_reg (Pmode, addr);
12361 }
12362 else if (GET_CODE (addr) == CONST)
12363 {
12364 addr = XEXP (addr, 0);
12365
12366 /* We must match what we generated before. Assume the only
12367 unspecs that can get here are ours; not that we could do
12368 anything with them anyway. */
12369 if (GET_CODE (addr) == UNSPEC
12370 || (GET_CODE (addr) == PLUS
12371 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12372 return orig;
12373 gcc_assert (GET_CODE (addr) == PLUS);
12374 }
12375 if (GET_CODE (addr) == PLUS)
12376 {
12377 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12378
12379 /* Check first to see if this is a constant offset from a @GOTOFF
12380 symbol reference. */
12381 if (gotoff_operand (op0, Pmode)
12382 && CONST_INT_P (op1))
12383 {
12384 if (!TARGET_64BIT)
12385 {
12386 if (reload_in_progress)
12387 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12388 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12389 UNSPEC_GOTOFF);
12390 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12391 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12392 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12393
12394 if (reg != 0)
12395 {
12396 emit_move_insn (reg, new_rtx);
12397 new_rtx = reg;
12398 }
12399 }
12400 else
12401 {
12402 if (INTVAL (op1) < -16*1024*1024
12403 || INTVAL (op1) >= 16*1024*1024)
12404 {
12405 if (!x86_64_immediate_operand (op1, Pmode))
12406 op1 = force_reg (Pmode, op1);
12407 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12408 }
12409 }
12410 }
12411 else
12412 {
12413 base = legitimize_pic_address (XEXP (addr, 0), reg);
12414 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12415 base == reg ? NULL_RTX : reg);
12416
12417 if (CONST_INT_P (new_rtx))
12418 new_rtx = plus_constant (base, INTVAL (new_rtx));
12419 else
12420 {
12421 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12422 {
12423 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12424 new_rtx = XEXP (new_rtx, 1);
12425 }
12426 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12427 }
12428 }
12429 }
12430 }
12431 return new_rtx;
12432 }
12433 \f
12434 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12435
12436 static rtx
12437 get_thread_pointer (bool to_reg)
12438 {
12439 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12440
12441 if (GET_MODE (tp) != Pmode)
12442 tp = convert_to_mode (Pmode, tp, 1);
12443
12444 if (to_reg)
12445 tp = copy_addr_to_reg (tp);
12446
12447 return tp;
12448 }
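
/* Informally (assuming the usual GNU/Linux TLS layout), the UNSPEC_TP
   built above ends up as an access through the thread segment register,
   e.g. "movl %gs:0, %eax" in 32bit code or "movq %fs:0, %rax" in 64bit
   code.  */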
12449
12450 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12451
12452 static GTY(()) rtx ix86_tls_symbol;
12453
12454 static rtx
12455 ix86_tls_get_addr (void)
12456 {
12457 if (!ix86_tls_symbol)
12458 {
12459 const char *sym
12460 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12461 ? "___tls_get_addr" : "__tls_get_addr");
12462
12463 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12464 }
12465
12466 return ix86_tls_symbol;
12467 }
12468
12469 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12470
12471 static GTY(()) rtx ix86_tls_module_base_symbol;
12472
12473 rtx
12474 ix86_tls_module_base (void)
12475 {
12476 if (!ix86_tls_module_base_symbol)
12477 {
12478 ix86_tls_module_base_symbol
12479 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12480
12481 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12482 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12483 }
12484
12485 return ix86_tls_module_base_symbol;
12486 }
12487
12488 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12489 false if we expect this to be used for a memory address and true if
12490 we expect to load the address into a register. */
12491
12492 static rtx
12493 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12494 {
12495 rtx dest, base, off;
12496 rtx pic = NULL_RTX, tp = NULL_RTX;
12497 int type;
12498
12499 switch (model)
12500 {
12501 case TLS_MODEL_GLOBAL_DYNAMIC:
12502 dest = gen_reg_rtx (Pmode);
12503
12504 if (!TARGET_64BIT)
12505 {
12506 if (flag_pic)
12507 pic = pic_offset_table_rtx;
12508 else
12509 {
12510 pic = gen_reg_rtx (Pmode);
12511 emit_insn (gen_set_got (pic));
12512 }
12513 }
12514
12515 if (TARGET_GNU2_TLS)
12516 {
12517 if (TARGET_64BIT)
12518 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12519 else
12520 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12521
12522 tp = get_thread_pointer (true);
12523 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12524
12525 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12526 }
12527 else
12528 {
12529 rtx caddr = ix86_tls_get_addr ();
12530
12531 if (TARGET_64BIT)
12532 {
12533 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12534
12535 start_sequence ();
12536 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12537 insns = get_insns ();
12538 end_sequence ();
12539
12540 RTL_CONST_CALL_P (insns) = 1;
12541 emit_libcall_block (insns, dest, rax, x);
12542 }
12543 else
12544 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12545 }
12546 break;
12547
12548 case TLS_MODEL_LOCAL_DYNAMIC:
12549 base = gen_reg_rtx (Pmode);
12550
12551 if (!TARGET_64BIT)
12552 {
12553 if (flag_pic)
12554 pic = pic_offset_table_rtx;
12555 else
12556 {
12557 pic = gen_reg_rtx (Pmode);
12558 emit_insn (gen_set_got (pic));
12559 }
12560 }
12561
12562 if (TARGET_GNU2_TLS)
12563 {
12564 rtx tmp = ix86_tls_module_base ();
12565
12566 if (TARGET_64BIT)
12567 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12568 else
12569 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12570
12571 tp = get_thread_pointer (true);
12572 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12573 gen_rtx_MINUS (Pmode, tmp, tp));
12574 }
12575 else
12576 {
12577 rtx caddr = ix86_tls_get_addr ();
12578
12579 if (TARGET_64BIT)
12580 {
12581 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12582
12583 start_sequence ();
12584 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12585 insns = get_insns ();
12586 end_sequence ();
12587
12588 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12589 share the LD_BASE result with other LD model accesses. */
12590 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12591 UNSPEC_TLS_LD_BASE);
12592
12593 RTL_CONST_CALL_P (insns) = 1;
12594 emit_libcall_block (insns, base, rax, eqv);
12595 }
12596 else
12597 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12598 }
12599
12600 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12601 off = gen_rtx_CONST (Pmode, off);
12602
12603 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12604
12605 if (TARGET_GNU2_TLS)
12606 {
12607 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12608
12609 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12610 }
12611 break;
12612
12613 case TLS_MODEL_INITIAL_EXEC:
12614 if (TARGET_64BIT)
12615 {
12616 if (TARGET_SUN_TLS)
12617 {
12618 /* The Sun linker took the AMD64 TLS spec literally
12619 and can only handle %rax as the destination of the
12620 initial-exec code sequence. */
12621
12622 dest = gen_reg_rtx (Pmode);
12623 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12624 return dest;
12625 }
12626 else if (Pmode == SImode)
12627 {
12628 /* Always generate
12629 movl %fs:0, %reg32
12630 addl x@gottpoff(%rip), %reg32
12631 to support the linker IE->LE optimization and avoid
12632 fs:(%reg32) as a memory operand. */
12633 dest = gen_reg_rtx (Pmode);
12634 emit_insn (gen_tls_initial_exec_x32 (dest, x));
12635 return dest;
12636 }
12637
12638 pic = NULL;
12639 type = UNSPEC_GOTNTPOFF;
12640 }
12641 else if (flag_pic)
12642 {
12643 if (reload_in_progress)
12644 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12645 pic = pic_offset_table_rtx;
12646 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12647 }
12648 else if (!TARGET_ANY_GNU_TLS)
12649 {
12650 pic = gen_reg_rtx (Pmode);
12651 emit_insn (gen_set_got (pic));
12652 type = UNSPEC_GOTTPOFF;
12653 }
12654 else
12655 {
12656 pic = NULL;
12657 type = UNSPEC_INDNTPOFF;
12658 }
12659
12660 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12661 off = gen_rtx_CONST (Pmode, off);
12662 if (pic)
12663 off = gen_rtx_PLUS (Pmode, pic, off);
12664 off = gen_const_mem (Pmode, off);
12665 set_mem_alias_set (off, ix86_GOT_alias_set ());
12666
12667 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12668 {
12669 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12670 off = force_reg (Pmode, off);
12671 return gen_rtx_PLUS (Pmode, base, off);
12672 }
12673 else
12674 {
12675 base = get_thread_pointer (true);
12676 dest = gen_reg_rtx (Pmode);
12677 emit_insn (gen_subsi3 (dest, base, off));
12678 }
12679 break;
12680
12681 case TLS_MODEL_LOCAL_EXEC:
12682 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12683 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12684 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12685 off = gen_rtx_CONST (Pmode, off);
12686
12687 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12688 {
12689 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12690 return gen_rtx_PLUS (Pmode, base, off);
12691 }
12692 else
12693 {
12694 base = get_thread_pointer (true);
12695 dest = gen_reg_rtx (Pmode);
12696 emit_insn (gen_subsi3 (dest, base, off));
12697 }
12698 break;
12699
12700 default:
12701 gcc_unreachable ();
12702 }
12703
12704 return dest;
12705 }
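
/* For reference, a sketch (not verbatim compiler output) of what the
   initial-exec case above produces on 64bit GNU/Linux for a symbol "x":

       movq	x@gottpoff(%rip), %rax
       movq	%fs:(%rax), %rdx

   while the local-exec case collapses to a direct %fs:x@tpoff
   reference.  */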
12706
12707 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12708 to symbol DECL. */
12709
12710 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12711 htab_t dllimport_map;
12712
12713 static tree
12714 get_dllimport_decl (tree decl)
12715 {
12716 struct tree_map *h, in;
12717 void **loc;
12718 const char *name;
12719 const char *prefix;
12720 size_t namelen, prefixlen;
12721 char *imp_name;
12722 tree to;
12723 rtx rtl;
12724
12725 if (!dllimport_map)
12726 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12727
12728 in.hash = htab_hash_pointer (decl);
12729 in.base.from = decl;
12730 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12731 h = (struct tree_map *) *loc;
12732 if (h)
12733 return h->to;
12734
12735 *loc = h = ggc_alloc_tree_map ();
12736 h->hash = in.hash;
12737 h->base.from = decl;
12738 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12739 VAR_DECL, NULL, ptr_type_node);
12740 DECL_ARTIFICIAL (to) = 1;
12741 DECL_IGNORED_P (to) = 1;
12742 DECL_EXTERNAL (to) = 1;
12743 TREE_READONLY (to) = 1;
12744
12745 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12746 name = targetm.strip_name_encoding (name);
12747 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12748 ? "*__imp_" : "*__imp__";
12749 namelen = strlen (name);
12750 prefixlen = strlen (prefix);
12751 imp_name = (char *) alloca (namelen + prefixlen + 1);
12752 memcpy (imp_name, prefix, prefixlen);
12753 memcpy (imp_name + prefixlen, name, namelen + 1);
12754
12755 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12756 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12757 SET_SYMBOL_REF_DECL (rtl, to);
12758 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12759
12760 rtl = gen_const_mem (Pmode, rtl);
12761 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12762
12763 SET_DECL_RTL (to, rtl);
12764 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12765
12766 return to;
12767 }
12768
12769 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12770 true if we require the result be a register. */
12771
12772 static rtx
12773 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12774 {
12775 tree imp_decl;
12776 rtx x;
12777
12778 gcc_assert (SYMBOL_REF_DECL (symbol));
12779 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12780
12781 x = DECL_RTL (imp_decl);
12782 if (want_reg)
12783 x = force_reg (Pmode, x);
12784 return x;
12785 }
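
/* Informal example: a reference to a dllimport'ed variable "foo" is not
   resolved directly; it is rewritten into a load through the import
   pointer built above ("__imp__foo" on targets using a "_" user label
   prefix, "__imp_foo" otherwise), which the dynamic loader fills in at
   load time.  */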
12786
12787 /* Try machine-dependent ways of modifying an illegitimate address
12788 to be legitimate. If we find one, return the new, valid address.
12789 This macro is used in only one place: `memory_address' in explow.c.
12790
12791 OLDX is the address as it was before break_out_memory_refs was called.
12792 In some cases it is useful to look at this to decide what needs to be done.
12793
12794 It is always safe for this macro to do nothing. It exists to recognize
12795 opportunities to optimize the output.
12796
12797 For the 80386, we handle X+REG by loading X into a register R and
12798 using R+REG. R will go in a general reg and indexing will be used.
12799 However, if REG is a broken-out memory address or multiplication,
12800 nothing needs to be done because REG can certainly go in a general reg.
12801
12802 When -fpic is used, special handling is needed for symbolic references.
12803 See comments by legitimize_pic_address in i386.c for details. */
12804
12805 static rtx
12806 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12807 enum machine_mode mode)
12808 {
12809 int changed = 0;
12810 unsigned log;
12811
12812 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12813 if (log)
12814 return legitimize_tls_address (x, (enum tls_model) log, false);
12815 if (GET_CODE (x) == CONST
12816 && GET_CODE (XEXP (x, 0)) == PLUS
12817 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12818 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12819 {
12820 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12821 (enum tls_model) log, false);
12822 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12823 }
12824
12825 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12826 {
12827 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12828 return legitimize_dllimport_symbol (x, true);
12829 if (GET_CODE (x) == CONST
12830 && GET_CODE (XEXP (x, 0)) == PLUS
12831 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12832 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12833 {
12834 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12835 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12836 }
12837 }
12838
12839 if (flag_pic && SYMBOLIC_CONST (x))
12840 return legitimize_pic_address (x, 0);
12841
12842 #if TARGET_MACHO
12843 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12844 return machopic_indirect_data_reference (x, 0);
12845 #endif
12846
12847 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12848 if (GET_CODE (x) == ASHIFT
12849 && CONST_INT_P (XEXP (x, 1))
12850 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12851 {
12852 changed = 1;
12853 log = INTVAL (XEXP (x, 1));
12854 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12855 GEN_INT (1 << log));
12856 }
12857
12858 if (GET_CODE (x) == PLUS)
12859 {
12860 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12861
12862 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12863 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12864 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12865 {
12866 changed = 1;
12867 log = INTVAL (XEXP (XEXP (x, 0), 1));
12868 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12869 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12870 GEN_INT (1 << log));
12871 }
12872
12873 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12874 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12875 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12876 {
12877 changed = 1;
12878 log = INTVAL (XEXP (XEXP (x, 1), 1));
12879 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12880 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12881 GEN_INT (1 << log));
12882 }
12883
12884 /* Put multiply first if it isn't already. */
12885 if (GET_CODE (XEXP (x, 1)) == MULT)
12886 {
12887 rtx tmp = XEXP (x, 0);
12888 XEXP (x, 0) = XEXP (x, 1);
12889 XEXP (x, 1) = tmp;
12890 changed = 1;
12891 }
12892
12893 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12894 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12895 created by virtual register instantiation, register elimination, and
12896 similar optimizations. */
12897 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12898 {
12899 changed = 1;
12900 x = gen_rtx_PLUS (Pmode,
12901 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12902 XEXP (XEXP (x, 1), 0)),
12903 XEXP (XEXP (x, 1), 1));
12904 }
12905
12906 /* Canonicalize
12907 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12908 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12909 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12910 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12911 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12912 && CONSTANT_P (XEXP (x, 1)))
12913 {
12914 rtx constant;
12915 rtx other = NULL_RTX;
12916
12917 if (CONST_INT_P (XEXP (x, 1)))
12918 {
12919 constant = XEXP (x, 1);
12920 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12921 }
12922 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12923 {
12924 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12925 other = XEXP (x, 1);
12926 }
12927 else
12928 constant = 0;
12929
12930 if (constant)
12931 {
12932 changed = 1;
12933 x = gen_rtx_PLUS (Pmode,
12934 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12935 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12936 plus_constant (other, INTVAL (constant)));
12937 }
12938 }
12939
12940 if (changed && ix86_legitimate_address_p (mode, x, false))
12941 return x;
12942
12943 if (GET_CODE (XEXP (x, 0)) == MULT)
12944 {
12945 changed = 1;
12946 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12947 }
12948
12949 if (GET_CODE (XEXP (x, 1)) == MULT)
12950 {
12951 changed = 1;
12952 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12953 }
12954
12955 if (changed
12956 && REG_P (XEXP (x, 1))
12957 && REG_P (XEXP (x, 0)))
12958 return x;
12959
12960 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12961 {
12962 changed = 1;
12963 x = legitimize_pic_address (x, 0);
12964 }
12965
12966 if (changed && ix86_legitimate_address_p (mode, x, false))
12967 return x;
12968
12969 if (REG_P (XEXP (x, 0)))
12970 {
12971 rtx temp = gen_reg_rtx (Pmode);
12972 rtx val = force_operand (XEXP (x, 1), temp);
12973 if (val != temp)
12974 {
12975 if (GET_MODE (val) != Pmode)
12976 val = convert_to_mode (Pmode, val, 1);
12977 emit_move_insn (temp, val);
12978 }
12979
12980 XEXP (x, 1) = temp;
12981 return x;
12982 }
12983
12984 else if (REG_P (XEXP (x, 1)))
12985 {
12986 rtx temp = gen_reg_rtx (Pmode);
12987 rtx val = force_operand (XEXP (x, 0), temp);
12988 if (val != temp)
12989 {
12990 if (GET_MODE (val) != Pmode)
12991 val = convert_to_mode (Pmode, val, 1);
12992 emit_move_insn (temp, val);
12993 }
12994
12995 XEXP (x, 0) = temp;
12996 return x;
12997 }
12998 }
12999
13000 return x;
13001 }
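
/* An informal example of the canonicalization performed above: an
   address like (plus (ashift (reg) (const_int 2)) (reg)) is rewritten
   to (plus (mult (reg) (const_int 4)) (reg)) so that it matches the
   native scaled-index addressing form, e.g. "(%ebx,%ecx,4)".  */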
13002 \f
13003 /* Print an integer constant expression in assembler syntax. Addition
13004 and subtraction are the only arithmetic that may appear in these
13005 expressions. FILE is the stdio stream to write to, X is the rtx, and
13006 CODE is the operand print code from the output string. */
13007
13008 static void
13009 output_pic_addr_const (FILE *file, rtx x, int code)
13010 {
13011 char buf[256];
13012
13013 switch (GET_CODE (x))
13014 {
13015 case PC:
13016 gcc_assert (flag_pic);
13017 putc ('.', file);
13018 break;
13019
13020 case SYMBOL_REF:
13021 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13022 output_addr_const (file, x);
13023 else
13024 {
13025 const char *name = XSTR (x, 0);
13026
13027 /* Mark the decl as referenced so that cgraph will
13028 output the function. */
13029 if (SYMBOL_REF_DECL (x))
13030 mark_decl_referenced (SYMBOL_REF_DECL (x));
13031
13032 #if TARGET_MACHO
13033 if (MACHOPIC_INDIRECT
13034 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13035 name = machopic_indirection_name (x, /*stub_p=*/true);
13036 #endif
13037 assemble_name (file, name);
13038 }
13039 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13040 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13041 fputs ("@PLT", file);
13042 break;
13043
13044 case LABEL_REF:
13045 x = XEXP (x, 0);
13046 /* FALLTHRU */
13047 case CODE_LABEL:
13048 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13049 assemble_name (asm_out_file, buf);
13050 break;
13051
13052 case CONST_INT:
13053 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13054 break;
13055
13056 case CONST:
13057 /* This used to output parentheses around the expression,
13058 but that does not work on the 386 (either ATT or BSD assembler). */
13059 output_pic_addr_const (file, XEXP (x, 0), code);
13060 break;
13061
13062 case CONST_DOUBLE:
13063 if (GET_MODE (x) == VOIDmode)
13064 {
13065 /* We can use %d if the number is <32 bits and positive. */
13066 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13067 fprintf (file, "0x%lx%08lx",
13068 (unsigned long) CONST_DOUBLE_HIGH (x),
13069 (unsigned long) CONST_DOUBLE_LOW (x));
13070 else
13071 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13072 }
13073 else
13074 /* We can't handle floating point constants;
13075 TARGET_PRINT_OPERAND must handle them. */
13076 output_operand_lossage ("floating constant misused");
13077 break;
13078
13079 case PLUS:
13080 /* Some assemblers need integer constants to appear first. */
13081 if (CONST_INT_P (XEXP (x, 0)))
13082 {
13083 output_pic_addr_const (file, XEXP (x, 0), code);
13084 putc ('+', file);
13085 output_pic_addr_const (file, XEXP (x, 1), code);
13086 }
13087 else
13088 {
13089 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13090 output_pic_addr_const (file, XEXP (x, 1), code);
13091 putc ('+', file);
13092 output_pic_addr_const (file, XEXP (x, 0), code);
13093 }
13094 break;
13095
13096 case MINUS:
13097 if (!TARGET_MACHO)
13098 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13099 output_pic_addr_const (file, XEXP (x, 0), code);
13100 putc ('-', file);
13101 output_pic_addr_const (file, XEXP (x, 1), code);
13102 if (!TARGET_MACHO)
13103 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13104 break;
13105
13106 case UNSPEC:
13107 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13108 {
13109 bool f = i386_asm_output_addr_const_extra (file, x);
13110 gcc_assert (f);
13111 break;
13112 }
13113
13114 gcc_assert (XVECLEN (x, 0) == 1);
13115 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13116 switch (XINT (x, 1))
13117 {
13118 case UNSPEC_GOT:
13119 fputs ("@GOT", file);
13120 break;
13121 case UNSPEC_GOTOFF:
13122 fputs ("@GOTOFF", file);
13123 break;
13124 case UNSPEC_PLTOFF:
13125 fputs ("@PLTOFF", file);
13126 break;
13127 case UNSPEC_PCREL:
13128 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13129 "(%rip)" : "[rip]", file);
13130 break;
13131 case UNSPEC_GOTPCREL:
13132 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13133 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13134 break;
13135 case UNSPEC_GOTTPOFF:
13136 /* FIXME: This might be @TPOFF in Sun ld too. */
13137 fputs ("@gottpoff", file);
13138 break;
13139 case UNSPEC_TPOFF:
13140 fputs ("@tpoff", file);
13141 break;
13142 case UNSPEC_NTPOFF:
13143 if (TARGET_64BIT)
13144 fputs ("@tpoff", file);
13145 else
13146 fputs ("@ntpoff", file);
13147 break;
13148 case UNSPEC_DTPOFF:
13149 fputs ("@dtpoff", file);
13150 break;
13151 case UNSPEC_GOTNTPOFF:
13152 if (TARGET_64BIT)
13153 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13154 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13155 else
13156 fputs ("@gotntpoff", file);
13157 break;
13158 case UNSPEC_INDNTPOFF:
13159 fputs ("@indntpoff", file);
13160 break;
13161 #if TARGET_MACHO
13162 case UNSPEC_MACHOPIC_OFFSET:
13163 putc ('-', file);
13164 machopic_output_function_base_name (file);
13165 break;
13166 #endif
13167 default:
13168 output_operand_lossage ("invalid UNSPEC as operand");
13169 break;
13170 }
13171 break;
13172
13173 default:
13174 output_operand_lossage ("invalid expression as operand");
13175 }
13176 }
13177
13178 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13179 We need to emit DTP-relative relocations. */
13180
13181 static void ATTRIBUTE_UNUSED
13182 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13183 {
13184 fputs (ASM_LONG, file);
13185 output_addr_const (file, x);
13186 fputs ("@dtpoff", file);
13187 switch (size)
13188 {
13189 case 4:
13190 break;
13191 case 8:
13192 fputs (", 0", file);
13193 break;
13194 default:
13195 gcc_unreachable ();
13196 }
13197 }
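
/* For example, with SIZE == 4 the code above emits the symbol followed
   by "@dtpoff" after ASM_LONG, and with SIZE == 8 it appends ", 0" to
   pad the upper word.  */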
13198
13199 /* Return true if X is a representation of the PIC register. This copes
13200 with calls from ix86_find_base_term, where the register might have
13201 been replaced by a cselib value. */
13202
13203 static bool
13204 ix86_pic_register_p (rtx x)
13205 {
13206 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13207 return (pic_offset_table_rtx
13208 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13209 else
13210 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13211 }
13212
13213 /* Helper function for ix86_delegitimize_address.
13214 Attempt to delegitimize TLS local-exec accesses. */
13215
13216 static rtx
13217 ix86_delegitimize_tls_address (rtx orig_x)
13218 {
13219 rtx x = orig_x, unspec;
13220 struct ix86_address addr;
13221
13222 if (!TARGET_TLS_DIRECT_SEG_REFS)
13223 return orig_x;
13224 if (MEM_P (x))
13225 x = XEXP (x, 0);
13226 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13227 return orig_x;
13228 if (ix86_decompose_address (x, &addr) == 0
13229 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13230 || addr.disp == NULL_RTX
13231 || GET_CODE (addr.disp) != CONST)
13232 return orig_x;
13233 unspec = XEXP (addr.disp, 0);
13234 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13235 unspec = XEXP (unspec, 0);
13236 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13237 return orig_x;
13238 x = XVECEXP (unspec, 0, 0);
13239 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13240 if (unspec != XEXP (addr.disp, 0))
13241 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13242 if (addr.index)
13243 {
13244 rtx idx = addr.index;
13245 if (addr.scale != 1)
13246 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13247 x = gen_rtx_PLUS (Pmode, idx, x);
13248 }
13249 if (addr.base)
13250 x = gen_rtx_PLUS (Pmode, addr.base, x);
13251 if (MEM_P (orig_x))
13252 x = replace_equiv_address_nv (orig_x, x);
13253 return x;
13254 }
13255
13256 /* In the name of slightly smaller debug output, and to cater to
13257 general assembler lossage, recognize PIC+GOTOFF and turn it back
13258 into a direct symbol reference.
13259
13260 On Darwin, this is necessary to avoid a crash, because Darwin
13261 has a different PIC label for each routine but the DWARF debugging
13262 information is not associated with any particular routine, so it's
13263 necessary to remove references to the PIC label from RTL stored by
13264 the DWARF output code. */
13265
13266 static rtx
13267 ix86_delegitimize_address (rtx x)
13268 {
13269 rtx orig_x = delegitimize_mem_from_attrs (x);
13270 /* addend is NULL or some rtx if x is something+GOTOFF where
13271 something doesn't include the PIC register. */
13272 rtx addend = NULL_RTX;
13273 /* reg_addend is NULL or a multiple of some register. */
13274 rtx reg_addend = NULL_RTX;
13275 /* const_addend is NULL or a const_int. */
13276 rtx const_addend = NULL_RTX;
13277 /* This is the result, or NULL. */
13278 rtx result = NULL_RTX;
13279
13280 x = orig_x;
13281
13282 if (MEM_P (x))
13283 x = XEXP (x, 0);
13284
13285 if (TARGET_64BIT)
13286 {
13287 if (GET_CODE (x) == CONST
13288 && GET_CODE (XEXP (x, 0)) == PLUS
13289 && GET_MODE (XEXP (x, 0)) == Pmode
13290 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13291 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13292 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13293 {
13294 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13295 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13296 if (MEM_P (orig_x))
13297 x = replace_equiv_address_nv (orig_x, x);
13298 return x;
13299 }
13300 if (GET_CODE (x) != CONST
13301 || GET_CODE (XEXP (x, 0)) != UNSPEC
13302 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13303 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13304 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13305 return ix86_delegitimize_tls_address (orig_x);
13306 x = XVECEXP (XEXP (x, 0), 0, 0);
13307 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13308 {
13309 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13310 GET_MODE (x), 0);
13311 if (x == NULL_RTX)
13312 return orig_x;
13313 }
13314 return x;
13315 }
13316
13317 if (GET_CODE (x) != PLUS
13318 || GET_CODE (XEXP (x, 1)) != CONST)
13319 return ix86_delegitimize_tls_address (orig_x);
13320
13321 if (ix86_pic_register_p (XEXP (x, 0)))
13322 /* %ebx + GOT/GOTOFF */
13323 ;
13324 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13325 {
13326 /* %ebx + %reg * scale + GOT/GOTOFF */
13327 reg_addend = XEXP (x, 0);
13328 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13329 reg_addend = XEXP (reg_addend, 1);
13330 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13331 reg_addend = XEXP (reg_addend, 0);
13332 else
13333 {
13334 reg_addend = NULL_RTX;
13335 addend = XEXP (x, 0);
13336 }
13337 }
13338 else
13339 addend = XEXP (x, 0);
13340
13341 x = XEXP (XEXP (x, 1), 0);
13342 if (GET_CODE (x) == PLUS
13343 && CONST_INT_P (XEXP (x, 1)))
13344 {
13345 const_addend = XEXP (x, 1);
13346 x = XEXP (x, 0);
13347 }
13348
13349 if (GET_CODE (x) == UNSPEC
13350 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13351 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13352 result = XVECEXP (x, 0, 0);
13353
13354 if (TARGET_MACHO && darwin_local_data_pic (x)
13355 && !MEM_P (orig_x))
13356 result = XVECEXP (x, 0, 0);
13357
13358 if (! result)
13359 return ix86_delegitimize_tls_address (orig_x);
13360
13361 if (const_addend)
13362 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13363 if (reg_addend)
13364 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13365 if (addend)
13366 {
13367 /* If the rest of original X doesn't involve the PIC register, add
13368 addend and subtract pic_offset_table_rtx. This can happen e.g.
13369 for code like:
13370 leal (%ebx, %ecx, 4), %ecx
13371 ...
13372 movl foo@GOTOFF(%ecx), %edx
13373 in which case we return (%ecx - %ebx) + foo. */
13374 if (pic_offset_table_rtx)
13375 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13376 pic_offset_table_rtx),
13377 result);
13378 else
13379 return orig_x;
13380 }
13381 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13382 {
13383 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13384 if (result == NULL_RTX)
13385 return orig_x;
13386 }
13387 return result;
13388 }
13389
13390 /* If X is a machine specific address (i.e. a symbol or label being
13391 referenced as a displacement from the GOT implemented using an
13392 UNSPEC), then return the base term. Otherwise return X. */
13393
13394 rtx
13395 ix86_find_base_term (rtx x)
13396 {
13397 rtx term;
13398
13399 if (TARGET_64BIT)
13400 {
13401 if (GET_CODE (x) != CONST)
13402 return x;
13403 term = XEXP (x, 0);
13404 if (GET_CODE (term) == PLUS
13405 && (CONST_INT_P (XEXP (term, 1))
13406 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13407 term = XEXP (term, 0);
13408 if (GET_CODE (term) != UNSPEC
13409 || (XINT (term, 1) != UNSPEC_GOTPCREL
13410 && XINT (term, 1) != UNSPEC_PCREL))
13411 return x;
13412
13413 return XVECEXP (term, 0, 0);
13414 }
13415
13416 return ix86_delegitimize_address (x);
13417 }
13418 \f
13419 static void
13420 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13421 int fp, FILE *file)
13422 {
13423 const char *suffix;
13424
13425 if (mode == CCFPmode || mode == CCFPUmode)
13426 {
13427 code = ix86_fp_compare_code_to_integer (code);
13428 mode = CCmode;
13429 }
13430 if (reverse)
13431 code = reverse_condition (code);
13432
13433 switch (code)
13434 {
13435 case EQ:
13436 switch (mode)
13437 {
13438 case CCAmode:
13439 suffix = "a";
13440 break;
13441
13442 case CCCmode:
13443 suffix = "c";
13444 break;
13445
13446 case CCOmode:
13447 suffix = "o";
13448 break;
13449
13450 case CCSmode:
13451 suffix = "s";
13452 break;
13453
13454 default:
13455 suffix = "e";
13456 }
13457 break;
13458 case NE:
13459 switch (mode)
13460 {
13461 case CCAmode:
13462 suffix = "na";
13463 break;
13464
13465 case CCCmode:
13466 suffix = "nc";
13467 break;
13468
13469 case CCOmode:
13470 suffix = "no";
13471 break;
13472
13473 case CCSmode:
13474 suffix = "ns";
13475 break;
13476
13477 default:
13478 suffix = "ne";
13479 }
13480 break;
13481 case GT:
13482 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13483 suffix = "g";
13484 break;
13485 case GTU:
13486 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13487 Those same assemblers have the same but opposite lossage on cmov. */
13488 if (mode == CCmode)
13489 suffix = fp ? "nbe" : "a";
13490 else if (mode == CCCmode)
13491 suffix = "b";
13492 else
13493 gcc_unreachable ();
13494 break;
13495 case LT:
13496 switch (mode)
13497 {
13498 case CCNOmode:
13499 case CCGOCmode:
13500 suffix = "s";
13501 break;
13502
13503 case CCmode:
13504 case CCGCmode:
13505 suffix = "l";
13506 break;
13507
13508 default:
13509 gcc_unreachable ();
13510 }
13511 break;
13512 case LTU:
13513 gcc_assert (mode == CCmode || mode == CCCmode);
13514 suffix = "b";
13515 break;
13516 case GE:
13517 switch (mode)
13518 {
13519 case CCNOmode:
13520 case CCGOCmode:
13521 suffix = "ns";
13522 break;
13523
13524 case CCmode:
13525 case CCGCmode:
13526 suffix = "ge";
13527 break;
13528
13529 default:
13530 gcc_unreachable ();
13531 }
13532 break;
13533 case GEU:
13534 /* ??? As above. */
13535 gcc_assert (mode == CCmode || mode == CCCmode);
13536 suffix = fp ? "nb" : "ae";
13537 break;
13538 case LE:
13539 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13540 suffix = "le";
13541 break;
13542 case LEU:
13543 /* ??? As above. */
13544 if (mode == CCmode)
13545 suffix = "be";
13546 else if (mode == CCCmode)
13547 suffix = fp ? "nb" : "ae";
13548 else
13549 gcc_unreachable ();
13550 break;
13551 case UNORDERED:
13552 suffix = fp ? "u" : "p";
13553 break;
13554 case ORDERED:
13555 suffix = fp ? "nu" : "np";
13556 break;
13557 default:
13558 gcc_unreachable ();
13559 }
13560 fputs (suffix, file);
13561 }
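
/* Informal example: (GT, CCGCmode) prints "g", so a set pattern using
   "%C1" in its template would emit "setg" for that comparison, and
   "setle" if REVERSE were set.  */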
13562
13563 /* Print the name of register X to FILE based on its machine mode and number.
13564 If CODE is 'w', pretend the mode is HImode.
13565 If CODE is 'b', pretend the mode is QImode.
13566 If CODE is 'k', pretend the mode is SImode.
13567 If CODE is 'q', pretend the mode is DImode.
13568 If CODE is 'x', pretend the mode is V4SFmode.
13569 If CODE is 't', pretend the mode is V8SFmode.
13570 If CODE is 'h', pretend the reg is the 'high' byte register.
13571 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13572 If CODE is 'd', duplicate the operand for AVX instruction.
13573 */
13574
13575 void
13576 print_reg (rtx x, int code, FILE *file)
13577 {
13578 const char *reg;
13579 bool duplicated = code == 'd' && TARGET_AVX;
13580
13581 gcc_assert (x == pc_rtx
13582 || (REGNO (x) != ARG_POINTER_REGNUM
13583 && REGNO (x) != FRAME_POINTER_REGNUM
13584 && REGNO (x) != FLAGS_REG
13585 && REGNO (x) != FPSR_REG
13586 && REGNO (x) != FPCR_REG));
13587
13588 if (ASSEMBLER_DIALECT == ASM_ATT)
13589 putc ('%', file);
13590
13591 if (x == pc_rtx)
13592 {
13593 gcc_assert (TARGET_64BIT);
13594 fputs ("rip", file);
13595 return;
13596 }
13597
13598 if (code == 'w' || MMX_REG_P (x))
13599 code = 2;
13600 else if (code == 'b')
13601 code = 1;
13602 else if (code == 'k')
13603 code = 4;
13604 else if (code == 'q')
13605 code = 8;
13606 else if (code == 'y')
13607 code = 3;
13608 else if (code == 'h')
13609 code = 0;
13610 else if (code == 'x')
13611 code = 16;
13612 else if (code == 't')
13613 code = 32;
13614 else
13615 code = GET_MODE_SIZE (GET_MODE (x));
13616
13617 /* Irritatingly, the AMD extended registers use a different naming
13618 convention from the normal registers: "r%d[bwd]". */
13619 if (REX_INT_REG_P (x))
13620 {
13621 gcc_assert (TARGET_64BIT);
13622 putc ('r', file);
13623 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13624 switch (code)
13625 {
13626 case 0:
13627 error ("extended registers have no high halves");
13628 break;
13629 case 1:
13630 putc ('b', file);
13631 break;
13632 case 2:
13633 putc ('w', file);
13634 break;
13635 case 4:
13636 putc ('d', file);
13637 break;
13638 case 8:
13639 /* no suffix */
13640 break;
13641 default:
13642 error ("unsupported operand size for extended register");
13643 break;
13644 }
13645 return;
13646 }
13647
13648 reg = NULL;
13649 switch (code)
13650 {
13651 case 3:
13652 if (STACK_TOP_P (x))
13653 {
13654 reg = "st(0)";
13655 break;
13656 }
13657 /* FALLTHRU */
13658 case 8:
13659 case 4:
13660 case 12:
13661 if (! ANY_FP_REG_P (x))
13662 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13663 /* FALLTHRU */
13664 case 16:
13665 case 2:
13666 normal:
13667 reg = hi_reg_name[REGNO (x)];
13668 break;
13669 case 1:
13670 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13671 goto normal;
13672 reg = qi_reg_name[REGNO (x)];
13673 break;
13674 case 0:
13675 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13676 goto normal;
13677 reg = qi_high_reg_name[REGNO (x)];
13678 break;
13679 case 32:
13680 if (SSE_REG_P (x))
13681 {
13682 gcc_assert (!duplicated);
13683 putc ('y', file);
13684 fputs (hi_reg_name[REGNO (x)] + 1, file);
13685 return;
13686 }
13687 break;
13688 default:
13689 gcc_unreachable ();
13690 }
13691
13692 fputs (reg, file);
13693 if (duplicated)
13694 {
13695 if (ASSEMBLER_DIALECT == ASM_ATT)
13696 fprintf (file, ", %%%s", reg);
13697 else
13698 fprintf (file, ", %s", reg);
13699 }
13700 }
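
/* Informal examples of the size overrides above, for the AX register in
   64bit ATT output: code 'q' prints "rax", 'k' prints "eax", 'w' prints
   "ax", 'b' prints "al" and 'h' prints "ah".  */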
13701
13702 /* Locate some local-dynamic symbol still in use by this function
13703 so that we can print its name in some tls_local_dynamic_base
13704 pattern. */
13705
13706 static int
13707 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13708 {
13709 rtx x = *px;
13710
13711 if (GET_CODE (x) == SYMBOL_REF
13712 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13713 {
13714 cfun->machine->some_ld_name = XSTR (x, 0);
13715 return 1;
13716 }
13717
13718 return 0;
13719 }
13720
13721 static const char *
13722 get_some_local_dynamic_name (void)
13723 {
13724 rtx insn;
13725
13726 if (cfun->machine->some_ld_name)
13727 return cfun->machine->some_ld_name;
13728
13729 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13730 if (NONDEBUG_INSN_P (insn)
13731 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13732 return cfun->machine->some_ld_name;
13733
13734 return NULL;
13735 }
13736
13737 /* Meaning of CODE:
13738 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13739 C -- print opcode suffix for set/cmov insn.
13740 c -- like C, but print reversed condition
13741 F,f -- likewise, but for floating-point.
13742 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13743 otherwise nothing
13744 R -- print the prefix for register names.
13745 z -- print the opcode suffix for the size of the current operand.
13746 Z -- likewise, with special suffixes for x87 instructions.
13747 * -- print a star (in certain assembler syntax)
13748 A -- print an absolute memory reference.
13749 E -- print address with DImode register names if TARGET_64BIT.
13750 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13751 s -- print a shift double count, followed by the assembler's argument
13752 delimiter.
13753 b -- print the QImode name of the register for the indicated operand.
13754 %b0 would print %al if operands[0] is reg 0.
13755 w -- likewise, print the HImode name of the register.
13756 k -- likewise, print the SImode name of the register.
13757 q -- likewise, print the DImode name of the register.
13758 x -- likewise, print the V4SFmode name of the register.
13759 t -- likewise, print the V8SFmode name of the register.
13760 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13761 y -- print "st(0)" instead of "st" as a register.
13762 d -- print duplicated register operand for AVX instruction.
13763 D -- print condition for SSE cmp instruction.
13764 P -- if PIC, print an @PLT suffix.
13765 p -- print raw symbol name.
13766 X -- don't print any sort of PIC '@' suffix for a symbol.
13767 & -- print some in-use local-dynamic symbol name.
13768 H -- print a memory address offset by 8; used for sse high-parts
13769 Y -- print condition for XOP pcom* instruction.
13770 + -- print a branch hint as 'cs' or 'ds' prefix
13771 ; -- print a semicolon (after prefixes due to bug in older gas).
13772 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13773 @ -- print the segment register of a thread base pointer load
13774 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13775 */
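
/* A hypothetical template fragment (a sketch, not taken from i386.md)
   to illustrate a few of the codes above: "add%z0\t{%1, %0|%0, %1}"
   with SImode operands prints "addl" plus the operands in the selected
   assembler dialect, while "%k1" would force the SImode register name
   for operand 1.  */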
13776
13777 void
13778 ix86_print_operand (FILE *file, rtx x, int code)
13779 {
13780 if (code)
13781 {
13782 switch (code)
13783 {
13784 case '*':
13785 if (ASSEMBLER_DIALECT == ASM_ATT)
13786 putc ('*', file);
13787 return;
13788
13789 case '&':
13790 {
13791 const char *name = get_some_local_dynamic_name ();
13792 if (name == NULL)
13793 output_operand_lossage ("'%%&' used without any "
13794 "local dynamic TLS references");
13795 else
13796 assemble_name (file, name);
13797 return;
13798 }
13799
13800 case 'A':
13801 switch (ASSEMBLER_DIALECT)
13802 {
13803 case ASM_ATT:
13804 putc ('*', file);
13805 break;
13806
13807 case ASM_INTEL:
13808 /* Intel syntax. For absolute addresses, registers should not
13809 be surrounded by brackets. */
13810 if (!REG_P (x))
13811 {
13812 putc ('[', file);
13813 ix86_print_operand (file, x, 0);
13814 putc (']', file);
13815 return;
13816 }
13817 break;
13818
13819 default:
13820 gcc_unreachable ();
13821 }
13822
13823 ix86_print_operand (file, x, 0);
13824 return;
13825
13826 case 'E':
13827 /* Wrap address in an UNSPEC to declare special handling. */
13828 if (TARGET_64BIT)
13829 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13830
13831 output_address (x);
13832 return;
13833
13834 case 'L':
13835 if (ASSEMBLER_DIALECT == ASM_ATT)
13836 putc ('l', file);
13837 return;
13838
13839 case 'W':
13840 if (ASSEMBLER_DIALECT == ASM_ATT)
13841 putc ('w', file);
13842 return;
13843
13844 case 'B':
13845 if (ASSEMBLER_DIALECT == ASM_ATT)
13846 putc ('b', file);
13847 return;
13848
13849 case 'Q':
13850 if (ASSEMBLER_DIALECT == ASM_ATT)
13851 putc ('l', file);
13852 return;
13853
13854 case 'S':
13855 if (ASSEMBLER_DIALECT == ASM_ATT)
13856 putc ('s', file);
13857 return;
13858
13859 case 'T':
13860 if (ASSEMBLER_DIALECT == ASM_ATT)
13861 putc ('t', file);
13862 return;
13863
13864 case 'z':
13865 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13866 {
13867 /* Opcodes don't get size suffixes if using Intel syntax. */
13868 if (ASSEMBLER_DIALECT == ASM_INTEL)
13869 return;
13870
13871 switch (GET_MODE_SIZE (GET_MODE (x)))
13872 {
13873 case 1:
13874 putc ('b', file);
13875 return;
13876
13877 case 2:
13878 putc ('w', file);
13879 return;
13880
13881 case 4:
13882 putc ('l', file);
13883 return;
13884
13885 case 8:
13886 putc ('q', file);
13887 return;
13888
13889 default:
13890 output_operand_lossage
13891 ("invalid operand size for operand code '%c'", code);
13892 return;
13893 }
13894 }
13895
13896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13897 warning
13898 (0, "non-integer operand used with operand code '%c'", code);
13899 /* FALLTHRU */
13900
13901 case 'Z':
13902 /* 387 opcodes don't get size suffixes if using Intel syntax. */
13903 if (ASSEMBLER_DIALECT == ASM_INTEL)
13904 return;
13905
13906 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13907 {
13908 switch (GET_MODE_SIZE (GET_MODE (x)))
13909 {
13910 case 2:
13911 #ifdef HAVE_AS_IX86_FILDS
13912 putc ('s', file);
13913 #endif
13914 return;
13915
13916 case 4:
13917 putc ('l', file);
13918 return;
13919
13920 case 8:
13921 #ifdef HAVE_AS_IX86_FILDQ
13922 putc ('q', file);
13923 #else
13924 fputs ("ll", file);
13925 #endif
13926 return;
13927
13928 default:
13929 break;
13930 }
13931 }
13932 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13933 {
13934 /* 387 opcodes don't get size suffixes
13935 if the operands are registers. */
13936 if (STACK_REG_P (x))
13937 return;
13938
13939 switch (GET_MODE_SIZE (GET_MODE (x)))
13940 {
13941 case 4:
13942 putc ('s', file);
13943 return;
13944
13945 case 8:
13946 putc ('l', file);
13947 return;
13948
13949 case 12:
13950 case 16:
13951 putc ('t', file);
13952 return;
13953
13954 default:
13955 break;
13956 }
13957 }
13958 else
13959 {
13960 output_operand_lossage
13961 ("invalid operand type used with operand code '%c'", code);
13962 return;
13963 }
13964
13965 output_operand_lossage
13966 ("invalid operand size for operand code '%c'", code);
13967 return;
13968
13969 case 'd':
13970 case 'b':
13971 case 'w':
13972 case 'k':
13973 case 'q':
13974 case 'h':
13975 case 't':
13976 case 'y':
13977 case 'x':
13978 case 'X':
13979 case 'P':
13980 case 'p':
13981 break;
13982
13983 case 's':
13984 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13985 {
13986 ix86_print_operand (file, x, 0);
13987 fputs (", ", file);
13988 }
13989 return;
13990
13991 case 'D':
13992 /* A little bit of brain damage here. The SSE compare instructions
13993 use completely different names for the comparisons than the
13994 fp conditional moves do. */
13995 if (TARGET_AVX)
13996 {
13997 switch (GET_CODE (x))
13998 {
13999 case EQ:
14000 fputs ("eq", file);
14001 break;
14002 case UNEQ:
14003 fputs ("eq_us", file);
14004 break;
14005 case LT:
14006 fputs ("lt", file);
14007 break;
14008 case UNLT:
14009 fputs ("nge", file);
14010 break;
14011 case LE:
14012 fputs ("le", file);
14013 break;
14014 case UNLE:
14015 fputs ("ngt", file);
14016 break;
14017 case UNORDERED:
14018 fputs ("unord", file);
14019 break;
14020 case NE:
14021 fputs ("neq", file);
14022 break;
14023 case LTGT:
14024 fputs ("neq_oq", file);
14025 break;
14026 case GE:
14027 fputs ("ge", file);
14028 break;
14029 case UNGE:
14030 fputs ("nlt", file);
14031 break;
14032 case GT:
14033 fputs ("gt", file);
14034 break;
14035 case UNGT:
14036 fputs ("nle", file);
14037 break;
14038 case ORDERED:
14039 fputs ("ord", file);
14040 break;
14041 default:
14042 output_operand_lossage ("operand is not a condition code, "
14043 "invalid operand code 'D'");
14044 return;
14045 }
14046 }
14047 else
14048 {
14049 switch (GET_CODE (x))
14050 {
14051 case EQ:
14052 case UNEQ:
14053 fputs ("eq", file);
14054 break;
14055 case LT:
14056 case UNLT:
14057 fputs ("lt", file);
14058 break;
14059 case LE:
14060 case UNLE:
14061 fputs ("le", file);
14062 break;
14063 case UNORDERED:
14064 fputs ("unord", file);
14065 break;
14066 case NE:
14067 case LTGT:
14068 fputs ("neq", file);
14069 break;
14070 case UNGE:
14071 case GE:
14072 fputs ("nlt", file);
14073 break;
14074 case UNGT:
14075 case GT:
14076 fputs ("nle", file);
14077 break;
14078 case ORDERED:
14079 fputs ("ord", file);
14080 break;
14081 default:
14082 output_operand_lossage ("operand is not a condition code, "
14083 "invalid operand code 'D'");
14084 return;
14085 }
14086 }
14087 return;
14088 case 'O':
14089 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14090 if (ASSEMBLER_DIALECT == ASM_ATT)
14091 {
14092 switch (GET_MODE (x))
14093 {
14094 case HImode: putc ('w', file); break;
14095 case SImode:
14096 case SFmode: putc ('l', file); break;
14097 case DImode:
14098 case DFmode: putc ('q', file); break;
14099 default: gcc_unreachable ();
14100 }
14101 putc ('.', file);
14102 }
14103 #endif
14104 return;
14105 case 'C':
14106 if (!COMPARISON_P (x))
14107 {
14108 output_operand_lossage ("operand is neither a constant nor a "
14109 "condition code, invalid operand code "
14110 "'C'");
14111 return;
14112 }
14113 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14114 return;
14115 case 'F':
14116 if (!COMPARISON_P (x))
14117 {
14118 output_operand_lossage ("operand is neither a constant nor a "
14119 "condition code, invalid operand code "
14120 "'F'");
14121 return;
14122 }
14123 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14124 if (ASSEMBLER_DIALECT == ASM_ATT)
14125 putc ('.', file);
14126 #endif
14127 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14128 return;
14129
14130 /* Like above, but reverse condition */
14131 case 'c':
14132 /* Check to see if argument to %c is really a constant
14133 and not a condition code which needs to be reversed. */
14134 if (!COMPARISON_P (x))
14135 {
14136 output_operand_lossage ("operand is neither a constant nor a "
14137 "condition code, invalid operand "
14138 "code 'c'");
14139 return;
14140 }
14141 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14142 return;
14143 case 'f':
14144 if (!COMPARISON_P (x))
14145 {
14146 output_operand_lossage ("operand is neither a constant nor a "
14147 "condition code, invalid operand "
14148 "code 'f'");
14149 return;
14150 }
14151 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14152 if (ASSEMBLER_DIALECT == ASM_ATT)
14153 putc ('.', file);
14154 #endif
14155 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14156 return;
14157
14158 case 'H':
14159 if (!offsettable_memref_p (x))
14160 {
14161 output_operand_lossage ("operand is not an offsettable memory "
14162 "reference, invalid operand "
14163 "code 'H'");
14164 return;
14165 }
14166 /* It doesn't actually matter what mode we use here, as we're
14167 only going to use this for printing. */
14168 x = adjust_address_nv (x, DImode, 8);
14169 break;
14170
14171 case '+':
14172 {
14173 rtx x;
14174
14175 if (!optimize
14176 || optimize_function_for_size_p (cfun)
14177 || !TARGET_BRANCH_PREDICTION_HINTS)
14178 return;
14179
14180 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14181 if (x)
14182 {
14183 int pred_val = INTVAL (XEXP (x, 0));
14184
14185 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14186 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14187 {
14188 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14189 bool cputaken
14190 = final_forward_branch_p (current_output_insn) == 0;
14191
14192 /* Emit hints only where the default branch prediction
14193 heuristics would fail. */
14194 if (taken != cputaken)
14195 {
14196 /* We use 3e (DS) prefix for taken branches and
14197 2e (CS) prefix for not taken branches. */
14198 if (taken)
14199 fputs ("ds ; ", file);
14200 else
14201 fputs ("cs ; ", file);
14202 }
14203 }
14204 }
14205 return;
14206 }
14207
14208 case 'Y':
14209 switch (GET_CODE (x))
14210 {
14211 case NE:
14212 fputs ("neq", file);
14213 break;
14214 case EQ:
14215 fputs ("eq", file);
14216 break;
14217 case GE:
14218 case GEU:
14219 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14220 break;
14221 case GT:
14222 case GTU:
14223 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14224 break;
14225 case LE:
14226 case LEU:
14227 fputs ("le", file);
14228 break;
14229 case LT:
14230 case LTU:
14231 fputs ("lt", file);
14232 break;
14233 case UNORDERED:
14234 fputs ("unord", file);
14235 break;
14236 case ORDERED:
14237 fputs ("ord", file);
14238 break;
14239 case UNEQ:
14240 fputs ("ueq", file);
14241 break;
14242 case UNGE:
14243 fputs ("nlt", file);
14244 break;
14245 case UNGT:
14246 fputs ("nle", file);
14247 break;
14248 case UNLE:
14249 fputs ("ule", file);
14250 break;
14251 case UNLT:
14252 fputs ("ult", file);
14253 break;
14254 case LTGT:
14255 fputs ("une", file);
14256 break;
14257 default:
14258 output_operand_lossage ("operand is not a condition code, "
14259 "invalid operand code 'Y'");
14260 return;
14261 }
14262 return;
14263
14264 case ';':
14265 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14266 putc (';', file);
14267 #endif
14268 return;
14269
14270 case '@':
14271 if (ASSEMBLER_DIALECT == ASM_ATT)
14272 putc ('%', file);
14273
14274 /* The kernel uses a different segment register for performance
14275 reasons; a system call would not have to trash the userspace
14276 segment register, which would be expensive. */
14277 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14278 fputs ("fs", file);
14279 else
14280 fputs ("gs", file);
14281 return;
14282
14283 case '~':
14284 putc (TARGET_AVX2 ? 'i' : 'f', file);
14285 return;
14286
14287 case '^':
14288 if (TARGET_64BIT && Pmode != word_mode)
14289 fputs ("addr32 ", file);
14290 return;
14291
14292 default:
14293 output_operand_lossage ("invalid operand code '%c'", code);
14294 }
14295 }
14296
14297 if (REG_P (x))
14298 print_reg (x, code, file);
14299
14300 else if (MEM_P (x))
14301 {
14302 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14303 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14304 && GET_MODE (x) != BLKmode)
14305 {
14306 const char * size;
14307 switch (GET_MODE_SIZE (GET_MODE (x)))
14308 {
14309 case 1: size = "BYTE"; break;
14310 case 2: size = "WORD"; break;
14311 case 4: size = "DWORD"; break;
14312 case 8: size = "QWORD"; break;
14313 case 12: size = "TBYTE"; break;
14314 case 16:
14315 if (GET_MODE (x) == XFmode)
14316 size = "TBYTE";
14317 else
14318 size = "XMMWORD";
14319 break;
14320 case 32: size = "YMMWORD"; break;
14321 default:
14322 gcc_unreachable ();
14323 }
14324
14325 /* Check for explicit size override (codes 'b', 'w', 'k',
14326 'q' and 'x') */
14327 if (code == 'b')
14328 size = "BYTE";
14329 else if (code == 'w')
14330 size = "WORD";
14331 else if (code == 'k')
14332 size = "DWORD";
14333 else if (code == 'q')
14334 size = "QWORD";
14335 else if (code == 'x')
14336 size = "XMMWORD";
14337
14338 fputs (size, file);
14339 fputs (" PTR ", file);
14340 }
14341
14342 x = XEXP (x, 0);
14343 /* Avoid (%rip) for call operands. */
14344 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14345 && !CONST_INT_P (x))
14346 output_addr_const (file, x);
14347 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14348 output_operand_lossage ("invalid constraints for operand");
14349 else
14350 output_address (x);
14351 }
14352
14353 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14354 {
14355 REAL_VALUE_TYPE r;
14356 long l;
14357
14358 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14359 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14360
14361 if (ASSEMBLER_DIALECT == ASM_ATT)
14362 putc ('$', file);
14363 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14364 if (code == 'q')
14365 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14366 else
14367 fprintf (file, "0x%08x", (unsigned int) l);
14368 }
14369
14370 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14371 {
14372 REAL_VALUE_TYPE r;
14373 long l[2];
14374
14375 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14376 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14377
14378 if (ASSEMBLER_DIALECT == ASM_ATT)
14379 putc ('$', file);
14380 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14381 }
14382
14383 /* These float cases don't actually occur as immediate operands. */
14384 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14385 {
14386 char dstr[30];
14387
14388 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14389 fputs (dstr, file);
14390 }
14391
14392 else
14393 {
14394 /* We have patterns that allow zero sets of memory, for instance.
14395 In 64-bit mode, we should probably support all 8-byte vectors,
14396 since we can in fact encode that into an immediate. */
14397 if (GET_CODE (x) == CONST_VECTOR)
14398 {
14399 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14400 x = const0_rtx;
14401 }
14402
14403 if (code != 'P' && code != 'p')
14404 {
14405 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14406 {
14407 if (ASSEMBLER_DIALECT == ASM_ATT)
14408 putc ('$', file);
14409 }
14410 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14411 || GET_CODE (x) == LABEL_REF)
14412 {
14413 if (ASSEMBLER_DIALECT == ASM_ATT)
14414 putc ('$', file);
14415 else
14416 fputs ("OFFSET FLAT:", file);
14417 }
14418 }
14419 if (CONST_INT_P (x))
14420 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14421 else if (flag_pic || MACHOPIC_INDIRECT)
14422 output_pic_addr_const (file, x, code);
14423 else
14424 output_addr_const (file, x);
14425 }
14426 }
14427
14428 static bool
14429 ix86_print_operand_punct_valid_p (unsigned char code)
14430 {
14431 return (code == '@' || code == '*' || code == '+' || code == '&'
14432 || code == ';' || code == '~' || code == '^');
14433 }
14434 \f
14435 /* Print a memory operand whose address is ADDR. */
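/* For instance (illustrative): the address
   (plus:SI (reg:SI %eax) (const_int 8)) is printed as "8(%eax)" in
   AT&T syntax and as "[eax+8]" in Intel syntax. */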
14436
14437 static void
14438 ix86_print_operand_address (FILE *file, rtx addr)
14439 {
14440 struct ix86_address parts;
14441 rtx base, index, disp;
14442 int scale;
14443 int ok;
14444 bool vsib = false;
14445 int code = 0;
14446
14447 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14448 {
14449 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14450 gcc_assert (parts.index == NULL_RTX);
14451 parts.index = XVECEXP (addr, 0, 1);
14452 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14453 addr = XVECEXP (addr, 0, 0);
14454 vsib = true;
14455 }
14456 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14457 {
14458 gcc_assert (TARGET_64BIT);
14459 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14460 code = 'q';
14461 }
14462 else
14463 ok = ix86_decompose_address (addr, &parts);
14464
14465 gcc_assert (ok);
14466
14467 if (parts.base && GET_CODE (parts.base) == SUBREG)
14468 {
14469 rtx tmp = SUBREG_REG (parts.base);
14470 parts.base = simplify_subreg (GET_MODE (parts.base),
14471 tmp, GET_MODE (tmp), 0);
14472 }
14473
14474 if (parts.index && GET_CODE (parts.index) == SUBREG)
14475 {
14476 rtx tmp = SUBREG_REG (parts.index);
14477 parts.index = simplify_subreg (GET_MODE (parts.index),
14478 tmp, GET_MODE (tmp), 0);
14479 }
14480
14481 base = parts.base;
14482 index = parts.index;
14483 disp = parts.disp;
14484 scale = parts.scale;
14485
14486 switch (parts.seg)
14487 {
14488 case SEG_DEFAULT:
14489 break;
14490 case SEG_FS:
14491 case SEG_GS:
14492 if (ASSEMBLER_DIALECT == ASM_ATT)
14493 putc ('%', file);
14494 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14495 break;
14496 default:
14497 gcc_unreachable ();
14498 }
14499
14500 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14501 if (TARGET_64BIT && !base && !index)
14502 {
14503 rtx symbol = disp;
14504
14505 if (GET_CODE (disp) == CONST
14506 && GET_CODE (XEXP (disp, 0)) == PLUS
14507 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14508 symbol = XEXP (XEXP (disp, 0), 0);
14509
14510 if (GET_CODE (symbol) == LABEL_REF
14511 || (GET_CODE (symbol) == SYMBOL_REF
14512 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14513 base = pc_rtx;
14514 }
14515 if (!base && !index)
14516 {
14517 /* Displacement only requires special attention. */
14518
14519 if (CONST_INT_P (disp))
14520 {
14521 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14522 fputs ("ds:", file);
14523 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14524 }
14525 else if (flag_pic)
14526 output_pic_addr_const (file, disp, 0);
14527 else
14528 output_addr_const (file, disp);
14529 }
14530 else
14531 {
14532 /* Print SImode register names for zero-extended
14533 addresses to force addr32 prefix. */
14534 if (TARGET_64BIT
14535 && (GET_CODE (addr) == ZERO_EXTEND
14536 || GET_CODE (addr) == AND))
14537 {
14538 gcc_assert (!code);
14539 code = 'l';
14540 }
14541
14542 if (ASSEMBLER_DIALECT == ASM_ATT)
14543 {
14544 if (disp)
14545 {
14546 if (flag_pic)
14547 output_pic_addr_const (file, disp, 0);
14548 else if (GET_CODE (disp) == LABEL_REF)
14549 output_asm_label (disp);
14550 else
14551 output_addr_const (file, disp);
14552 }
14553
14554 putc ('(', file);
14555 if (base)
14556 print_reg (base, code, file);
14557 if (index)
14558 {
14559 putc (',', file);
14560 print_reg (index, vsib ? 0 : code, file);
14561 if (scale != 1 || vsib)
14562 fprintf (file, ",%d", scale);
14563 }
14564 putc (')', file);
14565 }
14566 else
14567 {
14568 rtx offset = NULL_RTX;
14569
14570 if (disp)
14571 {
14572 /* Pull out the offset of a symbol; print any symbol itself. */
14573 if (GET_CODE (disp) == CONST
14574 && GET_CODE (XEXP (disp, 0)) == PLUS
14575 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14576 {
14577 offset = XEXP (XEXP (disp, 0), 1);
14578 disp = gen_rtx_CONST (VOIDmode,
14579 XEXP (XEXP (disp, 0), 0));
14580 }
14581
14582 if (flag_pic)
14583 output_pic_addr_const (file, disp, 0);
14584 else if (GET_CODE (disp) == LABEL_REF)
14585 output_asm_label (disp);
14586 else if (CONST_INT_P (disp))
14587 offset = disp;
14588 else
14589 output_addr_const (file, disp);
14590 }
14591
14592 putc ('[', file);
14593 if (base)
14594 {
14595 print_reg (base, code, file);
14596 if (offset)
14597 {
14598 if (INTVAL (offset) >= 0)
14599 putc ('+', file);
14600 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14601 }
14602 }
14603 else if (offset)
14604 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14605 else
14606 putc ('0', file);
14607
14608 if (index)
14609 {
14610 putc ('+', file);
14611 print_reg (index, vsib ? 0 : code, file);
14612 if (scale != 1 || vsib)
14613 fprintf (file, "*%d", scale);
14614 }
14615 putc (']', file);
14616 }
14617 }
14618 }
14619
14620 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14621
14622 static bool
14623 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14624 {
14625 rtx op;
14626
14627 if (GET_CODE (x) != UNSPEC)
14628 return false;
14629
14630 op = XVECEXP (x, 0, 0);
14631 switch (XINT (x, 1))
14632 {
14633 case UNSPEC_GOTTPOFF:
14634 output_addr_const (file, op);
14635 /* FIXME: This might be @TPOFF in Sun ld. */
14636 fputs ("@gottpoff", file);
14637 break;
14638 case UNSPEC_TPOFF:
14639 output_addr_const (file, op);
14640 fputs ("@tpoff", file);
14641 break;
14642 case UNSPEC_NTPOFF:
14643 output_addr_const (file, op);
14644 if (TARGET_64BIT)
14645 fputs ("@tpoff", file);
14646 else
14647 fputs ("@ntpoff", file);
14648 break;
14649 case UNSPEC_DTPOFF:
14650 output_addr_const (file, op);
14651 fputs ("@dtpoff", file);
14652 break;
14653 case UNSPEC_GOTNTPOFF:
14654 output_addr_const (file, op);
14655 if (TARGET_64BIT)
14656 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14657 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14658 else
14659 fputs ("@gotntpoff", file);
14660 break;
14661 case UNSPEC_INDNTPOFF:
14662 output_addr_const (file, op);
14663 fputs ("@indntpoff", file);
14664 break;
14665 #if TARGET_MACHO
14666 case UNSPEC_MACHOPIC_OFFSET:
14667 output_addr_const (file, op);
14668 putc ('-', file);
14669 machopic_output_function_base_name (file);
14670 break;
14671 #endif
14672
14673 case UNSPEC_STACK_CHECK:
14674 {
14675 int offset;
14676
14677 gcc_assert (flag_split_stack);
14678
14679 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14680 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14681 #else
14682 gcc_unreachable ();
14683 #endif
14684
14685 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14686 }
14687 break;
14688
14689 default:
14690 return false;
14691 }
14692
14693 return true;
14694 }
14695 \f
14696 /* Split one or more double-mode RTL references into pairs of half-mode
14697 references. The RTL can be REG, offsettable MEM, integer constant, or
14698 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14699 split and "num" is its length. lo_half and hi_half are output arrays
14700 that parallel "operands". */
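/* A small illustration: with mode == DImode, half_mode is SImode and
   byte is 4, so splitting the memory operand (mem:DI (reg %eax))
   yields lo_half = (mem:SI (reg %eax)) and
   hi_half = (mem:SI (plus (reg %eax) (const_int 4))). */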
14701
14702 void
14703 split_double_mode (enum machine_mode mode, rtx operands[],
14704 int num, rtx lo_half[], rtx hi_half[])
14705 {
14706 enum machine_mode half_mode;
14707 unsigned int byte;
14708
14709 switch (mode)
14710 {
14711 case TImode:
14712 half_mode = DImode;
14713 break;
14714 case DImode:
14715 half_mode = SImode;
14716 break;
14717 default:
14718 gcc_unreachable ();
14719 }
14720
14721 byte = GET_MODE_SIZE (half_mode);
14722
14723 while (num--)
14724 {
14725 rtx op = operands[num];
14726
14727 /* simplify_subreg refuses to split volatile memory addresses,
14728 but we still have to handle them. */
14729 if (MEM_P (op))
14730 {
14731 lo_half[num] = adjust_address (op, half_mode, 0);
14732 hi_half[num] = adjust_address (op, half_mode, byte);
14733 }
14734 else
14735 {
14736 lo_half[num] = simplify_gen_subreg (half_mode, op,
14737 GET_MODE (op) == VOIDmode
14738 ? mode : GET_MODE (op), 0);
14739 hi_half[num] = simplify_gen_subreg (half_mode, op,
14740 GET_MODE (op) == VOIDmode
14741 ? mode : GET_MODE (op), byte);
14742 }
14743 }
14744 }
14745 \f
14746 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14747 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14748 is the expression of the binary operation. The output may either be
14749 emitted here, or returned to the caller, like all output_* functions.
14750
14751 There is no guarantee that the operands are the same mode, as they
14752 might be within FLOAT or FLOAT_EXTEND expressions. */
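/* Example of the returned templates (illustrative): for an SFmode SSE
   add the function returns "vaddss\t{%2, %1, %0|%0, %1, %2}" when AVX
   is enabled and "addss\t{%2, %0|%0, %2}" otherwise; the x87 cases
   build "fadd"/"fiadd" variants instead. */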
14753
14754 #ifndef SYSV386_COMPAT
14755 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14756 wants to fix the assemblers because that causes incompatibility
14757 with gcc. No-one wants to fix gcc because that causes
14758 incompatibility with assemblers... You can use the option of
14759 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14760 #define SYSV386_COMPAT 1
14761 #endif
14762
14763 const char *
14764 output_387_binary_op (rtx insn, rtx *operands)
14765 {
14766 static char buf[40];
14767 const char *p;
14768 const char *ssep;
14769 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14770
14771 #ifdef ENABLE_CHECKING
14772 /* Even if we do not want to check the inputs, this documents input
14773 constraints. Which helps in understanding the following code. */
14774 if (STACK_REG_P (operands[0])
14775 && ((REG_P (operands[1])
14776 && REGNO (operands[0]) == REGNO (operands[1])
14777 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14778 || (REG_P (operands[2])
14779 && REGNO (operands[0]) == REGNO (operands[2])
14780 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14781 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14782 ; /* ok */
14783 else
14784 gcc_assert (is_sse);
14785 #endif
14786
14787 switch (GET_CODE (operands[3]))
14788 {
14789 case PLUS:
14790 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14791 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14792 p = "fiadd";
14793 else
14794 p = "fadd";
14795 ssep = "vadd";
14796 break;
14797
14798 case MINUS:
14799 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14800 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14801 p = "fisub";
14802 else
14803 p = "fsub";
14804 ssep = "vsub";
14805 break;
14806
14807 case MULT:
14808 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14809 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14810 p = "fimul";
14811 else
14812 p = "fmul";
14813 ssep = "vmul";
14814 break;
14815
14816 case DIV:
14817 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14818 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14819 p = "fidiv";
14820 else
14821 p = "fdiv";
14822 ssep = "vdiv";
14823 break;
14824
14825 default:
14826 gcc_unreachable ();
14827 }
14828
14829 if (is_sse)
14830 {
14831 if (TARGET_AVX)
14832 {
14833 strcpy (buf, ssep);
14834 if (GET_MODE (operands[0]) == SFmode)
14835 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14836 else
14837 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14838 }
14839 else
14840 {
14841 strcpy (buf, ssep + 1);
14842 if (GET_MODE (operands[0]) == SFmode)
14843 strcat (buf, "ss\t{%2, %0|%0, %2}");
14844 else
14845 strcat (buf, "sd\t{%2, %0|%0, %2}");
14846 }
14847 return buf;
14848 }
14849 strcpy (buf, p);
14850
14851 switch (GET_CODE (operands[3]))
14852 {
14853 case MULT:
14854 case PLUS:
14855 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14856 {
14857 rtx temp = operands[2];
14858 operands[2] = operands[1];
14859 operands[1] = temp;
14860 }
14861
14862 /* We know operands[0] == operands[1]. */
14863
14864 if (MEM_P (operands[2]))
14865 {
14866 p = "%Z2\t%2";
14867 break;
14868 }
14869
14870 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14871 {
14872 if (STACK_TOP_P (operands[0]))
14873 /* How is it that we are storing to a dead operand[2]?
14874 Well, presumably operands[1] is dead too. We can't
14875 store the result to st(0) as st(0) gets popped on this
14876 instruction. Instead store to operands[2] (which I
14877 think has to be st(1)). st(1) will be popped later.
14878 gcc <= 2.8.1 didn't have this check and generated
14879 assembly code that the Unixware assembler rejected. */
14880 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14881 else
14882 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14883 break;
14884 }
14885
14886 if (STACK_TOP_P (operands[0]))
14887 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14888 else
14889 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14890 break;
14891
14892 case MINUS:
14893 case DIV:
14894 if (MEM_P (operands[1]))
14895 {
14896 p = "r%Z1\t%1";
14897 break;
14898 }
14899
14900 if (MEM_P (operands[2]))
14901 {
14902 p = "%Z2\t%2";
14903 break;
14904 }
14905
14906 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14907 {
14908 #if SYSV386_COMPAT
14909 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14910 derived assemblers, confusingly reverse the direction of
14911 the operation for fsub{r} and fdiv{r} when the
14912 destination register is not st(0). The Intel assembler
14913 doesn't have this brain damage. Read !SYSV386_COMPAT to
14914 figure out what the hardware really does. */
14915 if (STACK_TOP_P (operands[0]))
14916 p = "{p\t%0, %2|rp\t%2, %0}";
14917 else
14918 p = "{rp\t%2, %0|p\t%0, %2}";
14919 #else
14920 if (STACK_TOP_P (operands[0]))
14921 /* As above for fmul/fadd, we can't store to st(0). */
14922 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14923 else
14924 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14925 #endif
14926 break;
14927 }
14928
14929 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14930 {
14931 #if SYSV386_COMPAT
14932 if (STACK_TOP_P (operands[0]))
14933 p = "{rp\t%0, %1|p\t%1, %0}";
14934 else
14935 p = "{p\t%1, %0|rp\t%0, %1}";
14936 #else
14937 if (STACK_TOP_P (operands[0]))
14938 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14939 else
14940 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14941 #endif
14942 break;
14943 }
14944
14945 if (STACK_TOP_P (operands[0]))
14946 {
14947 if (STACK_TOP_P (operands[1]))
14948 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14949 else
14950 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14951 break;
14952 }
14953 else if (STACK_TOP_P (operands[1]))
14954 {
14955 #if SYSV386_COMPAT
14956 p = "{\t%1, %0|r\t%0, %1}";
14957 #else
14958 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14959 #endif
14960 }
14961 else
14962 {
14963 #if SYSV386_COMPAT
14964 p = "{r\t%2, %0|\t%0, %2}";
14965 #else
14966 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14967 #endif
14968 }
14969 break;
14970
14971 default:
14972 gcc_unreachable ();
14973 }
14974
14975 strcat (buf, p);
14976 return buf;
14977 }
14978
14979 /* Return needed mode for entity in optimize_mode_switching pass. */
14980
14981 int
14982 ix86_mode_needed (int entity, rtx insn)
14983 {
14984 enum attr_i387_cw mode;
14985
14986 /* The mode UNINITIALIZED is used to store the control word after a
14987 function call or ASM pattern. The mode ANY specifies that the function
14988 has no requirements on the control word and makes no changes to the
14989 bits we are interested in. */
14990
14991 if (CALL_P (insn)
14992 || (NONJUMP_INSN_P (insn)
14993 && (asm_noperands (PATTERN (insn)) >= 0
14994 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14995 return I387_CW_UNINITIALIZED;
14996
14997 if (recog_memoized (insn) < 0)
14998 return I387_CW_ANY;
14999
15000 mode = get_attr_i387_cw (insn);
15001
15002 switch (entity)
15003 {
15004 case I387_TRUNC:
15005 if (mode == I387_CW_TRUNC)
15006 return mode;
15007 break;
15008
15009 case I387_FLOOR:
15010 if (mode == I387_CW_FLOOR)
15011 return mode;
15012 break;
15013
15014 case I387_CEIL:
15015 if (mode == I387_CW_CEIL)
15016 return mode;
15017 break;
15018
15019 case I387_MASK_PM:
15020 if (mode == I387_CW_MASK_PM)
15021 return mode;
15022 break;
15023
15024 default:
15025 gcc_unreachable ();
15026 }
15027
15028 return I387_CW_ANY;
15029 }
15030
15031 /* Output code to initialize control word copies used by trunc?f?i and
15032 rounding patterns. CURRENT_MODE is set to the current control word,
15033 while NEW_MODE is set to the new control word. */
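/* For instance (illustrative of the iorhi3 path below): I387_CW_TRUNC
   ORs 0x0c00 into the saved control word, setting the rounding-control
   field (bits 10-11) to round-toward-zero. */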
15034
15035 void
15036 emit_i387_cw_initialization (int mode)
15037 {
15038 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15039 rtx new_mode;
15040
15041 enum ix86_stack_slot slot;
15042
15043 rtx reg = gen_reg_rtx (HImode);
15044
15045 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15046 emit_move_insn (reg, copy_rtx (stored_mode));
15047
15048 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15049 || optimize_function_for_size_p (cfun))
15050 {
15051 switch (mode)
15052 {
15053 case I387_CW_TRUNC:
15054 /* round toward zero (truncate) */
15055 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15056 slot = SLOT_CW_TRUNC;
15057 break;
15058
15059 case I387_CW_FLOOR:
15060 /* round down toward -oo */
15061 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15062 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15063 slot = SLOT_CW_FLOOR;
15064 break;
15065
15066 case I387_CW_CEIL:
15067 /* round up toward +oo */
15068 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15069 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15070 slot = SLOT_CW_CEIL;
15071 break;
15072
15073 case I387_CW_MASK_PM:
15074 /* mask precision exception for nearbyint() */
15075 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15076 slot = SLOT_CW_MASK_PM;
15077 break;
15078
15079 default:
15080 gcc_unreachable ();
15081 }
15082 }
15083 else
15084 {
15085 switch (mode)
15086 {
15087 case I387_CW_TRUNC:
15088 /* round toward zero (truncate) */
15089 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15090 slot = SLOT_CW_TRUNC;
15091 break;
15092
15093 case I387_CW_FLOOR:
15094 /* round down toward -oo */
15095 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15096 slot = SLOT_CW_FLOOR;
15097 break;
15098
15099 case I387_CW_CEIL:
15100 /* round up toward +oo */
15101 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15102 slot = SLOT_CW_CEIL;
15103 break;
15104
15105 case I387_CW_MASK_PM:
15106 /* mask precision exception for nearbyint() */
15107 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15108 slot = SLOT_CW_MASK_PM;
15109 break;
15110
15111 default:
15112 gcc_unreachable ();
15113 }
15114 }
15115
15116 gcc_assert (slot < MAX_386_STACK_LOCALS);
15117
15118 new_mode = assign_386_stack_local (HImode, slot);
15119 emit_move_insn (new_mode, reg);
15120 }
15121
15122 /* Output code for INSN to convert a float to a signed int. OPERANDS
15123 are the insn operands. The output may be [HSD]Imode and the input
15124 operand may be [SDX]Fmode. */
15125
15126 const char *
15127 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15128 {
15129 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15130 int dimode_p = GET_MODE (operands[0]) == DImode;
15131 int round_mode = get_attr_i387_cw (insn);
15132
15133 /* Jump through a hoop or two for DImode, since the hardware has no
15134 non-popping instruction. We used to do this a different way, but
15135 that was somewhat fragile and broke with post-reload splitters. */
15136 if ((dimode_p || fisttp) && !stack_top_dies)
15137 output_asm_insn ("fld\t%y1", operands);
15138
15139 gcc_assert (STACK_TOP_P (operands[1]));
15140 gcc_assert (MEM_P (operands[0]));
15141 gcc_assert (GET_MODE (operands[1]) != TFmode);
15142
15143 if (fisttp)
15144 output_asm_insn ("fisttp%Z0\t%0", operands);
15145 else
15146 {
15147 if (round_mode != I387_CW_ANY)
15148 output_asm_insn ("fldcw\t%3", operands);
15149 if (stack_top_dies || dimode_p)
15150 output_asm_insn ("fistp%Z0\t%0", operands);
15151 else
15152 output_asm_insn ("fist%Z0\t%0", operands);
15153 if (round_mode != I387_CW_ANY)
15154 output_asm_insn ("fldcw\t%2", operands);
15155 }
15156
15157 return "";
15158 }
15159
15160 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15161 have the values zero or one, indicates the ffreep insn's operand
15162 from the OPERANDS array. */
15163
15164 static const char *
15165 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15166 {
15167 if (TARGET_USE_FFREEP)
15168 #ifdef HAVE_AS_IX86_FFREEP
15169 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15170 #else
15171 {
15172 static char retval[32];
15173 int regno = REGNO (operands[opno]);
15174
15175 gcc_assert (FP_REGNO_P (regno));
15176
15177 regno -= FIRST_STACK_REG;
15178
15179 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15180 return retval;
15181 }
15182 #endif
15183
15184 return opno ? "fstp\t%y1" : "fstp\t%y0";
15185 }
15186
15187
15188 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15189 should be used. UNORDERED_P is true when fucom should be used. */
15190
15191 const char *
15192 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15193 {
15194 int stack_top_dies;
15195 rtx cmp_op0, cmp_op1;
15196 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15197
15198 if (eflags_p)
15199 {
15200 cmp_op0 = operands[0];
15201 cmp_op1 = operands[1];
15202 }
15203 else
15204 {
15205 cmp_op0 = operands[1];
15206 cmp_op1 = operands[2];
15207 }
15208
15209 if (is_sse)
15210 {
15211 if (GET_MODE (operands[0]) == SFmode)
15212 if (unordered_p)
15213 return "%vucomiss\t{%1, %0|%0, %1}";
15214 else
15215 return "%vcomiss\t{%1, %0|%0, %1}";
15216 else
15217 if (unordered_p)
15218 return "%vucomisd\t{%1, %0|%0, %1}";
15219 else
15220 return "%vcomisd\t{%1, %0|%0, %1}";
15221 }
15222
15223 gcc_assert (STACK_TOP_P (cmp_op0));
15224
15225 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15226
15227 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15228 {
15229 if (stack_top_dies)
15230 {
15231 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15232 return output_387_ffreep (operands, 1);
15233 }
15234 else
15235 return "ftst\n\tfnstsw\t%0";
15236 }
15237
15238 if (STACK_REG_P (cmp_op1)
15239 && stack_top_dies
15240 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15241 && REGNO (cmp_op1) != FIRST_STACK_REG)
15242 {
15243 /* If the top of the 387 stack dies, and the other operand is
15244 also a stack register that dies, then this must be a
15245 `fcompp' float compare. */
15246
15247 if (eflags_p)
15248 {
15249 /* There is no double popping fcomi variant. Fortunately,
15250 eflags is immune from the fstp's cc clobbering. */
15251 if (unordered_p)
15252 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15253 else
15254 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15255 return output_387_ffreep (operands, 0);
15256 }
15257 else
15258 {
15259 if (unordered_p)
15260 return "fucompp\n\tfnstsw\t%0";
15261 else
15262 return "fcompp\n\tfnstsw\t%0";
15263 }
15264 }
15265 else
15266 {
15267 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15268
15269 static const char * const alt[16] =
15270 {
15271 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15272 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15273 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15274 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15275
15276 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15277 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15278 NULL,
15279 NULL,
15280
15281 "fcomi\t{%y1, %0|%0, %y1}",
15282 "fcomip\t{%y1, %0|%0, %y1}",
15283 "fucomi\t{%y1, %0|%0, %y1}",
15284 "fucomip\t{%y1, %0|%0, %y1}",
15285
15286 NULL,
15287 NULL,
15288 NULL,
15289 NULL
15290 };
15291
15292 int mask;
15293 const char *ret;
15294
15295 mask = eflags_p << 3;
15296 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15297 mask |= unordered_p << 1;
15298 mask |= stack_top_dies;
15299
15300 gcc_assert (mask < 16);
15301 ret = alt[mask];
15302 gcc_assert (ret);
15303
15304 return ret;
15305 }
15306 }
15307
15308 void
15309 ix86_output_addr_vec_elt (FILE *file, int value)
15310 {
15311 const char *directive = ASM_LONG;
15312
15313 #ifdef ASM_QUAD
15314 if (TARGET_LP64)
15315 directive = ASM_QUAD;
15316 #else
15317 gcc_assert (!TARGET_64BIT);
15318 #endif
15319
15320 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15321 }
15322
15323 void
15324 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15325 {
15326 const char *directive = ASM_LONG;
15327
15328 #ifdef ASM_QUAD
15329 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15330 directive = ASM_QUAD;
15331 #else
15332 gcc_assert (!TARGET_64BIT);
15333 #endif
15334 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15335 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15336 fprintf (file, "%s%s%d-%s%d\n",
15337 directive, LPREFIX, value, LPREFIX, rel);
15338 else if (HAVE_AS_GOTOFF_IN_DATA)
15339 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15340 #if TARGET_MACHO
15341 else if (TARGET_MACHO)
15342 {
15343 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15344 machopic_output_function_base_name (file);
15345 putc ('\n', file);
15346 }
15347 #endif
15348 else
15349 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15350 GOT_SYMBOL_NAME, LPREFIX, value);
15351 }
15352 \f
15353 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15354 for the target. */
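/* A minimal sketch of the usual case (illustrative): when optimizing
   for speed, clearing a register emits
   (parallel [(set (reg:SI ax) (const_int 0))
              (clobber (reg:CC flags))])
   which the movsi_xor pattern prints as "xorl %eax, %eax". */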
15355
15356 void
15357 ix86_expand_clear (rtx dest)
15358 {
15359 rtx tmp;
15360
15361 /* We play register width games, which are only valid after reload. */
15362 gcc_assert (reload_completed);
15363
15364 /* Avoid HImode and its attendant prefix byte. */
15365 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15366 dest = gen_rtx_REG (SImode, REGNO (dest));
15367 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15368
15369 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15370 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15371 {
15372 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15373 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15374 }
15375
15376 emit_insn (tmp);
15377 }
15378
15379 /* X is an unchanging MEM. If it is a constant pool reference, return
15380 the constant pool rtx, else NULL. */
15381
15382 rtx
15383 maybe_get_pool_constant (rtx x)
15384 {
15385 x = ix86_delegitimize_address (XEXP (x, 0));
15386
15387 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15388 return get_pool_constant (x);
15389
15390 return NULL_RTX;
15391 }
15392
15393 void
15394 ix86_expand_move (enum machine_mode mode, rtx operands[])
15395 {
15396 rtx op0, op1;
15397 enum tls_model model;
15398
15399 op0 = operands[0];
15400 op1 = operands[1];
15401
15402 if (GET_CODE (op1) == SYMBOL_REF)
15403 {
15404 model = SYMBOL_REF_TLS_MODEL (op1);
15405 if (model)
15406 {
15407 op1 = legitimize_tls_address (op1, model, true);
15408 op1 = force_operand (op1, op0);
15409 if (op1 == op0)
15410 return;
15411 if (GET_MODE (op1) != mode)
15412 op1 = convert_to_mode (mode, op1, 1);
15413 }
15414 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15415 && SYMBOL_REF_DLLIMPORT_P (op1))
15416 op1 = legitimize_dllimport_symbol (op1, false);
15417 }
15418 else if (GET_CODE (op1) == CONST
15419 && GET_CODE (XEXP (op1, 0)) == PLUS
15420 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15421 {
15422 rtx addend = XEXP (XEXP (op1, 0), 1);
15423 rtx symbol = XEXP (XEXP (op1, 0), 0);
15424 rtx tmp = NULL;
15425
15426 model = SYMBOL_REF_TLS_MODEL (symbol);
15427 if (model)
15428 tmp = legitimize_tls_address (symbol, model, true);
15429 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15430 && SYMBOL_REF_DLLIMPORT_P (symbol))
15431 tmp = legitimize_dllimport_symbol (symbol, true);
15432
15433 if (tmp)
15434 {
15435 tmp = force_operand (tmp, NULL);
15436 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15437 op0, 1, OPTAB_DIRECT);
15438 if (tmp == op0)
15439 return;
15440 if (GET_MODE (tmp) != mode)
15441 op1 = convert_to_mode (mode, tmp, 1);
15442 }
15443 }
15444
15445 if ((flag_pic || MACHOPIC_INDIRECT)
15446 && symbolic_operand (op1, mode))
15447 {
15448 if (TARGET_MACHO && !TARGET_64BIT)
15449 {
15450 #if TARGET_MACHO
15451 /* dynamic-no-pic */
15452 if (MACHOPIC_INDIRECT)
15453 {
15454 rtx temp = ((reload_in_progress
15455 || ((op0 && REG_P (op0))
15456 && mode == Pmode))
15457 ? op0 : gen_reg_rtx (Pmode));
15458 op1 = machopic_indirect_data_reference (op1, temp);
15459 if (MACHOPIC_PURE)
15460 op1 = machopic_legitimize_pic_address (op1, mode,
15461 temp == op1 ? 0 : temp);
15462 }
15463 if (op0 != op1 && GET_CODE (op0) != MEM)
15464 {
15465 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15466 emit_insn (insn);
15467 return;
15468 }
15469 if (GET_CODE (op0) == MEM)
15470 op1 = force_reg (Pmode, op1);
15471 else
15472 {
15473 rtx temp = op0;
15474 if (GET_CODE (temp) != REG)
15475 temp = gen_reg_rtx (Pmode);
15476 temp = legitimize_pic_address (op1, temp);
15477 if (temp == op0)
15478 return;
15479 op1 = temp;
15480 }
15481 /* dynamic-no-pic */
15482 #endif
15483 }
15484 else
15485 {
15486 if (MEM_P (op0))
15487 op1 = force_reg (mode, op1);
15488 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15489 {
15490 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15491 op1 = legitimize_pic_address (op1, reg);
15492 if (op0 == op1)
15493 return;
15494 if (GET_MODE (op1) != mode)
15495 op1 = convert_to_mode (mode, op1, 1);
15496 }
15497 }
15498 }
15499 else
15500 {
15501 if (MEM_P (op0)
15502 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15503 || !push_operand (op0, mode))
15504 && MEM_P (op1))
15505 op1 = force_reg (mode, op1);
15506
15507 if (push_operand (op0, mode)
15508 && ! general_no_elim_operand (op1, mode))
15509 op1 = copy_to_mode_reg (mode, op1);
15510
15511 /* Force large constants in 64bit compilation into a register
15512 to get them CSEed. */
15513 if (can_create_pseudo_p ()
15514 && (mode == DImode) && TARGET_64BIT
15515 && immediate_operand (op1, mode)
15516 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15517 && !register_operand (op0, mode)
15518 && optimize)
15519 op1 = copy_to_mode_reg (mode, op1);
15520
15521 if (can_create_pseudo_p ()
15522 && FLOAT_MODE_P (mode)
15523 && GET_CODE (op1) == CONST_DOUBLE)
15524 {
15525 /* If we are loading a floating point constant to a register,
15526 force the value to memory now, since we'll get better code
15527 out the back end. */
15528
15529 op1 = validize_mem (force_const_mem (mode, op1));
15530 if (!register_operand (op0, mode))
15531 {
15532 rtx temp = gen_reg_rtx (mode);
15533 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15534 emit_move_insn (op0, temp);
15535 return;
15536 }
15537 }
15538 }
15539
15540 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15541 }
15542
15543 void
15544 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15545 {
15546 rtx op0 = operands[0], op1 = operands[1];
15547 unsigned int align = GET_MODE_ALIGNMENT (mode);
15548
15549 /* Force constants other than zero into memory. We do not know how
15550 the instructions used to build constants modify the upper 64 bits
15551 of the register; once we have that information we may be able
15552 to handle some of them more efficiently. */
15553 if (can_create_pseudo_p ()
15554 && register_operand (op0, mode)
15555 && (CONSTANT_P (op1)
15556 || (GET_CODE (op1) == SUBREG
15557 && CONSTANT_P (SUBREG_REG (op1))))
15558 && !standard_sse_constant_p (op1))
15559 op1 = validize_mem (force_const_mem (mode, op1));
15560
15561 /* We need to check memory alignment for SSE modes since attributes
15562 can make operands unaligned. */
15563 if (can_create_pseudo_p ()
15564 && SSE_REG_MODE_P (mode)
15565 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15566 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15567 {
15568 rtx tmp[2];
15569
15570 /* ix86_expand_vector_move_misalign() does not like constants ... */
15571 if (CONSTANT_P (op1)
15572 || (GET_CODE (op1) == SUBREG
15573 && CONSTANT_P (SUBREG_REG (op1))))
15574 op1 = validize_mem (force_const_mem (mode, op1));
15575
15576 /* ... nor both arguments in memory. */
15577 if (!register_operand (op0, mode)
15578 && !register_operand (op1, mode))
15579 op1 = force_reg (mode, op1);
15580
15581 tmp[0] = op0; tmp[1] = op1;
15582 ix86_expand_vector_move_misalign (mode, tmp);
15583 return;
15584 }
15585
15586 /* Make operand1 a register if it isn't already. */
15587 if (can_create_pseudo_p ()
15588 && !register_operand (op0, mode)
15589 && !register_operand (op1, mode))
15590 {
15591 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15592 return;
15593 }
15594
15595 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15596 }
15597
15598 /* Split 32-byte AVX unaligned load and store if needed. */
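/* Illustrative sketch: a misaligned V8SFmode load with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD becomes two 16-byte V4SFmode
   loads whose results are recombined with a VEC_CONCAT; the store
   case uses two vextractf128 halves instead. */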
15599
15600 static void
15601 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15602 {
15603 rtx m;
15604 rtx (*extract) (rtx, rtx, rtx);
15605 rtx (*move_unaligned) (rtx, rtx);
15606 enum machine_mode mode;
15607
15608 switch (GET_MODE (op0))
15609 {
15610 default:
15611 gcc_unreachable ();
15612 case V32QImode:
15613 extract = gen_avx_vextractf128v32qi;
15614 move_unaligned = gen_avx_movdqu256;
15615 mode = V16QImode;
15616 break;
15617 case V8SFmode:
15618 extract = gen_avx_vextractf128v8sf;
15619 move_unaligned = gen_avx_movups256;
15620 mode = V4SFmode;
15621 break;
15622 case V4DFmode:
15623 extract = gen_avx_vextractf128v4df;
15624 move_unaligned = gen_avx_movupd256;
15625 mode = V2DFmode;
15626 break;
15627 }
15628
15629 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15630 {
15631 rtx r = gen_reg_rtx (mode);
15632 m = adjust_address (op1, mode, 0);
15633 emit_move_insn (r, m);
15634 m = adjust_address (op1, mode, 16);
15635 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15636 emit_move_insn (op0, r);
15637 }
15638 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15639 {
15640 m = adjust_address (op0, mode, 0);
15641 emit_insn (extract (m, op1, const0_rtx));
15642 m = adjust_address (op0, mode, 16);
15643 emit_insn (extract (m, op1, const1_rtx));
15644 }
15645 else
15646 emit_insn (move_unaligned (op0, op1));
15647 }
15648
15649 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15650 straight to ix86_expand_vector_move. */
15651 /* Code generation for scalar reg-reg moves of single and double precision data:
15652 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15653 movaps reg, reg
15654 else
15655 movss reg, reg
15656 if (x86_sse_partial_reg_dependency == true)
15657 movapd reg, reg
15658 else
15659 movsd reg, reg
15660
15661 Code generation for scalar loads of double precision data:
15662 if (x86_sse_split_regs == true)
15663 movlpd mem, reg (gas syntax)
15664 else
15665 movsd mem, reg
15666
15667 Code generation for unaligned packed loads of single precision data
15668 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15669 if (x86_sse_unaligned_move_optimal)
15670 movups mem, reg
15671
15672 if (x86_sse_partial_reg_dependency == true)
15673 {
15674 xorps reg, reg
15675 movlps mem, reg
15676 movhps mem+8, reg
15677 }
15678 else
15679 {
15680 movlps mem, reg
15681 movhps mem+8, reg
15682 }
15683
15684 Code generation for unaligned packed loads of double precision data
15685 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15686 if (x86_sse_unaligned_move_optimal)
15687 movupd mem, reg
15688
15689 if (x86_sse_split_regs == true)
15690 {
15691 movlpd mem, reg
15692 movhpd mem+8, reg
15693 }
15694 else
15695 {
15696 movsd mem, reg
15697 movhpd mem+8, reg
15698 }
15699 */
15700
15701 void
15702 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15703 {
15704 rtx op0, op1, m;
15705
15706 op0 = operands[0];
15707 op1 = operands[1];
15708
15709 if (TARGET_AVX)
15710 {
15711 switch (GET_MODE_CLASS (mode))
15712 {
15713 case MODE_VECTOR_INT:
15714 case MODE_INT:
15715 switch (GET_MODE_SIZE (mode))
15716 {
15717 case 16:
15718 /* If we're optimizing for size, movups is the smallest. */
15719 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15720 {
15721 op0 = gen_lowpart (V4SFmode, op0);
15722 op1 = gen_lowpart (V4SFmode, op1);
15723 emit_insn (gen_sse_movups (op0, op1));
15724 return;
15725 }
15726 op0 = gen_lowpart (V16QImode, op0);
15727 op1 = gen_lowpart (V16QImode, op1);
15728 emit_insn (gen_sse2_movdqu (op0, op1));
15729 break;
15730 case 32:
15731 op0 = gen_lowpart (V32QImode, op0);
15732 op1 = gen_lowpart (V32QImode, op1);
15733 ix86_avx256_split_vector_move_misalign (op0, op1);
15734 break;
15735 default:
15736 gcc_unreachable ();
15737 }
15738 break;
15739 case MODE_VECTOR_FLOAT:
15740 op0 = gen_lowpart (mode, op0);
15741 op1 = gen_lowpart (mode, op1);
15742
15743 switch (mode)
15744 {
15745 case V4SFmode:
15746 emit_insn (gen_sse_movups (op0, op1));
15747 break;
15748 case V8SFmode:
15749 ix86_avx256_split_vector_move_misalign (op0, op1);
15750 break;
15751 case V2DFmode:
15752 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15753 {
15754 op0 = gen_lowpart (V4SFmode, op0);
15755 op1 = gen_lowpart (V4SFmode, op1);
15756 emit_insn (gen_sse_movups (op0, op1));
15757 return;
15758 }
15759 emit_insn (gen_sse2_movupd (op0, op1));
15760 break;
15761 case V4DFmode:
15762 ix86_avx256_split_vector_move_misalign (op0, op1);
15763 break;
15764 default:
15765 gcc_unreachable ();
15766 }
15767 break;
15768
15769 default:
15770 gcc_unreachable ();
15771 }
15772
15773 return;
15774 }
15775
15776 if (MEM_P (op1))
15777 {
15778 /* If we're optimizing for size, movups is the smallest. */
15779 if (optimize_insn_for_size_p ()
15780 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15781 {
15782 op0 = gen_lowpart (V4SFmode, op0);
15783 op1 = gen_lowpart (V4SFmode, op1);
15784 emit_insn (gen_sse_movups (op0, op1));
15785 return;
15786 }
15787
15788 /* ??? If we have typed data, then it would appear that using
15789 movdqu is the only way to get unaligned data loaded with
15790 integer type. */
15791 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15792 {
15793 op0 = gen_lowpart (V16QImode, op0);
15794 op1 = gen_lowpart (V16QImode, op1);
15795 emit_insn (gen_sse2_movdqu (op0, op1));
15796 return;
15797 }
15798
15799 if (TARGET_SSE2 && mode == V2DFmode)
15800 {
15801 rtx zero;
15802
15803 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15804 {
15805 op0 = gen_lowpart (V2DFmode, op0);
15806 op1 = gen_lowpart (V2DFmode, op1);
15807 emit_insn (gen_sse2_movupd (op0, op1));
15808 return;
15809 }
15810
15811 /* When SSE registers are split into halves, we can avoid
15812 writing to the top half twice. */
15813 if (TARGET_SSE_SPLIT_REGS)
15814 {
15815 emit_clobber (op0);
15816 zero = op0;
15817 }
15818 else
15819 {
15820 /* ??? Not sure about the best option for the Intel chips.
15821 The following would seem to satisfy; the register is
15822 entirely cleared, breaking the dependency chain. We
15823 then store to the upper half, with a dependency depth
15824 of one. A rumor has it that Intel recommends two movsd
15825 followed by an unpacklpd, but this is unconfirmed. And
15826 given that the dependency depth of the unpacklpd would
15827 still be one, I'm not sure why this would be better. */
15828 zero = CONST0_RTX (V2DFmode);
15829 }
15830
15831 m = adjust_address (op1, DFmode, 0);
15832 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15833 m = adjust_address (op1, DFmode, 8);
15834 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15835 }
15836 else
15837 {
15838 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15839 {
15840 op0 = gen_lowpart (V4SFmode, op0);
15841 op1 = gen_lowpart (V4SFmode, op1);
15842 emit_insn (gen_sse_movups (op0, op1));
15843 return;
15844 }
15845
15846 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15847 emit_move_insn (op0, CONST0_RTX (mode));
15848 else
15849 emit_clobber (op0);
15850
15851 if (mode != V4SFmode)
15852 op0 = gen_lowpart (V4SFmode, op0);
15853 m = adjust_address (op1, V2SFmode, 0);
15854 emit_insn (gen_sse_loadlps (op0, op0, m));
15855 m = adjust_address (op1, V2SFmode, 8);
15856 emit_insn (gen_sse_loadhps (op0, op0, m));
15857 }
15858 }
15859 else if (MEM_P (op0))
15860 {
15861 /* If we're optimizing for size, movups is the smallest. */
15862 if (optimize_insn_for_size_p ()
15863 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15864 {
15865 op0 = gen_lowpart (V4SFmode, op0);
15866 op1 = gen_lowpart (V4SFmode, op1);
15867 emit_insn (gen_sse_movups (op0, op1));
15868 return;
15869 }
15870
15871 /* ??? Similar to above, only less clear because of quote
15872 typeless stores unquote. */
15873 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15874 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15875 {
15876 op0 = gen_lowpart (V16QImode, op0);
15877 op1 = gen_lowpart (V16QImode, op1);
15878 emit_insn (gen_sse2_movdqu (op0, op1));
15879 return;
15880 }
15881
15882 if (TARGET_SSE2 && mode == V2DFmode)
15883 {
15884 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15885 {
15886 op0 = gen_lowpart (V2DFmode, op0);
15887 op1 = gen_lowpart (V2DFmode, op1);
15888 emit_insn (gen_sse2_movupd (op0, op1));
15889 }
15890 else
15891 {
15892 m = adjust_address (op0, DFmode, 0);
15893 emit_insn (gen_sse2_storelpd (m, op1));
15894 m = adjust_address (op0, DFmode, 8);
15895 emit_insn (gen_sse2_storehpd (m, op1));
15896 }
15897 }
15898 else
15899 {
15900 if (mode != V4SFmode)
15901 op1 = gen_lowpart (V4SFmode, op1);
15902
15903 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15904 {
15905 op0 = gen_lowpart (V4SFmode, op0);
15906 emit_insn (gen_sse_movups (op0, op1));
15907 }
15908 else
15909 {
15910 m = adjust_address (op0, V2SFmode, 0);
15911 emit_insn (gen_sse_storelps (m, op1));
15912 m = adjust_address (op0, V2SFmode, 8);
15913 emit_insn (gen_sse_storehps (m, op1));
15914 }
15915 }
15916 }
15917 else
15918 gcc_unreachable ();
15919 }
15920
15921 /* Expand a push in MODE. This is some mode for which we do not support
15922 proper push instructions, at least from the registers that we expect
15923 the value to live in. */
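/* Minimal sketch of the expansion (illustrative): for a mode of size N
   this emits "sp = sp - N" via expand_simple_binop and then stores X
   into (mem:MODE sp), instead of relying on a real push instruction. */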
15924
15925 void
15926 ix86_expand_push (enum machine_mode mode, rtx x)
15927 {
15928 rtx tmp;
15929
15930 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15931 GEN_INT (-GET_MODE_SIZE (mode)),
15932 stack_pointer_rtx, 1, OPTAB_DIRECT);
15933 if (tmp != stack_pointer_rtx)
15934 emit_move_insn (stack_pointer_rtx, tmp);
15935
15936 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15937
15938 /* When we push an operand onto the stack, it has to be aligned at least
15939 at the function argument boundary. However, since we don't have
15940 the argument type, we can't determine the actual argument
15941 boundary. */
15942 emit_move_insn (tmp, x);
15943 }
15944
15945 /* Helper function of ix86_fixup_binary_operands to canonicalize
15946 operand order. Returns true if the operands should be swapped. */
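/* Example (illustrative): for (plus:SI (const_int 3) (reg:SI %eax))
   the constant ends up as the second source, so the resulting add can
   use an immediate operand and, where possible, src1 matches the
   destination. */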
15947
15948 static bool
15949 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15950 rtx operands[])
15951 {
15952 rtx dst = operands[0];
15953 rtx src1 = operands[1];
15954 rtx src2 = operands[2];
15955
15956 /* If the operation is not commutative, we can't do anything. */
15957 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15958 return false;
15959
15960 /* Highest priority is that src1 should match dst. */
15961 if (rtx_equal_p (dst, src1))
15962 return false;
15963 if (rtx_equal_p (dst, src2))
15964 return true;
15965
15966 /* Next highest priority is that immediate constants come second. */
15967 if (immediate_operand (src2, mode))
15968 return false;
15969 if (immediate_operand (src1, mode))
15970 return true;
15971
15972 /* Lowest priority is that memory references should come second. */
15973 if (MEM_P (src2))
15974 return false;
15975 if (MEM_P (src1))
15976 return true;
15977
15978 return false;
15979 }
15980
15981
15982 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15983 destination to use for the operation. If different from the true
15984 destination in operands[0], a copy operation will be required. */
15985
15986 rtx
15987 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15988 rtx operands[])
15989 {
15990 rtx dst = operands[0];
15991 rtx src1 = operands[1];
15992 rtx src2 = operands[2];
15993
15994 /* Canonicalize operand order. */
15995 if (ix86_swap_binary_operands_p (code, mode, operands))
15996 {
15997 rtx temp;
15998
15999 /* It is invalid to swap operands of different modes. */
16000 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16001
16002 temp = src1;
16003 src1 = src2;
16004 src2 = temp;
16005 }
16006
16007 /* Both source operands cannot be in memory. */
16008 if (MEM_P (src1) && MEM_P (src2))
16009 {
16010 /* Optimization: Only read from memory once. */
16011 if (rtx_equal_p (src1, src2))
16012 {
16013 src2 = force_reg (mode, src2);
16014 src1 = src2;
16015 }
16016 else
16017 src2 = force_reg (mode, src2);
16018 }
16019
16020 /* If the destination is memory, and we do not have matching source
16021 operands, do things in registers. */
16022 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16023 dst = gen_reg_rtx (mode);
16024
16025 /* Source 1 cannot be a constant. */
16026 if (CONSTANT_P (src1))
16027 src1 = force_reg (mode, src1);
16028
16029 /* Source 1 cannot be a non-matching memory. */
16030 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16031 src1 = force_reg (mode, src1);
16032
16033 /* Improve address combine. */
16034 if (code == PLUS
16035 && GET_MODE_CLASS (mode) == MODE_INT
16036 && MEM_P (src2))
16037 src2 = force_reg (mode, src2);
16038
16039 operands[1] = src1;
16040 operands[2] = src2;
16041 return dst;
16042 }
16043
16044 /* Similarly, but assume that the destination has already been
16045 set up properly. */
16046
16047 void
16048 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16049 enum machine_mode mode, rtx operands[])
16050 {
16051 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16052 gcc_assert (dst == operands[0]);
16053 }
16054
16055 /* Attempt to expand a binary operator. Make the expansion closer to the
16056 actual machine than just general_operand, which would allow 3 separate
16057 memory references (one output, two inputs) in a single insn. */
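/* Hypothetical example: for "mem1 = mem2 + mem3" one memory source is
   forced into a register, and if the destination does not already
   match the remaining source the operation is carried out in a fresh
   register that is then stored back to mem1. */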
16058
16059 void
16060 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16061 rtx operands[])
16062 {
16063 rtx src1, src2, dst, op, clob;
16064
16065 dst = ix86_fixup_binary_operands (code, mode, operands);
16066 src1 = operands[1];
16067 src2 = operands[2];
16068
16069 /* Emit the instruction. */
16070
16071 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16072 if (reload_in_progress)
16073 {
16074 /* Reload doesn't know about the flags register, and doesn't know that
16075 it doesn't want to clobber it. We can only do this with PLUS. */
16076 gcc_assert (code == PLUS);
16077 emit_insn (op);
16078 }
16079 else if (reload_completed
16080 && code == PLUS
16081 && !rtx_equal_p (dst, src1))
16082 {
16083 /* This is going to be an LEA; avoid splitting it later. */
16084 emit_insn (op);
16085 }
16086 else
16087 {
16088 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16089 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16090 }
16091
16092 /* Fix up the destination if needed. */
16093 if (dst != operands[0])
16094 emit_move_insn (operands[0], dst);
16095 }
16096
16097 /* Return TRUE or FALSE depending on whether the binary operator meets the
16098 appropriate constraints. */
16099
16100 bool
16101 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16102 rtx operands[3])
16103 {
16104 rtx dst = operands[0];
16105 rtx src1 = operands[1];
16106 rtx src2 = operands[2];
16107
16108 /* Both source operands cannot be in memory. */
16109 if (MEM_P (src1) && MEM_P (src2))
16110 return false;
16111
16112 /* Canonicalize operand order for commutative operators. */
16113 if (ix86_swap_binary_operands_p (code, mode, operands))
16114 {
16115 rtx temp = src1;
16116 src1 = src2;
16117 src2 = temp;
16118 }
16119
16120 /* If the destination is memory, we must have a matching source operand. */
16121 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16122 return false;
16123
16124 /* Source 1 cannot be a constant. */
16125 if (CONSTANT_P (src1))
16126 return false;
16127
16128 /* Source 1 cannot be a non-matching memory. */
16129 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16130 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16131 return (code == AND
16132 && (mode == HImode
16133 || mode == SImode
16134 || (TARGET_64BIT && mode == DImode))
16135 && satisfies_constraint_L (src2));
16136
16137 return true;
16138 }
16139
16140 /* Attempt to expand a unary operator. Make the expansion closer to the
16141 actual machine than just general_operand, which would allow 2 separate
16142 memory references (one output, one input) in a single insn. */
16143
16144 void
16145 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16146 rtx operands[])
16147 {
16148 int matching_memory;
16149 rtx src, dst, op, clob;
16150
16151 dst = operands[0];
16152 src = operands[1];
16153
16154 /* If the destination is memory, and we do not have matching source
16155 operands, do things in registers. */
16156 matching_memory = 0;
16157 if (MEM_P (dst))
16158 {
16159 if (rtx_equal_p (dst, src))
16160 matching_memory = 1;
16161 else
16162 dst = gen_reg_rtx (mode);
16163 }
16164
16165 /* When source operand is memory, destination must match. */
16166 if (MEM_P (src) && !matching_memory)
16167 src = force_reg (mode, src);
16168
16169 /* Emit the instruction. */
16170
16171 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16172 if (reload_in_progress || code == NOT)
16173 {
16174 /* Reload doesn't know about the flags register, and doesn't know that
16175 it doesn't want to clobber it. */
16176 gcc_assert (code == NOT);
16177 emit_insn (op);
16178 }
16179 else
16180 {
16181 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16182 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16183 }
16184
16185 /* Fix up the destination if needed. */
16186 if (dst != operands[0])
16187 emit_move_insn (operands[0], dst);
16188 }
16189
16190 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16191 divisor are within the range [0-255]. */
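/* Sketch of the generated fast path (illustrative): the dividend and
   divisor are IORed into a scratch register and masked with -0x100 to
   check that no bits above the low byte are set; if both fit in 8 bits
   we branch to an 8-bit unsigned divide (udivmodhiqi3, i.e. "divb"),
   taking the quotient from AL and the remainder from AH. */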
16192
16193 void
16194 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16195 bool signed_p)
16196 {
16197 rtx end_label, qimode_label;
16198 rtx insn, div, mod;
16199 rtx scratch, tmp0, tmp1, tmp2;
16200 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16201 rtx (*gen_zero_extend) (rtx, rtx);
16202 rtx (*gen_test_ccno_1) (rtx, rtx);
16203
16204 switch (mode)
16205 {
16206 case SImode:
16207 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16208 gen_test_ccno_1 = gen_testsi_ccno_1;
16209 gen_zero_extend = gen_zero_extendqisi2;
16210 break;
16211 case DImode:
16212 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16213 gen_test_ccno_1 = gen_testdi_ccno_1;
16214 gen_zero_extend = gen_zero_extendqidi2;
16215 break;
16216 default:
16217 gcc_unreachable ();
16218 }
16219
16220 end_label = gen_label_rtx ();
16221 qimode_label = gen_label_rtx ();
16222
16223 scratch = gen_reg_rtx (mode);
16224
16225 /* Use 8bit unsigned divmod if dividend and divisor are within
16226 the range [0-255]. */
16227 emit_move_insn (scratch, operands[2]);
16228 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16229 scratch, 1, OPTAB_DIRECT);
16230 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16231 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16232 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16233 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16234 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16235 pc_rtx);
16236 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16237 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16238 JUMP_LABEL (insn) = qimode_label;
16239
16240 /* Generate the original signed/unsigned divmod. */
16241 div = gen_divmod4_1 (operands[0], operands[1],
16242 operands[2], operands[3]);
16243 emit_insn (div);
16244
16245 /* Branch to the end. */
16246 emit_jump_insn (gen_jump (end_label));
16247 emit_barrier ();
16248
16249 /* Generate 8bit unsigned divide. */
16250 emit_label (qimode_label);
16251 /* Don't use operands[0] for result of 8bit divide since not all
16252 registers support QImode ZERO_EXTRACT. */
16253 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16254 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16255 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16256 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16257
16258 if (signed_p)
16259 {
16260 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16261 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16262 }
16263 else
16264 {
16265 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16266 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16267 }
16268
16269 /* Extract remainder from AH. */
16270 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16271 if (REG_P (operands[1]))
16272 insn = emit_move_insn (operands[1], tmp1);
16273 else
16274 {
16275 /* Need a new scratch register since the old one has result
16276 of 8bit divide. */
16277 scratch = gen_reg_rtx (mode);
16278 emit_move_insn (scratch, tmp1);
16279 insn = emit_move_insn (operands[1], scratch);
16280 }
16281 set_unique_reg_note (insn, REG_EQUAL, mod);
16282
16283 /* Zero extend quotient from AL. */
16284 tmp1 = gen_lowpart (QImode, tmp0);
16285 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16286 set_unique_reg_note (insn, REG_EQUAL, div);
16287
16288 emit_label (end_label);
16289 }
16290
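/* LEA_MAX_STALL is expressed in cycles, while the distances computed below
 are tracked in half-cycles; the search threshold is therefore doubled. */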
16291 #define LEA_MAX_STALL (3)
16292 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16293
16294 /* Increase given DISTANCE in half-cycles according to
16295 dependencies between PREV and NEXT instructions.
16296 Add 1 half-cycle if there is no dependency and
16297 go to the next cycle if there is some dependency. */
16298
16299 static unsigned int
16300 increase_distance (rtx prev, rtx next, unsigned int distance)
16301 {
16302 df_ref *use_rec;
16303 df_ref *def_rec;
16304
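/* Adding (distance & 1) first rounds DISTANCE up to an even value, i.e. to
 the start of the next full cycle, before the extra cycle (two half-cycles)
 is added. */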
16305 if (!prev || !next)
16306 return distance + (distance & 1) + 2;
16307
16308 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16309 return distance + 1;
16310
16311 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16312 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16313 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16314 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16315 return distance + (distance & 1) + 2;
16316
16317 return distance + 1;
16318 }
16319
16320 /* Function checks if instruction INSN defines register number
16321 REGNO1 or REGNO2. */
16322
16323 static bool
16324 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16325 rtx insn)
16326 {
16327 df_ref *def_rec;
16328
16329 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16330 if (DF_REF_REG_DEF_P (*def_rec)
16331 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16332 && (regno1 == DF_REF_REGNO (*def_rec)
16333 || regno2 == DF_REF_REGNO (*def_rec)))
16334 {
16335 return true;
16336 }
16337
16338 return false;
16339 }
16340
16341 /* Function checks if instruction INSN uses register number
16342 REGNO as a part of address expression. */
16343
16344 static bool
16345 insn_uses_reg_mem (unsigned int regno, rtx insn)
16346 {
16347 df_ref *use_rec;
16348
16349 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16350 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16351 return true;
16352
16353 return false;
16354 }
16355
16356 /* Search backward for non-agu definition of register number REGNO1
16357 or register number REGNO2 in basic block starting from instruction
16358 START up to head of basic block or instruction INSN.
16359
16360 Function puts true value into *FOUND var if definition was found
16361 and false otherwise.
16362
16363 Distance in half-cycles between START and found instruction or head
16364 of BB is added to DISTANCE and returned. */
16365
16366 static int
16367 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16368 rtx insn, int distance,
16369 rtx start, bool *found)
16370 {
16371 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16372 rtx prev = start;
16373 rtx next = NULL;
16374
16375 *found = false;
16376
16377 while (prev
16378 && prev != insn
16379 && distance < LEA_SEARCH_THRESHOLD)
16380 {
16381 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16382 {
16383 distance = increase_distance (prev, next, distance);
16384 if (insn_defines_reg (regno1, regno2, prev))
16385 {
16386 if (recog_memoized (prev) < 0
16387 || get_attr_type (prev) != TYPE_LEA)
16388 {
16389 *found = true;
16390 return distance;
16391 }
16392 }
16393
16394 next = prev;
16395 }
16396 if (prev == BB_HEAD (bb))
16397 break;
16398
16399 prev = PREV_INSN (prev);
16400 }
16401
16402 return distance;
16403 }
16404
16405 /* Search backward for non-agu definition of register number REGNO1
16406 or register number REGNO2 in INSN's basic block until
16407 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16408 2. Reach a neighbouring BB's boundary, or
16409 3. Reach agu definition.
16410 Returns the distance between the non-agu definition point and INSN.
16411 If no definition point, returns -1. */
16412
16413 static int
16414 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16415 rtx insn)
16416 {
16417 basic_block bb = BLOCK_FOR_INSN (insn);
16418 int distance = 0;
16419 bool found = false;
16420
16421 if (insn != BB_HEAD (bb))
16422 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16423 distance, PREV_INSN (insn),
16424 &found);
16425
16426 if (!found && distance < LEA_SEARCH_THRESHOLD)
16427 {
16428 edge e;
16429 edge_iterator ei;
16430 bool simple_loop = false;
16431
16432 FOR_EACH_EDGE (e, ei, bb->preds)
16433 if (e->src == bb)
16434 {
16435 simple_loop = true;
16436 break;
16437 }
16438
16439 if (simple_loop)
16440 distance = distance_non_agu_define_in_bb (regno1, regno2,
16441 insn, distance,
16442 BB_END (bb), &found);
16443 else
16444 {
16445 int shortest_dist = -1;
16446 bool found_in_bb = false;
16447
16448 FOR_EACH_EDGE (e, ei, bb->preds)
16449 {
16450 int bb_dist
16451 = distance_non_agu_define_in_bb (regno1, regno2,
16452 insn, distance,
16453 BB_END (e->src),
16454 &found_in_bb);
16455 if (found_in_bb)
16456 {
16457 if (shortest_dist < 0)
16458 shortest_dist = bb_dist;
16459 else if (bb_dist > 0)
16460 shortest_dist = MIN (bb_dist, shortest_dist);
16461
16462 found = true;
16463 }
16464 }
16465
16466 distance = shortest_dist;
16467 }
16468 }
16469
16470 /* get_attr_type may modify recog data. We want to make sure
16471 that recog data is valid for instruction INSN, on which
16472 distance_non_agu_define is called. INSN is unchanged here. */
16473 extract_insn_cached (insn);
16474
16475 if (!found)
16476 return -1;
16477
16478 return distance >> 1;
16479 }
16480
16481 /* Return the distance in half-cycles between INSN and the next
16482 insn that uses register number REGNO in memory address added
16483 to DISTANCE. Return -1 if REGNO is set.
16484
16485 Put true value into *FOUND if register usage was found and
16486 false otherwise.
16487 Put true value into *REDEFINED if register redefinition was
16488 found and false otherwise. */
16489
16490 static int
16491 distance_agu_use_in_bb (unsigned int regno,
16492 rtx insn, int distance, rtx start,
16493 bool *found, bool *redefined)
16494 {
16495 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16496 rtx next = start;
16497 rtx prev = NULL;
16498
16499 *found = false;
16500 *redefined = false;
16501
16502 while (next
16503 && next != insn
16504 && distance < LEA_SEARCH_THRESHOLD)
16505 {
16506 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16507 {
16508 distance = increase_distance(prev, next, distance);
16509 if (insn_uses_reg_mem (regno, next))
16510 {
16511 /* Return DISTANCE if OP0 is used in memory
16512 address in NEXT. */
16513 *found = true;
16514 return distance;
16515 }
16516
16517 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16518 {
16519 /* Return -1 if OP0 is set in NEXT. */
16520 *redefined = true;
16521 return -1;
16522 }
16523
16524 prev = next;
16525 }
16526
16527 if (next == BB_END (bb))
16528 break;
16529
16530 next = NEXT_INSN (next);
16531 }
16532
16533 return distance;
16534 }
16535
16536 /* Return the distance between INSN and the next insn that uses
16537 register number REGNO0 in memory address. Return -1 if no such
16538 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16539
16540 static int
16541 distance_agu_use (unsigned int regno0, rtx insn)
16542 {
16543 basic_block bb = BLOCK_FOR_INSN (insn);
16544 int distance = 0;
16545 bool found = false;
16546 bool redefined = false;
16547
16548 if (insn != BB_END (bb))
16549 distance = distance_agu_use_in_bb (regno0, insn, distance,
16550 NEXT_INSN (insn),
16551 &found, &redefined);
16552
16553 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16554 {
16555 edge e;
16556 edge_iterator ei;
16557 bool simple_loop = false;
16558
16559 FOR_EACH_EDGE (e, ei, bb->succs)
16560 if (e->dest == bb)
16561 {
16562 simple_loop = true;
16563 break;
16564 }
16565
16566 if (simple_loop)
16567 distance = distance_agu_use_in_bb (regno0, insn,
16568 distance, BB_HEAD (bb),
16569 &found, &redefined);
16570 else
16571 {
16572 int shortest_dist = -1;
16573 bool found_in_bb = false;
16574 bool redefined_in_bb = false;
16575
16576 FOR_EACH_EDGE (e, ei, bb->succs)
16577 {
16578 int bb_dist
16579 = distance_agu_use_in_bb (regno0, insn,
16580 distance, BB_HEAD (e->dest),
16581 &found_in_bb, &redefined_in_bb);
16582 if (found_in_bb)
16583 {
16584 if (shortest_dist < 0)
16585 shortest_dist = bb_dist;
16586 else if (bb_dist > 0)
16587 shortest_dist = MIN (bb_dist, shortest_dist);
16588
16589 found = true;
16590 }
16591 }
16592
16593 distance = shortest_dist;
16594 }
16595 }
16596
16597 if (!found || redefined)
16598 return -1;
16599
16600 return distance >> 1;
16601 }
16602
16603 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16604 there is a dilemma of choosing LEA or ADD.
16605 Negative value: ADD is preferred over LEA
16606 Zero: Neutral
16607 Positive value: LEA is preferred over ADD */
16608 #define IX86_LEA_PRIORITY 0
16609
16610 /* Return true if using the lea INSN has a performance advantage
16611 over a sequence of instructions. The instruction sequence has
16612 SPLIT_COST cycles higher latency than the lea. */
16613
16614 bool
16615 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16616 unsigned int regno2, unsigned int split_cost)
16617 {
16618 int dist_define, dist_use;
16619
16620 dist_define = distance_non_agu_define (regno1, regno2, insn);
16621 dist_use = distance_agu_use (regno0, insn);
16622
16623 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16624 {
16625 /* If there is no non AGU operand definition, no AGU
16626 operand usage and split cost is 0 then both lea
16627 and non lea variants have the same priority. Currently
16628 we prefer lea for 64 bit code and non lea on 32 bit
16629 code. */
16630 if (dist_use < 0 && split_cost == 0)
16631 return TARGET_64BIT || IX86_LEA_PRIORITY;
16632 else
16633 return true;
16634 }
16635
16636 /* With a longer definition distance, lea is preferable.
16637 Here we change it to take into account splitting cost and
16638 lea priority. */
16639 dist_define += split_cost + IX86_LEA_PRIORITY;
16640
16641 /* If there is no use in a memory address then we just check
16642 that split cost does not exceed AGU stall. */
16643 if (dist_use < 0)
16644 return dist_define >= LEA_MAX_STALL;
16645
16646 /* If this insn has both backward non-agu dependence and forward
16647 agu dependence, the one with the shorter distance takes effect. */
16648 return dist_define >= dist_use;
16649 }
16650
16651 /* Return true if it is legal to clobber flags by INSN and
16652 false otherwise. */
16653
16654 static bool
16655 ix86_ok_to_clobber_flags (rtx insn)
16656 {
16657 basic_block bb = BLOCK_FOR_INSN (insn);
16658 df_ref *use;
16659 bitmap live;
16660
16661 while (insn)
16662 {
16663 if (NONDEBUG_INSN_P (insn))
16664 {
16665 for (use = DF_INSN_USES (insn); *use; use++)
16666 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16667 return false;
16668
16669 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16670 return true;
16671 }
16672
16673 if (insn == BB_END (bb))
16674 break;
16675
16676 insn = NEXT_INSN (insn);
16677 }
16678
16679 live = df_get_live_out(bb);
16680 return !REGNO_REG_SET_P (live, FLAGS_REG);
16681 }
16682
16683 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16684 move and add to avoid AGU stalls. */
16685
16686 bool
16687 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16688 {
16689 unsigned int regno0 = true_regnum (operands[0]);
16690 unsigned int regno1 = true_regnum (operands[1]);
16691 unsigned int regno2 = true_regnum (operands[2]);
16692
16693 /* Check if we need to optimize. */
16694 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16695 return false;
16696
16697 /* Check it is correct to split here. */
16698 if (!ix86_ok_to_clobber_flags(insn))
16699 return false;
16700
16701 /* We only need to split adds with a non-destructive
16702 destination operand. */
16703 if (regno0 == regno1 || regno0 == regno2)
16704 return false;
16705 else
16706 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16707 }
16708
16709 /* Return true if we should emit lea instruction instead of mov
16710 instruction. */
16711
16712 bool
16713 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16714 {
16715 unsigned int regno0;
16716 unsigned int regno1;
16717
16718 /* Check if we need to optimize. */
16719 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16720 return false;
16721
16722 /* Use lea for reg to reg moves only. */
16723 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16724 return false;
16725
16726 regno0 = true_regnum (operands[0]);
16727 regno1 = true_regnum (operands[1]);
16728
16729 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16730 }
16731
16732 /* Return true if we need to split lea into a sequence of
16733 instructions to avoid AGU stalls. */
16734
16735 bool
16736 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16737 {
16738 unsigned int regno0 = true_regnum (operands[0]) ;
16739 unsigned int regno1 = -1;
16740 unsigned int regno2 = -1;
16741 unsigned int split_cost = 0;
16742 struct ix86_address parts;
16743 int ok;
16744
16745 /* Check if we need to optimize. */
16746 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16747 return false;
16748
16749 /* Check it is correct to split here. */
16750 if (!ix86_ok_to_clobber_flags(insn))
16751 return false;
16752
16753 ok = ix86_decompose_address (operands[1], &parts);
16754 gcc_assert (ok);
16755
16756 /* We should not split into add if a non-legitimate PIC
16757 operand is used as the displacement. */
16758 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16759 return false;
16760
16761 if (parts.base)
16762 regno1 = true_regnum (parts.base);
16763 if (parts.index)
16764 regno2 = true_regnum (parts.index);
16765
16766 /* Compute how many cycles we will add to execution time
16767 if we split the lea into a sequence of instructions. */
16768 if (parts.base || parts.index)
16769 {
16770 /* Have to use a mov instruction if the non-destructive
16771 destination form is used. */
16772 if (regno1 != regno0 && regno2 != regno0)
16773 split_cost += 1;
16774
16775 /* Have to add index to base if both exist. */
16776 if (parts.base && parts.index)
16777 split_cost += 1;
16778
16779 /* Have to use shift and adds if scale is 2 or greater. */
16780 if (parts.scale > 1)
16781 {
16782 if (regno0 != regno1)
16783 split_cost += 1;
16784 else if (regno2 == regno0)
16785 split_cost += 4;
16786 else
16787 split_cost += parts.scale;
16788 }
16789
16790 /* Have to use an add instruction with an immediate if
16791 disp is nonzero. */
16792 if (parts.disp && parts.disp != const0_rtx)
16793 split_cost += 1;
16794
16795 /* Subtract the price of lea. */
16796 split_cost -= 1;
16797 }
16798
16799 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16800 }
16801
16802 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16803 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16804
16805 static void
16806 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16807 rtx dst, rtx src)
16808 {
16809 rtx op, clob;
16810
16811 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16812 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16813
16814 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16815 }
16816
16817 /* Split a lea instruction into a sequence of instructions
16818 which are executed on the ALU to avoid AGU stalls.
16819 It is assumed that it is allowed to clobber the flags register
16820 at the lea position. */
16821
16822 extern void
16823 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16824 {
16825 unsigned int regno0 = true_regnum (operands[0]) ;
16826 unsigned int regno1 = INVALID_REGNUM;
16827 unsigned int regno2 = INVALID_REGNUM;
16828 struct ix86_address parts;
16829 rtx tmp;
16830 int ok, adds;
16831
16832 ok = ix86_decompose_address (operands[1], &parts);
16833 gcc_assert (ok);
16834
16835 if (parts.base)
16836 {
16837 if (GET_MODE (parts.base) != mode)
16838 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16839 regno1 = true_regnum (parts.base);
16840 }
16841
16842 if (parts.index)
16843 {
16844 if (GET_MODE (parts.index) != mode)
16845 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16846 regno2 = true_regnum (parts.index);
16847 }
16848
16849 if (parts.scale > 1)
16850 {
16851 /* Case r1 = r1 + ... */
16852 if (regno1 == regno0)
16853 {
16854 /* If we have a case r1 = r1 + C * r1 then we
16855 should use multiplication which is very
16856 expensive. Assume the cost model is wrong if we
16857 end up with such a case here. */
16858 gcc_assert (regno2 != regno0);
16859
16860 for (adds = parts.scale; adds > 0; adds--)
16861 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16862 }
16863 else
16864 {
16865 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16866 if (regno0 != regno2)
16867 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16868
16869 /* Use shift for scaling. */
16870 ix86_emit_binop (ASHIFT, mode, operands[0],
16871 GEN_INT (exact_log2 (parts.scale)));
16872
16873 if (parts.base)
16874 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16875
16876 if (parts.disp && parts.disp != const0_rtx)
16877 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16878 }
16879 }
16880 else if (!parts.base && !parts.index)
16881 {
16882 gcc_assert(parts.disp);
16883 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16884 }
16885 else
16886 {
16887 if (!parts.base)
16888 {
16889 if (regno0 != regno2)
16890 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16891 }
16892 else if (!parts.index)
16893 {
16894 if (regno0 != regno1)
16895 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16896 }
16897 else
16898 {
16899 if (regno0 == regno1)
16900 tmp = parts.index;
16901 else if (regno0 == regno2)
16902 tmp = parts.base;
16903 else
16904 {
16905 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16906 tmp = parts.index;
16907 }
16908
16909 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16910 }
16911
16912 if (parts.disp && parts.disp != const0_rtx)
16913 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16914 }
16915 }
16916
16917 /* Return true if it is ok to optimize an ADD operation to LEA
16918 operation to avoid flag register consumption. For most processors,
16919 ADD is faster than LEA. For processors like ATOM, if the
16920 destination register of LEA holds an actual address which will be
16921 used soon, LEA is better; otherwise ADD is better. */
16922
16923 bool
16924 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16925 {
16926 unsigned int regno0 = true_regnum (operands[0]);
16927 unsigned int regno1 = true_regnum (operands[1]);
16928 unsigned int regno2 = true_regnum (operands[2]);
16929
16930 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16931 if (regno0 != regno1 && regno0 != regno2)
16932 return true;
16933
16934 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16935 return false;
16936
16937 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16938 }
16939
16940 /* Return true if destination reg of SET_BODY is shift count of
16941 USE_BODY. */
16942
16943 static bool
16944 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16945 {
16946 rtx set_dest;
16947 rtx shift_rtx;
16948 int i;
16949
16950 /* Retrieve destination of SET_BODY. */
16951 switch (GET_CODE (set_body))
16952 {
16953 case SET:
16954 set_dest = SET_DEST (set_body);
16955 if (!set_dest || !REG_P (set_dest))
16956 return false;
16957 break;
16958 case PARALLEL:
16959 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16960 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16961 use_body))
16962 return true;
16963 default:
16964 return false;
16965 break;
16966 }
16967
16968 /* Retrieve shift count of USE_BODY. */
16969 switch (GET_CODE (use_body))
16970 {
16971 case SET:
16972 shift_rtx = XEXP (use_body, 1);
16973 break;
16974 case PARALLEL:
16975 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16976 if (ix86_dep_by_shift_count_body (set_body,
16977 XVECEXP (use_body, 0, i)))
16978 return true;
16979 default:
16980 return false;
16981 break;
16982 }
16983
16984 if (shift_rtx
16985 && (GET_CODE (shift_rtx) == ASHIFT
16986 || GET_CODE (shift_rtx) == LSHIFTRT
16987 || GET_CODE (shift_rtx) == ASHIFTRT
16988 || GET_CODE (shift_rtx) == ROTATE
16989 || GET_CODE (shift_rtx) == ROTATERT))
16990 {
16991 rtx shift_count = XEXP (shift_rtx, 1);
16992
16993 /* Return true if shift count is dest of SET_BODY. */
16994 if (REG_P (shift_count)
16995 && true_regnum (set_dest) == true_regnum (shift_count))
16996 return true;
16997 }
16998
16999 return false;
17000 }
17001
17002 /* Return true if destination reg of SET_INSN is shift count of
17003 USE_INSN. */
17004
17005 bool
17006 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17007 {
17008 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17009 PATTERN (use_insn));
17010 }
17011
17012 /* Return TRUE or FALSE depending on whether the unary operator meets the
17013 appropriate constraints. */
17014
17015 bool
17016 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17017 enum machine_mode mode ATTRIBUTE_UNUSED,
17018 rtx operands[2] ATTRIBUTE_UNUSED)
17019 {
17020 /* If one of operands is memory, source and destination must match. */
17021 if ((MEM_P (operands[0])
17022 || MEM_P (operands[1]))
17023 && ! rtx_equal_p (operands[0], operands[1]))
17024 return false;
17025 return true;
17026 }
17027
17028 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17029 are ok, keeping in mind the possible movddup alternative. */
17030
17031 bool
17032 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17033 {
17034 if (MEM_P (operands[0]))
17035 return rtx_equal_p (operands[0], operands[1 + high]);
17036 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17037 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17038 return true;
17039 }
17040
17041 /* Post-reload splitter for converting an SF or DFmode value in an
17042 SSE register into an unsigned SImode. */
17043
17044 void
17045 ix86_split_convert_uns_si_sse (rtx operands[])
17046 {
17047 enum machine_mode vecmode;
17048 rtx value, large, zero_or_two31, input, two31, x;
17049
17050 large = operands[1];
17051 zero_or_two31 = operands[2];
17052 input = operands[3];
17053 two31 = operands[4];
17054 vecmode = GET_MODE (large);
17055 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17056
17057 /* Load up the value into the low element. We must ensure that the other
17058 elements are valid floats -- zero is the easiest such value. */
17059 if (MEM_P (input))
17060 {
17061 if (vecmode == V4SFmode)
17062 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17063 else
17064 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17065 }
17066 else
17067 {
17068 input = gen_rtx_REG (vecmode, REGNO (input));
17069 emit_move_insn (value, CONST0_RTX (vecmode));
17070 if (vecmode == V4SFmode)
17071 emit_insn (gen_sse_movss (value, value, input));
17072 else
17073 emit_insn (gen_sse2_movsd (value, value, input));
17074 }
17075
17076 emit_move_insn (large, two31);
17077 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17078
17079 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17080 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17081
17082 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17083 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17084
17085 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17086 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17087
17088 large = gen_rtx_REG (V4SImode, REGNO (large));
17089 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
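/* After the shift, LARGE holds 0x80000000 in every lane whose input was
 >= 0x1p31; XORing it into the truncated result below re-adds the 2**31
 that was subtracted before the signed conversion. */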
17090
17091 x = gen_rtx_REG (V4SImode, REGNO (value));
17092 if (vecmode == V4SFmode)
17093 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17094 else
17095 emit_insn (gen_sse2_cvttpd2dq (x, value));
17096 value = x;
17097
17098 emit_insn (gen_xorv4si3 (value, value, large));
17099 }
17100
17101 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17102 Expects the 64-bit DImode to be supplied in a pair of integral
17103 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17104 -mfpmath=sse, !optimize_size only. */
17105
17106 void
17107 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17108 {
17109 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17110 rtx int_xmm, fp_xmm;
17111 rtx biases, exponents;
17112 rtx x;
17113
17114 int_xmm = gen_reg_rtx (V4SImode);
17115 if (TARGET_INTER_UNIT_MOVES)
17116 emit_insn (gen_movdi_to_sse (int_xmm, input));
17117 else if (TARGET_SSE_SPLIT_REGS)
17118 {
17119 emit_clobber (int_xmm);
17120 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17121 }
17122 else
17123 {
17124 x = gen_reg_rtx (V2DImode);
17125 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17126 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17127 }
17128
17129 x = gen_rtx_CONST_VECTOR (V4SImode,
17130 gen_rtvec (4, GEN_INT (0x43300000UL),
17131 GEN_INT (0x45300000UL),
17132 const0_rtx, const0_rtx));
17133 exponents = validize_mem (force_const_mem (V4SImode, x));
17134
17135 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17136 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17137
17138 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17139 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17140 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17141 (0x1.0p84 + double(fp_value_hi_xmm)).
17142 Note these exponents differ by 32. */
17143
17144 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17145
17146 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17147 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17148 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17149 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17150 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17151 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17152 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17153 biases = validize_mem (force_const_mem (V2DFmode, biases));
17154 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17155
17156 /* Add the upper and lower DFmode values together. */
17157 if (TARGET_SSE3)
17158 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17159 else
17160 {
17161 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17162 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17163 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17164 }
17165
17166 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17167 }
17168
17169 /* Not used, but eases macroization of patterns. */
17170 void
17171 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17172 rtx input ATTRIBUTE_UNUSED)
17173 {
17174 gcc_unreachable ();
17175 }
17176
17177 /* Convert an unsigned SImode value into a DFmode. Only currently used
17178 for SSE, but applicable anywhere. */
17179
17180 void
17181 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17182 {
17183 REAL_VALUE_TYPE TWO31r;
17184 rtx x, fp;
17185
17186 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17187 NULL, 1, OPTAB_DIRECT);
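/* Adding -2**31 maps the unsigned input range [0, 2**32) onto the signed
 range [-2**31, 2**31), which floatsidf2 converts exactly; adding 0x1.0p31
 back as a double below restores the original value. */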
17188
17189 fp = gen_reg_rtx (DFmode);
17190 emit_insn (gen_floatsidf2 (fp, x));
17191
17192 real_ldexp (&TWO31r, &dconst1, 31);
17193 x = const_double_from_real_value (TWO31r, DFmode);
17194
17195 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17196 if (x != target)
17197 emit_move_insn (target, x);
17198 }
17199
17200 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17201 32-bit mode; otherwise we have a direct convert instruction. */
17202
17203 void
17204 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17205 {
17206 REAL_VALUE_TYPE TWO32r;
17207 rtx fp_lo, fp_hi, x;
17208
17209 fp_lo = gen_reg_rtx (DFmode);
17210 fp_hi = gen_reg_rtx (DFmode);
17211
17212 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17213
17214 real_ldexp (&TWO32r, &dconst1, 32);
17215 x = const_double_from_real_value (TWO32r, DFmode);
17216 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17217
17218 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17219
17220 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17221 0, OPTAB_DIRECT);
17222 if (x != target)
17223 emit_move_insn (target, x);
17224 }
17225
17226 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17227 For x86_32, -mfpmath=sse, !optimize_size only. */
17228 void
17229 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17230 {
17231 REAL_VALUE_TYPE ONE16r;
17232 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17233
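/* Split INPUT into its low and high 16-bit halves; each half converts to
 SFmode exactly, and fp_hi * 2**16 + fp_lo reassembles the value (only the
 final addition can round). */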
17234 real_ldexp (&ONE16r, &dconst1, 16);
17235 x = const_double_from_real_value (ONE16r, SFmode);
17236 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17237 NULL, 0, OPTAB_DIRECT);
17238 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17239 NULL, 0, OPTAB_DIRECT);
17240 fp_hi = gen_reg_rtx (SFmode);
17241 fp_lo = gen_reg_rtx (SFmode);
17242 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17243 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17244 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17245 0, OPTAB_DIRECT);
17246 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17247 0, OPTAB_DIRECT);
17248 if (!rtx_equal_p (target, fp_hi))
17249 emit_move_insn (target, fp_hi);
17250 }
17251
17252 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17253 a vector of unsigned ints VAL to vector of floats TARGET. */
17254
17255 void
17256 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17257 {
17258 rtx tmp[8];
17259 REAL_VALUE_TYPE TWO16r;
17260 enum machine_mode intmode = GET_MODE (val);
17261 enum machine_mode fltmode = GET_MODE (target);
17262 rtx (*cvt) (rtx, rtx);
17263
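/* Same scheme as ix86_expand_convert_uns_sisf_sse above, applied per vector
 element: convert the low and high 16-bit halves separately and recombine
 them as hi * 2**16 + lo. */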
17264 if (intmode == V4SImode)
17265 cvt = gen_floatv4siv4sf2;
17266 else
17267 cvt = gen_floatv8siv8sf2;
17268 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17269 tmp[0] = force_reg (intmode, tmp[0]);
17270 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17271 OPTAB_DIRECT);
17272 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17273 NULL_RTX, 1, OPTAB_DIRECT);
17274 tmp[3] = gen_reg_rtx (fltmode);
17275 emit_insn (cvt (tmp[3], tmp[1]));
17276 tmp[4] = gen_reg_rtx (fltmode);
17277 emit_insn (cvt (tmp[4], tmp[2]));
17278 real_ldexp (&TWO16r, &dconst1, 16);
17279 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17280 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17281 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17282 OPTAB_DIRECT);
17283 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17284 OPTAB_DIRECT);
17285 if (tmp[7] != target)
17286 emit_move_insn (target, tmp[7]);
17287 }
17288
17289 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17290 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17291 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17292 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17293
17294 rtx
17295 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17296 {
17297 REAL_VALUE_TYPE TWO31r;
17298 rtx two31r, tmp[4];
17299 enum machine_mode mode = GET_MODE (val);
17300 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17301 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17302 rtx (*cmp) (rtx, rtx, rtx, rtx);
17303 int i;
17304
17305 for (i = 0; i < 3; i++)
17306 tmp[i] = gen_reg_rtx (mode);
17307 real_ldexp (&TWO31r, &dconst1, 31);
17308 two31r = const_double_from_real_value (TWO31r, scalarmode);
17309 two31r = ix86_build_const_vector (mode, 1, two31r);
17310 two31r = force_reg (mode, two31r);
17311 switch (mode)
17312 {
17313 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17314 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17315 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17316 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17317 default: gcc_unreachable ();
17318 }
17319 tmp[3] = gen_rtx_LE (mode, two31r, val);
17320 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
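/* TMP[0] is now an all-ones mask in every lane where VAL >= 0x1p31 and zero
 elsewhere. ANDing it with TWO31R gives the per-lane bias to subtract, and
 shifting (or masking) it below produces the 0x80000000 pattern to XOR back
 in after the signed conversion. */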
17321 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17322 0, OPTAB_DIRECT);
17323 if (intmode == V4SImode || TARGET_AVX2)
17324 *xorp = expand_simple_binop (intmode, ASHIFT,
17325 gen_lowpart (intmode, tmp[0]),
17326 GEN_INT (31), NULL_RTX, 0,
17327 OPTAB_DIRECT);
17328 else
17329 {
17330 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17331 two31 = ix86_build_const_vector (intmode, 1, two31);
17332 *xorp = expand_simple_binop (intmode, AND,
17333 gen_lowpart (intmode, tmp[0]),
17334 two31, NULL_RTX, 0,
17335 OPTAB_DIRECT);
17336 }
17337 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17338 0, OPTAB_DIRECT);
17339 }
17340
17341 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17342 then replicate the value for all elements of the vector
17343 register. */
17344
17345 rtx
17346 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17347 {
17348 int i, n_elt;
17349 rtvec v;
17350 enum machine_mode scalar_mode;
17351
17352 switch (mode)
17353 {
17354 case V32QImode:
17355 case V16QImode:
17356 case V16HImode:
17357 case V8HImode:
17358 case V8SImode:
17359 case V4SImode:
17360 case V4DImode:
17361 case V2DImode:
17362 gcc_assert (vect);
17363 case V8SFmode:
17364 case V4SFmode:
17365 case V4DFmode:
17366 case V2DFmode:
17367 n_elt = GET_MODE_NUNITS (mode);
17368 v = rtvec_alloc (n_elt);
17369 scalar_mode = GET_MODE_INNER (mode);
17370
17371 RTVEC_ELT (v, 0) = value;
17372
17373 for (i = 1; i < n_elt; ++i)
17374 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17375
17376 return gen_rtx_CONST_VECTOR (mode, v);
17377
17378 default:
17379 gcc_unreachable ();
17380 }
17381 }
17382
17383 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17384 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17385 for an SSE register. If VECT is true, then replicate the mask for
17386 all elements of the vector register. If INVERT is true, then create
17387 a mask excluding the sign bit. */
17388
17389 rtx
17390 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17391 {
17392 enum machine_mode vec_mode, imode;
17393 HOST_WIDE_INT hi, lo;
17394 int shift = 63;
17395 rtx v;
17396 rtx mask;
17397
17398 /* Find the sign bit, sign extended to 2*HWI. */
17399 switch (mode)
17400 {
17401 case V8SImode:
17402 case V4SImode:
17403 case V8SFmode:
17404 case V4SFmode:
17405 vec_mode = mode;
17406 mode = GET_MODE_INNER (mode);
17407 imode = SImode;
17408 lo = 0x80000000, hi = lo < 0;
17409 break;
17410
17411 case V4DImode:
17412 case V2DImode:
17413 case V4DFmode:
17414 case V2DFmode:
17415 vec_mode = mode;
17416 mode = GET_MODE_INNER (mode);
17417 imode = DImode;
17418 if (HOST_BITS_PER_WIDE_INT >= 64)
17419 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17420 else
17421 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17422 break;
17423
17424 case TImode:
17425 case TFmode:
17426 vec_mode = VOIDmode;
17427 if (HOST_BITS_PER_WIDE_INT >= 64)
17428 {
17429 imode = TImode;
17430 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17431 }
17432 else
17433 {
17434 rtvec vec;
17435
17436 imode = DImode;
17437 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17438
17439 if (invert)
17440 {
17441 lo = ~lo, hi = ~hi;
17442 v = constm1_rtx;
17443 }
17444 else
17445 v = const0_rtx;
17446
17447 mask = immed_double_const (lo, hi, imode);
17448
17449 vec = gen_rtvec (2, v, mask);
17450 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17451 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17452
17453 return v;
17454 }
17455 break;
17456
17457 default:
17458 gcc_unreachable ();
17459 }
17460
17461 if (invert)
17462 lo = ~lo, hi = ~hi;
17463
17464 /* Force this value into the low part of a fp vector constant. */
17465 mask = immed_double_const (lo, hi, imode);
17466 mask = gen_lowpart (mode, mask);
17467
17468 if (vec_mode == VOIDmode)
17469 return force_reg (mode, mask);
17470
17471 v = ix86_build_const_vector (vec_mode, vect, mask);
17472 return force_reg (vec_mode, v);
17473 }
17474
17475 /* Generate code for floating point ABS or NEG. */
17476
17477 void
17478 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17479 rtx operands[])
17480 {
17481 rtx mask, set, dst, src;
17482 bool use_sse = false;
17483 bool vector_mode = VECTOR_MODE_P (mode);
17484 enum machine_mode vmode = mode;
17485
17486 if (vector_mode)
17487 use_sse = true;
17488 else if (mode == TFmode)
17489 use_sse = true;
17490 else if (TARGET_SSE_MATH)
17491 {
17492 use_sse = SSE_FLOAT_MODE_P (mode);
17493 if (mode == SFmode)
17494 vmode = V4SFmode;
17495 else if (mode == DFmode)
17496 vmode = V2DFmode;
17497 }
17498
17499 /* NEG and ABS performed with SSE use bitwise mask operations.
17500 Create the appropriate mask now. */
17501 if (use_sse)
17502 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17503 else
17504 mask = NULL_RTX;
17505
17506 dst = operands[0];
17507 src = operands[1];
17508
17509 set = gen_rtx_fmt_e (code, mode, src);
17510 set = gen_rtx_SET (VOIDmode, dst, set);
17511
17512 if (mask)
17513 {
17514 rtx use, clob;
17515 rtvec par;
17516
17517 use = gen_rtx_USE (VOIDmode, mask);
17518 if (vector_mode)
17519 par = gen_rtvec (2, set, use);
17520 else
17521 {
17522 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17523 par = gen_rtvec (3, set, use, clob);
17524 }
17525 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17526 }
17527 else
17528 emit_insn (set);
17529 }
17530
17531 /* Expand a copysign operation. Special case operand 0 being a constant. */
17532
17533 void
17534 ix86_expand_copysign (rtx operands[])
17535 {
17536 enum machine_mode mode, vmode;
17537 rtx dest, op0, op1, mask, nmask;
17538
17539 dest = operands[0];
17540 op0 = operands[1];
17541 op1 = operands[2];
17542
17543 mode = GET_MODE (dest);
17544
17545 if (mode == SFmode)
17546 vmode = V4SFmode;
17547 else if (mode == DFmode)
17548 vmode = V2DFmode;
17549 else
17550 vmode = mode;
17551
17552 if (GET_CODE (op0) == CONST_DOUBLE)
17553 {
17554 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17555
17556 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17557 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17558
17559 if (mode == SFmode || mode == DFmode)
17560 {
17561 if (op0 == CONST0_RTX (mode))
17562 op0 = CONST0_RTX (vmode);
17563 else
17564 {
17565 rtx v = ix86_build_const_vector (vmode, false, op0);
17566
17567 op0 = force_reg (vmode, v);
17568 }
17569 }
17570 else if (op0 != CONST0_RTX (mode))
17571 op0 = force_reg (mode, op0);
17572
17573 mask = ix86_build_signbit_mask (vmode, 0, 0);
17574
17575 if (mode == SFmode)
17576 copysign_insn = gen_copysignsf3_const;
17577 else if (mode == DFmode)
17578 copysign_insn = gen_copysigndf3_const;
17579 else
17580 copysign_insn = gen_copysigntf3_const;
17581
17582 emit_insn (copysign_insn (dest, op0, op1, mask));
17583 }
17584 else
17585 {
17586 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17587
17588 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17589 mask = ix86_build_signbit_mask (vmode, 0, 0);
17590
17591 if (mode == SFmode)
17592 copysign_insn = gen_copysignsf3_var;
17593 else if (mode == DFmode)
17594 copysign_insn = gen_copysigndf3_var;
17595 else
17596 copysign_insn = gen_copysigntf3_var;
17597
17598 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17599 }
17600 }
17601
17602 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17603 be a constant, and so has already been expanded into a vector constant. */
17604
17605 void
17606 ix86_split_copysign_const (rtx operands[])
17607 {
17608 enum machine_mode mode, vmode;
17609 rtx dest, op0, mask, x;
17610
17611 dest = operands[0];
17612 op0 = operands[1];
17613 mask = operands[3];
17614
17615 mode = GET_MODE (dest);
17616 vmode = GET_MODE (mask);
17617
17618 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17619 x = gen_rtx_AND (vmode, dest, mask);
17620 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17621
17622 if (op0 != CONST0_RTX (vmode))
17623 {
17624 x = gen_rtx_IOR (vmode, dest, op0);
17625 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17626 }
17627 }
17628
17629 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17630 so we have to do two masks. */
17631
17632 void
17633 ix86_split_copysign_var (rtx operands[])
17634 {
17635 enum machine_mode mode, vmode;
17636 rtx dest, scratch, op0, op1, mask, nmask, x;
17637
17638 dest = operands[0];
17639 scratch = operands[1];
17640 op0 = operands[2];
17641 op1 = operands[3];
17642 nmask = operands[4];
17643 mask = operands[5];
17644
17645 mode = GET_MODE (dest);
17646 vmode = GET_MODE (mask);
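/* The result is (OP0 & NMASK) | (OP1 & MASK): NMASK keeps the magnitude of
 OP0 (all bits but the sign), MASK extracts the sign bit of OP1, and the
 alternatives below only differ in which hard register ends up holding each
 intermediate value. */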
17647
17648 if (rtx_equal_p (op0, op1))
17649 {
17650 /* Shouldn't happen often (it's useless, obviously), but when it does
17651 we'd generate incorrect code if we continue below. */
17652 emit_move_insn (dest, op0);
17653 return;
17654 }
17655
17656 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17657 {
17658 gcc_assert (REGNO (op1) == REGNO (scratch));
17659
17660 x = gen_rtx_AND (vmode, scratch, mask);
17661 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17662
17663 dest = mask;
17664 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17665 x = gen_rtx_NOT (vmode, dest);
17666 x = gen_rtx_AND (vmode, x, op0);
17667 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17668 }
17669 else
17670 {
17671 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17672 {
17673 x = gen_rtx_AND (vmode, scratch, mask);
17674 }
17675 else /* alternative 2,4 */
17676 {
17677 gcc_assert (REGNO (mask) == REGNO (scratch));
17678 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17679 x = gen_rtx_AND (vmode, scratch, op1);
17680 }
17681 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17682
17683 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17684 {
17685 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17686 x = gen_rtx_AND (vmode, dest, nmask);
17687 }
17688 else /* alternative 3,4 */
17689 {
17690 gcc_assert (REGNO (nmask) == REGNO (dest));
17691 dest = nmask;
17692 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17693 x = gen_rtx_AND (vmode, dest, op0);
17694 }
17695 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17696 }
17697
17698 x = gen_rtx_IOR (vmode, dest, scratch);
17699 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17700 }
17701
17702 /* Return TRUE or FALSE depending on whether the first SET in INSN
17703 has source and destination with matching CC modes, and that the
17704 CC mode is at least as constrained as REQ_MODE. */
17705
17706 bool
17707 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17708 {
17709 rtx set;
17710 enum machine_mode set_mode;
17711
17712 set = PATTERN (insn);
17713 if (GET_CODE (set) == PARALLEL)
17714 set = XVECEXP (set, 0, 0);
17715 gcc_assert (GET_CODE (set) == SET);
17716 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17717
17718 set_mode = GET_MODE (SET_DEST (set));
17719 switch (set_mode)
17720 {
17721 case CCNOmode:
17722 if (req_mode != CCNOmode
17723 && (req_mode != CCmode
17724 || XEXP (SET_SRC (set), 1) != const0_rtx))
17725 return false;
17726 break;
17727 case CCmode:
17728 if (req_mode == CCGCmode)
17729 return false;
17730 /* FALLTHRU */
17731 case CCGCmode:
17732 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17733 return false;
17734 /* FALLTHRU */
17735 case CCGOCmode:
17736 if (req_mode == CCZmode)
17737 return false;
17738 /* FALLTHRU */
17739 case CCZmode:
17740 break;
17741
17742 case CCAmode:
17743 case CCCmode:
17744 case CCOmode:
17745 case CCSmode:
17746 if (set_mode != req_mode)
17747 return false;
17748 break;
17749
17750 default:
17751 gcc_unreachable ();
17752 }
17753
17754 return GET_MODE (SET_SRC (set)) == set_mode;
17755 }
17756
17757 /* Generate insn patterns to do an integer compare of OPERANDS. */
17758
17759 static rtx
17760 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17761 {
17762 enum machine_mode cmpmode;
17763 rtx tmp, flags;
17764
17765 cmpmode = SELECT_CC_MODE (code, op0, op1);
17766 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17767
17768 /* This is very simple, but making the interface the same as in the
17769 FP case makes the rest of the code easier. */
17770 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17771 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17772
17773 /* Return the test that should be put into the flags user, i.e.
17774 the bcc, scc, or cmov instruction. */
17775 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17776 }
17777
17778 /* Figure out whether to use ordered or unordered fp comparisons.
17779 Return the appropriate mode to use. */
17780
17781 enum machine_mode
17782 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17783 {
17784 /* ??? In order to make all comparisons reversible, we do all comparisons
17785 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17786 all forms of trapping and nontrapping comparisons, we can make inequality
17787 comparisons trapping again, since it results in better code when using
17788 FCOM based compares. */
17789 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17790 }
17791
17792 enum machine_mode
17793 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17794 {
17795 enum machine_mode mode = GET_MODE (op0);
17796
17797 if (SCALAR_FLOAT_MODE_P (mode))
17798 {
17799 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17800 return ix86_fp_compare_mode (code);
17801 }
17802
17803 switch (code)
17804 {
17805 /* Only zero flag is needed. */
17806 case EQ: /* ZF=0 */
17807 case NE: /* ZF!=0 */
17808 return CCZmode;
17809 /* Codes needing carry flag. */
17810 case GEU: /* CF=0 */
17811 case LTU: /* CF=1 */
17812 /* Detect overflow checks. They need just the carry flag. */
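/* e.g. (GEU (plus a b) a) is true exactly when the addition does not
 wrap around, i.e. when the carry flag is clear. */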
17813 if (GET_CODE (op0) == PLUS
17814 && rtx_equal_p (op1, XEXP (op0, 0)))
17815 return CCCmode;
17816 else
17817 return CCmode;
17818 case GTU: /* CF=0 & ZF=0 */
17819 case LEU: /* CF=1 | ZF=1 */
17820 /* Detect overflow checks. They need just the carry flag. */
17821 if (GET_CODE (op0) == MINUS
17822 && rtx_equal_p (op1, XEXP (op0, 0)))
17823 return CCCmode;
17824 else
17825 return CCmode;
17826 /* Codes possibly doable only with sign flag when
17827 comparing against zero. */
17828 case GE: /* SF=OF or SF=0 */
17829 case LT: /* SF<>OF or SF=1 */
17830 if (op1 == const0_rtx)
17831 return CCGOCmode;
17832 else
17833 /* For other cases the Carry flag is not required. */
17834 return CCGCmode;
17835 /* Codes doable only with the sign flag when comparing
17836 against zero, but we lack a jump instruction for them,
17837 so we need to use relational tests against the overflow
17838 flag, which therefore needs to be zero. */
17839 case GT: /* ZF=0 & SF=OF */
17840 case LE: /* ZF=1 | SF<>OF */
17841 if (op1 == const0_rtx)
17842 return CCNOmode;
17843 else
17844 return CCGCmode;
17845 /* The strcmp pattern does (use flags), and combine may ask us for the
17846 proper mode. */
17847 case USE:
17848 return CCmode;
17849 default:
17850 gcc_unreachable ();
17851 }
17852 }
17853
17854 /* Return the fixed registers used for condition codes. */
17855
17856 static bool
17857 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17858 {
17859 *p1 = FLAGS_REG;
17860 *p2 = FPSR_REG;
17861 return true;
17862 }
17863
17864 /* If two condition code modes are compatible, return a condition code
17865 mode which is compatible with both. Otherwise, return
17866 VOIDmode. */
17867
17868 static enum machine_mode
17869 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17870 {
17871 if (m1 == m2)
17872 return m1;
17873
17874 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17875 return VOIDmode;
17876
17877 if ((m1 == CCGCmode && m2 == CCGOCmode)
17878 || (m1 == CCGOCmode && m2 == CCGCmode))
17879 return CCGCmode;
17880
17881 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17882 return m2;
17883 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17884 return m1;
17885
17886 switch (m1)
17887 {
17888 default:
17889 gcc_unreachable ();
17890
17891 case CCmode:
17892 case CCGCmode:
17893 case CCGOCmode:
17894 case CCNOmode:
17895 case CCAmode:
17896 case CCCmode:
17897 case CCOmode:
17898 case CCSmode:
17899 case CCZmode:
17900 switch (m2)
17901 {
17902 default:
17903 return VOIDmode;
17904
17905 case CCmode:
17906 case CCGCmode:
17907 case CCGOCmode:
17908 case CCNOmode:
17909 case CCAmode:
17910 case CCCmode:
17911 case CCOmode:
17912 case CCSmode:
17913 case CCZmode:
17914 return CCmode;
17915 }
17916
17917 case CCFPmode:
17918 case CCFPUmode:
17919 /* These are only compatible with themselves, which we already
17920 checked above. */
17921 return VOIDmode;
17922 }
17923 }
17924
17925
17926 /* Return a comparison we can do that is equivalent to
17927 swap_condition (code), apart possibly from orderedness.
17928 Never change orderedness if TARGET_IEEE_FP, returning
17929 UNKNOWN in that case if necessary. */
17930
17931 static enum rtx_code
17932 ix86_fp_swap_condition (enum rtx_code code)
17933 {
17934 switch (code)
17935 {
17936 case GT: /* GTU - CF=0 & ZF=0 */
17937 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17938 case GE: /* GEU - CF=0 */
17939 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17940 case UNLT: /* LTU - CF=1 */
17941 return TARGET_IEEE_FP ? UNKNOWN : GT;
17942 case UNLE: /* LEU - CF=1 | ZF=1 */
17943 return TARGET_IEEE_FP ? UNKNOWN : GE;
17944 default:
17945 return swap_condition (code);
17946 }
17947 }
17948
17949 /* Return the cost of comparison CODE using the best strategy for performance.
17950 All of the following functions use the number of instructions as the cost metric.
17951 In the future this should be tweaked to compute bytes for optimize_size and
17952 to take into account the performance of various instructions on various CPUs. */
17953
17954 static int
17955 ix86_fp_comparison_cost (enum rtx_code code)
17956 {
17957 int arith_cost;
17958
17959 /* The cost of code using bit-twiddling on %ah. */
17960 switch (code)
17961 {
17962 case UNLE:
17963 case UNLT:
17964 case LTGT:
17965 case GT:
17966 case GE:
17967 case UNORDERED:
17968 case ORDERED:
17969 case UNEQ:
17970 arith_cost = 4;
17971 break;
17972 case LT:
17973 case NE:
17974 case EQ:
17975 case UNGE:
17976 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17977 break;
17978 case LE:
17979 case UNGT:
17980 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17981 break;
17982 default:
17983 gcc_unreachable ();
17984 }
17985
17986 switch (ix86_fp_comparison_strategy (code))
17987 {
17988 case IX86_FPCMP_COMI:
17989 return arith_cost > 4 ? 3 : 2;
17990 case IX86_FPCMP_SAHF:
17991 return arith_cost > 4 ? 4 : 3;
17992 default:
17993 return arith_cost;
17994 }
17995 }
17996
17997 /* Return the strategy to use for floating-point comparisons. We assume that fcomi is
17998 always preferable where available, since that is also true when looking at size
17999 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18000
18001 enum ix86_fpcmp_strategy
18002 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18003 {
18004 /* Do fcomi/sahf based test when profitable. */
18005
18006 if (TARGET_CMOVE)
18007 return IX86_FPCMP_COMI;
18008
18009 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18010 return IX86_FPCMP_SAHF;
18011
18012 return IX86_FPCMP_ARITH;
18013 }
18014
18015 /* Swap, force into registers, or otherwise massage the two operands
18016 to a fp comparison. The operands are updated in place; the new
18017 comparison code is returned. */
18018
18019 static enum rtx_code
18020 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18021 {
18022 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18023 rtx op0 = *pop0, op1 = *pop1;
18024 enum machine_mode op_mode = GET_MODE (op0);
18025 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18026
18027 /* All of the unordered compare instructions only work on registers.
18028 The same is true of the fcomi compare instructions. The XFmode
18029 compare instructions require registers except when comparing
18030 against zero or when converting operand 1 from fixed point to
18031 floating point. */
18032
18033 if (!is_sse
18034 && (fpcmp_mode == CCFPUmode
18035 || (op_mode == XFmode
18036 && ! (standard_80387_constant_p (op0) == 1
18037 || standard_80387_constant_p (op1) == 1)
18038 && GET_CODE (op1) != FLOAT)
18039 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18040 {
18041 op0 = force_reg (op_mode, op0);
18042 op1 = force_reg (op_mode, op1);
18043 }
18044 else
18045 {
18046 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18047 things around if they appear profitable, otherwise force op0
18048 into a register. */
18049
18050 if (standard_80387_constant_p (op0) == 0
18051 || (MEM_P (op0)
18052 && ! (standard_80387_constant_p (op1) == 0
18053 || MEM_P (op1))))
18054 {
18055 enum rtx_code new_code = ix86_fp_swap_condition (code);
18056 if (new_code != UNKNOWN)
18057 {
18058 rtx tmp;
18059 tmp = op0, op0 = op1, op1 = tmp;
18060 code = new_code;
18061 }
18062 }
18063
18064 if (!REG_P (op0))
18065 op0 = force_reg (op_mode, op0);
18066
18067 if (CONSTANT_P (op1))
18068 {
18069 int tmp = standard_80387_constant_p (op1);
18070 if (tmp == 0)
18071 op1 = validize_mem (force_const_mem (op_mode, op1));
18072 else if (tmp == 1)
18073 {
18074 if (TARGET_CMOVE)
18075 op1 = force_reg (op_mode, op1);
18076 }
18077 else
18078 op1 = force_reg (op_mode, op1);
18079 }
18080 }
18081
18082 /* Try to rearrange the comparison to make it cheaper. */
18083 if (ix86_fp_comparison_cost (code)
18084 > ix86_fp_comparison_cost (swap_condition (code))
18085 && (REG_P (op1) || can_create_pseudo_p ()))
18086 {
18087 rtx tmp;
18088 tmp = op0, op0 = op1, op1 = tmp;
18089 code = swap_condition (code);
18090 if (!REG_P (op0))
18091 op0 = force_reg (op_mode, op0);
18092 }
18093
18094 *pop0 = op0;
18095 *pop1 = op1;
18096 return code;
18097 }
18098
18099 /* Convert comparison codes we use to represent FP comparison to integer
18100 code that will result in proper branch. Return UNKNOWN if no such code
18101 is available. */
18102
18103 enum rtx_code
18104 ix86_fp_compare_code_to_integer (enum rtx_code code)
18105 {
18106 switch (code)
18107 {
18108 case GT:
18109 return GTU;
18110 case GE:
18111 return GEU;
18112 case ORDERED:
18113 case UNORDERED:
18114 return code;
18115 break;
18116 case UNEQ:
18117 return EQ;
18118 break;
18119 case UNLT:
18120 return LTU;
18121 break;
18122 case UNLE:
18123 return LEU;
18124 break;
18125 case LTGT:
18126 return NE;
18127 break;
18128 default:
18129 return UNKNOWN;
18130 }
18131 }
18132
18133 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18134
18135 static rtx
18136 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18137 {
18138 enum machine_mode fpcmp_mode, intcmp_mode;
18139 rtx tmp, tmp2;
18140
18141 fpcmp_mode = ix86_fp_compare_mode (code);
18142 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18143
18144 /* Do fcomi/sahf based test when profitable. */
18145 switch (ix86_fp_comparison_strategy (code))
18146 {
18147 case IX86_FPCMP_COMI:
18148 intcmp_mode = fpcmp_mode;
18149 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18150 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18151 tmp);
18152 emit_insn (tmp);
18153 break;
18154
18155 case IX86_FPCMP_SAHF:
18156 intcmp_mode = fpcmp_mode;
18157 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18158 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18159 tmp);
18160
18161 if (!scratch)
18162 scratch = gen_reg_rtx (HImode);
18163 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18164 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18165 break;
18166
18167 case IX86_FPCMP_ARITH:
18168 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18169 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18170 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18171 if (!scratch)
18172 scratch = gen_reg_rtx (HImode);
18173 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18174
18175 /* In the unordered case, we have to check C2 for NaN's, which
18176 doesn't happen to work out to anything nice combination-wise.
18177 So do some bit twiddling on the value we've got in AH to come
18178 up with an appropriate set of condition codes. */
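/* After fnstsw the FPU condition codes sit in the high byte of the
   scratch: C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40),
   so e.g. the 0x45 mask tests C3|C2|C0.  (These are the bits that sahf
   would map to ZF, PF and CF respectively.)  */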
18179
18180 intcmp_mode = CCNOmode;
18181 switch (code)
18182 {
18183 case GT:
18184 case UNGT:
18185 if (code == GT || !TARGET_IEEE_FP)
18186 {
18187 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18188 code = EQ;
18189 }
18190 else
18191 {
18192 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18193 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18194 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18195 intcmp_mode = CCmode;
18196 code = GEU;
18197 }
18198 break;
18199 case LT:
18200 case UNLT:
18201 if (code == LT && TARGET_IEEE_FP)
18202 {
18203 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18204 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18205 intcmp_mode = CCmode;
18206 code = EQ;
18207 }
18208 else
18209 {
18210 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18211 code = NE;
18212 }
18213 break;
18214 case GE:
18215 case UNGE:
18216 if (code == GE || !TARGET_IEEE_FP)
18217 {
18218 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18219 code = EQ;
18220 }
18221 else
18222 {
18223 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18224 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18225 code = NE;
18226 }
18227 break;
18228 case LE:
18229 case UNLE:
18230 if (code == LE && TARGET_IEEE_FP)
18231 {
18232 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18233 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18234 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18235 intcmp_mode = CCmode;
18236 code = LTU;
18237 }
18238 else
18239 {
18240 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18241 code = NE;
18242 }
18243 break;
18244 case EQ:
18245 case UNEQ:
18246 if (code == EQ && TARGET_IEEE_FP)
18247 {
18248 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18249 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18250 intcmp_mode = CCmode;
18251 code = EQ;
18252 }
18253 else
18254 {
18255 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18256 code = NE;
18257 }
18258 break;
18259 case NE:
18260 case LTGT:
18261 if (code == NE && TARGET_IEEE_FP)
18262 {
18263 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18264 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18265 GEN_INT (0x40)));
18266 code = NE;
18267 }
18268 else
18269 {
18270 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18271 code = EQ;
18272 }
18273 break;
18274
18275 case UNORDERED:
18276 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18277 code = NE;
18278 break;
18279 case ORDERED:
18280 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18281 code = EQ;
18282 break;
18283
18284 default:
18285 gcc_unreachable ();
18286 }
18287 break;
18288
18289 default:
18290 gcc_unreachable ();
18291 }
18292
18293 /* Return the test that should be put into the flags user, i.e.
18294 the bcc, scc, or cmov instruction. */
18295 return gen_rtx_fmt_ee (code, VOIDmode,
18296 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18297 const0_rtx);
18298 }
18299
18300 static rtx
18301 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18302 {
18303 rtx ret;
18304
18305 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18306 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18307
18308 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18309 {
18310 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18311 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18312 }
18313 else
18314 ret = ix86_expand_int_compare (code, op0, op1);
18315
18316 return ret;
18317 }
18318
18319 void
18320 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18321 {
18322 enum machine_mode mode = GET_MODE (op0);
18323 rtx tmp;
18324
18325 switch (mode)
18326 {
18327 case SFmode:
18328 case DFmode:
18329 case XFmode:
18330 case QImode:
18331 case HImode:
18332 case SImode:
18333 simple:
18334 tmp = ix86_expand_compare (code, op0, op1);
18335 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18336 gen_rtx_LABEL_REF (VOIDmode, label),
18337 pc_rtx);
18338 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18339 return;
18340
18341 case DImode:
18342 if (TARGET_64BIT)
18343 goto simple;
18344 case TImode:
18345 /* Expand DImode branch into multiple compare+branch. */
18346 {
18347 rtx lo[2], hi[2], label2;
18348 enum rtx_code code1, code2, code3;
18349 enum machine_mode submode;
18350
18351 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18352 {
18353 tmp = op0, op0 = op1, op1 = tmp;
18354 code = swap_condition (code);
18355 }
18356
18357 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18358 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18359
18360 submode = mode == DImode ? SImode : DImode;
18361
18362 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18363 avoid two branches. This costs one extra insn, so disable when
18364 optimizing for size. */
18365
18366 if ((code == EQ || code == NE)
18367 && (!optimize_insn_for_size_p ()
18368 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18369 {
18370 rtx xor0, xor1;
18371
18372 xor1 = hi[0];
18373 if (hi[1] != const0_rtx)
18374 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18375 NULL_RTX, 0, OPTAB_WIDEN);
18376
18377 xor0 = lo[0];
18378 if (lo[1] != const0_rtx)
18379 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18380 NULL_RTX, 0, OPTAB_WIDEN);
18381
18382 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18383 NULL_RTX, 0, OPTAB_WIDEN);
18384
18385 ix86_expand_branch (code, tmp, const0_rtx, label);
18386 return;
18387 }
18388
18389 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18390 op1 is a constant, and its low word is zero, then we can just
18391 examine the high word. Similarly for a low word of -1 and
18392 less-or-equal-than or greater-than. */
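/* For example, the DImode test "a < 0x500000000" has a zero low word and
   holds exactly when hi(a) < 5, so a single word-sized compare and branch
   is enough.  */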
18393
18394 if (CONST_INT_P (hi[1]))
18395 switch (code)
18396 {
18397 case LT: case LTU: case GE: case GEU:
18398 if (lo[1] == const0_rtx)
18399 {
18400 ix86_expand_branch (code, hi[0], hi[1], label);
18401 return;
18402 }
18403 break;
18404 case LE: case LEU: case GT: case GTU:
18405 if (lo[1] == constm1_rtx)
18406 {
18407 ix86_expand_branch (code, hi[0], hi[1], label);
18408 return;
18409 }
18410 break;
18411 default:
18412 break;
18413 }
18414
18415 /* Otherwise, we need two or three jumps. */
18416
18417 label2 = gen_label_rtx ();
18418
18419 code1 = code;
18420 code2 = swap_condition (code);
18421 code3 = unsigned_condition (code);
18422
18423 switch (code)
18424 {
18425 case LT: case GT: case LTU: case GTU:
18426 break;
18427
18428 case LE: code1 = LT; code2 = GT; break;
18429 case GE: code1 = GT; code2 = LT; break;
18430 case LEU: code1 = LTU; code2 = GTU; break;
18431 case GEU: code1 = GTU; code2 = LTU; break;
18432
18433 case EQ: code1 = UNKNOWN; code2 = NE; break;
18434 case NE: code2 = UNKNOWN; break;
18435
18436 default:
18437 gcc_unreachable ();
18438 }
18439
18440 /*
18441 * a < b =>
18442 * if (hi(a) < hi(b)) goto true;
18443 * if (hi(a) > hi(b)) goto false;
18444 * if (lo(a) < lo(b)) goto true;
18445 * false:
18446 */
18447
18448 if (code1 != UNKNOWN)
18449 ix86_expand_branch (code1, hi[0], hi[1], label);
18450 if (code2 != UNKNOWN)
18451 ix86_expand_branch (code2, hi[0], hi[1], label2);
18452
18453 ix86_expand_branch (code3, lo[0], lo[1], label);
18454
18455 if (code2 != UNKNOWN)
18456 emit_label (label2);
18457 return;
18458 }
18459
18460 default:
18461 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18462 goto simple;
18463 }
18464 }
18465
18466 /* Split branch based on floating point condition. */
18467 void
18468 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18469 rtx target1, rtx target2, rtx tmp, rtx pushed)
18470 {
18471 rtx condition;
18472 rtx i;
18473
18474 if (target2 != pc_rtx)
18475 {
18476 rtx tmp = target2;
18477 code = reverse_condition_maybe_unordered (code);
18478 target2 = target1;
18479 target1 = tmp;
18480 }
18481
18482 condition = ix86_expand_fp_compare (code, op1, op2,
18483 tmp);
18484
18485 /* Remove pushed operand from stack. */
18486 if (pushed)
18487 ix86_free_from_memory (GET_MODE (pushed));
18488
18489 i = emit_jump_insn (gen_rtx_SET
18490 (VOIDmode, pc_rtx,
18491 gen_rtx_IF_THEN_ELSE (VOIDmode,
18492 condition, target1, target2)));
18493 if (split_branch_probability >= 0)
18494 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18495 }
18496
18497 void
18498 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18499 {
18500 rtx ret;
18501
18502 gcc_assert (GET_MODE (dest) == QImode);
18503
18504 ret = ix86_expand_compare (code, op0, op1);
18505 PUT_MODE (ret, QImode);
18506 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18507 }
18508
18509 /* Expand comparison setting or clearing carry flag. Return true when
18510 successful and set pop for the operation. */
18511 static bool
18512 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18513 {
18514 enum machine_mode mode =
18515 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18516
18517 /* Do not handle double-mode compares that go through special path. */
18518 if (mode == (TARGET_64BIT ? TImode : DImode))
18519 return false;
18520
18521 if (SCALAR_FLOAT_MODE_P (mode))
18522 {
18523 rtx compare_op, compare_seq;
18524
18525 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18526
18527 /* Shortcut: the following common codes never translate
18528 into carry flag compares. */
18529 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18530 || code == ORDERED || code == UNORDERED)
18531 return false;
18532
18533 /* These comparisons require the zero flag; swap operands so they don't. */
18534 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18535 && !TARGET_IEEE_FP)
18536 {
18537 rtx tmp = op0;
18538 op0 = op1;
18539 op1 = tmp;
18540 code = swap_condition (code);
18541 }
18542
18543 /* Try to expand the comparison and verify that we end up with
18544 a carry flag based comparison. This fails to be true only when
18545 we decide to expand the comparison using arithmetic, which is
18546 not a common scenario. */
18547 start_sequence ();
18548 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18549 compare_seq = get_insns ();
18550 end_sequence ();
18551
18552 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18553 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18554 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18555 else
18556 code = GET_CODE (compare_op);
18557
18558 if (code != LTU && code != GEU)
18559 return false;
18560
18561 emit_insn (compare_seq);
18562 *pop = compare_op;
18563 return true;
18564 }
18565
18566 if (!INTEGRAL_MODE_P (mode))
18567 return false;
18568
18569 switch (code)
18570 {
18571 case LTU:
18572 case GEU:
18573 break;
18574
18575 /* Convert a==0 into (unsigned)a<1. */
18576 case EQ:
18577 case NE:
18578 if (op1 != const0_rtx)
18579 return false;
18580 op1 = const1_rtx;
18581 code = (code == EQ ? LTU : GEU);
18582 break;
18583
18584 /* Convert a>b into b<a or a>=b+1. */
18585 case GTU:
18586 case LEU:
18587 if (CONST_INT_P (op1))
18588 {
18589 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18590 /* Bail out on overflow. We could still swap the operands, but that
18591 would force loading of the constant into a register. */
18592 if (op1 == const0_rtx
18593 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18594 return false;
18595 code = (code == GTU ? GEU : LTU);
18596 }
18597 else
18598 {
18599 rtx tmp = op1;
18600 op1 = op0;
18601 op0 = tmp;
18602 code = (code == GTU ? LTU : GEU);
18603 }
18604 break;
18605
18606 /* Convert a>=0 into (unsigned)a<0x80000000. */
18607 case LT:
18608 case GE:
18609 if (mode == DImode || op1 != const0_rtx)
18610 return false;
18611 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18612 code = (code == LT ? GEU : LTU);
18613 break;
18614 case LE:
18615 case GT:
18616 if (mode == DImode || op1 != constm1_rtx)
18617 return false;
18618 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18619 code = (code == LE ? GEU : LTU);
18620 break;
18621
18622 default:
18623 return false;
18624 }
18625 /* Swapping operands may cause the constant to appear as the first operand. */
18626 if (!nonimmediate_operand (op0, VOIDmode))
18627 {
18628 if (!can_create_pseudo_p ())
18629 return false;
18630 op0 = force_reg (mode, op0);
18631 }
18632 *pop = ix86_expand_compare (code, op0, op1);
18633 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18634 return true;
18635 }
18636
18637 bool
18638 ix86_expand_int_movcc (rtx operands[])
18639 {
18640 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18641 rtx compare_seq, compare_op;
18642 enum machine_mode mode = GET_MODE (operands[0]);
18643 bool sign_bit_compare_p = false;
18644 rtx op0 = XEXP (operands[1], 0);
18645 rtx op1 = XEXP (operands[1], 1);
18646
18647 start_sequence ();
18648 compare_op = ix86_expand_compare (code, op0, op1);
18649 compare_seq = get_insns ();
18650 end_sequence ();
18651
18652 compare_code = GET_CODE (compare_op);
18653
18654 if ((op1 == const0_rtx && (code == GE || code == LT))
18655 || (op1 == constm1_rtx && (code == GT || code == LE)))
18656 sign_bit_compare_p = true;
18657
18658 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18659 HImode insns, we'd be swallowed in word prefix ops. */
18660
18661 if ((mode != HImode || TARGET_FAST_PREFIX)
18662 && (mode != (TARGET_64BIT ? TImode : DImode))
18663 && CONST_INT_P (operands[2])
18664 && CONST_INT_P (operands[3]))
18665 {
18666 rtx out = operands[0];
18667 HOST_WIDE_INT ct = INTVAL (operands[2]);
18668 HOST_WIDE_INT cf = INTVAL (operands[3]);
18669 HOST_WIDE_INT diff;
18670
18671 diff = ct - cf;
18672 /* Sign bit compares are better done using shifts than by using
18673 sbb. */
18674 if (sign_bit_compare_p
18675 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18676 {
18677 /* Detect overlap between destination and compare sources. */
18678 rtx tmp = out;
18679
18680 if (!sign_bit_compare_p)
18681 {
18682 rtx flags;
18683 bool fpcmp = false;
18684
18685 compare_code = GET_CODE (compare_op);
18686
18687 flags = XEXP (compare_op, 0);
18688
18689 if (GET_MODE (flags) == CCFPmode
18690 || GET_MODE (flags) == CCFPUmode)
18691 {
18692 fpcmp = true;
18693 compare_code
18694 = ix86_fp_compare_code_to_integer (compare_code);
18695 }
18696
18697 /* To simplify the rest of the code, restrict to the GEU case. */
18698 if (compare_code == LTU)
18699 {
18700 HOST_WIDE_INT tmp = ct;
18701 ct = cf;
18702 cf = tmp;
18703 compare_code = reverse_condition (compare_code);
18704 code = reverse_condition (code);
18705 }
18706 else
18707 {
18708 if (fpcmp)
18709 PUT_CODE (compare_op,
18710 reverse_condition_maybe_unordered
18711 (GET_CODE (compare_op)));
18712 else
18713 PUT_CODE (compare_op,
18714 reverse_condition (GET_CODE (compare_op)));
18715 }
18716 diff = ct - cf;
18717
18718 if (reg_overlap_mentioned_p (out, op0)
18719 || reg_overlap_mentioned_p (out, op1))
18720 tmp = gen_reg_rtx (mode);
18721
18722 if (mode == DImode)
18723 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18724 else
18725 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18726 flags, compare_op));
18727 }
18728 else
18729 {
18730 if (code == GT || code == GE)
18731 code = reverse_condition (code);
18732 else
18733 {
18734 HOST_WIDE_INT tmp = ct;
18735 ct = cf;
18736 cf = tmp;
18737 diff = ct - cf;
18738 }
18739 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18740 }
18741
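/* At this point TMP is -1 when the CF value has to be selected and 0
   when the CT value has to be selected, so the constants below can be
   formed from that mask with plain add/or/not/and operations.  */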
18742 if (diff == 1)
18743 {
18744 /*
18745 * cmpl op0,op1
18746 * sbbl dest,dest
18747 * [addl dest, ct]
18748 *
18749 * Size 5 - 8.
18750 */
18751 if (ct)
18752 tmp = expand_simple_binop (mode, PLUS,
18753 tmp, GEN_INT (ct),
18754 copy_rtx (tmp), 1, OPTAB_DIRECT);
18755 }
18756 else if (cf == -1)
18757 {
18758 /*
18759 * cmpl op0,op1
18760 * sbbl dest,dest
18761 * orl $ct, dest
18762 *
18763 * Size 8.
18764 */
18765 tmp = expand_simple_binop (mode, IOR,
18766 tmp, GEN_INT (ct),
18767 copy_rtx (tmp), 1, OPTAB_DIRECT);
18768 }
18769 else if (diff == -1 && ct)
18770 {
18771 /*
18772 * cmpl op0,op1
18773 * sbbl dest,dest
18774 * notl dest
18775 * [addl dest, cf]
18776 *
18777 * Size 8 - 11.
18778 */
18779 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18780 if (cf)
18781 tmp = expand_simple_binop (mode, PLUS,
18782 copy_rtx (tmp), GEN_INT (cf),
18783 copy_rtx (tmp), 1, OPTAB_DIRECT);
18784 }
18785 else
18786 {
18787 /*
18788 * cmpl op0,op1
18789 * sbbl dest,dest
18790 * [notl dest]
18791 * andl cf - ct, dest
18792 * [addl dest, ct]
18793 *
18794 * Size 8 - 11.
18795 */
18796
18797 if (cf == 0)
18798 {
18799 cf = ct;
18800 ct = 0;
18801 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18802 }
18803
18804 tmp = expand_simple_binop (mode, AND,
18805 copy_rtx (tmp),
18806 gen_int_mode (cf - ct, mode),
18807 copy_rtx (tmp), 1, OPTAB_DIRECT);
18808 if (ct)
18809 tmp = expand_simple_binop (mode, PLUS,
18810 copy_rtx (tmp), GEN_INT (ct),
18811 copy_rtx (tmp), 1, OPTAB_DIRECT);
18812 }
18813
18814 if (!rtx_equal_p (tmp, out))
18815 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18816
18817 return true;
18818 }
18819
18820 if (diff < 0)
18821 {
18822 enum machine_mode cmp_mode = GET_MODE (op0);
18823
18824 HOST_WIDE_INT tmp;
18825 tmp = ct, ct = cf, cf = tmp;
18826 diff = -diff;
18827
18828 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18829 {
18830 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18831
18832 /* We may be reversing an unordered compare to a normal compare, which
18833 is not valid in general (we may convert a non-trapping condition
18834 into a trapping one); however, on i386 we currently emit all
18835 comparisons unordered. */
18836 compare_code = reverse_condition_maybe_unordered (compare_code);
18837 code = reverse_condition_maybe_unordered (code);
18838 }
18839 else
18840 {
18841 compare_code = reverse_condition (compare_code);
18842 code = reverse_condition (code);
18843 }
18844 }
18845
18846 compare_code = UNKNOWN;
18847 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18848 && CONST_INT_P (op1))
18849 {
18850 if (op1 == const0_rtx
18851 && (code == LT || code == GE))
18852 compare_code = code;
18853 else if (op1 == constm1_rtx)
18854 {
18855 if (code == LE)
18856 compare_code = LT;
18857 else if (code == GT)
18858 compare_code = GE;
18859 }
18860 }
18861
18862 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18863 if (compare_code != UNKNOWN
18864 && GET_MODE (op0) == GET_MODE (out)
18865 && (cf == -1 || ct == -1))
18866 {
18867 /* If the lea code below could be used, only optimize
18868 if it results in a 2-insn sequence. */
18869
18870 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18871 || diff == 3 || diff == 5 || diff == 9)
18872 || (compare_code == LT && ct == -1)
18873 || (compare_code == GE && cf == -1))
18874 {
18875 /*
18876 * notl op1 (if necessary)
18877 * sarl $31, op1
18878 * orl cf, op1
18879 */
18880 if (ct != -1)
18881 {
18882 cf = ct;
18883 ct = -1;
18884 code = reverse_condition (code);
18885 }
18886
18887 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18888
18889 out = expand_simple_binop (mode, IOR,
18890 out, GEN_INT (cf),
18891 out, 1, OPTAB_DIRECT);
18892 if (out != operands[0])
18893 emit_move_insn (operands[0], out);
18894
18895 return true;
18896 }
18897 }
18898
18899
18900 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18901 || diff == 3 || diff == 5 || diff == 9)
18902 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18903 && (mode != DImode
18904 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18905 {
18906 /*
18907 * xorl dest,dest
18908 * cmpl op1,op2
18909 * setcc dest
18910 * lea cf(dest*(ct-cf)),dest
18911 *
18912 * Size 14.
18913 *
18914 * This also catches the degenerate setcc-only case.
18915 */
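/* OUT below is 0 or 1, so the lea computes cf + out * diff, which is
   cf when the condition is false and cf + (ct - cf) == ct when it is
   true.  A diff of 3, 5 or 9 still fits a single lea, using the
   base + index*2/4/8 addressing forms.  */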
18916
18917 rtx tmp;
18918 int nops;
18919
18920 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18921
18922 nops = 0;
18923 /* On x86_64 the lea instruction operates on Pmode, so we need
18924 to get the arithmetic done in the proper mode to match. */
18925 if (diff == 1)
18926 tmp = copy_rtx (out);
18927 else
18928 {
18929 rtx out1;
18930 out1 = copy_rtx (out);
18931 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18932 nops++;
18933 if (diff & 1)
18934 {
18935 tmp = gen_rtx_PLUS (mode, tmp, out1);
18936 nops++;
18937 }
18938 }
18939 if (cf != 0)
18940 {
18941 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18942 nops++;
18943 }
18944 if (!rtx_equal_p (tmp, out))
18945 {
18946 if (nops == 1)
18947 out = force_operand (tmp, copy_rtx (out));
18948 else
18949 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18950 }
18951 if (!rtx_equal_p (out, operands[0]))
18952 emit_move_insn (operands[0], copy_rtx (out));
18953
18954 return true;
18955 }
18956
18957 /*
18958 * General case: Jumpful:
18959 * xorl dest,dest cmpl op1, op2
18960 * cmpl op1, op2 movl ct, dest
18961 * setcc dest jcc 1f
18962 * decl dest movl cf, dest
18963 * andl (cf-ct),dest 1:
18964 * addl ct,dest
18965 *
18966 * Size 20. Size 14.
18967 *
18968 * This is reasonably steep, but branch mispredict costs are
18969 * high on modern cpus, so consider failing only if optimizing
18970 * for space.
18971 */
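/* In C terms the branchless sequence computes
     dest = (((cond ? 1 : 0) - 1) & (cf - ct)) + ct,
   i.e. ct when the condition holds and cf when it does not.  */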
18972
18973 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18974 && BRANCH_COST (optimize_insn_for_speed_p (),
18975 false) >= 2)
18976 {
18977 if (cf == 0)
18978 {
18979 enum machine_mode cmp_mode = GET_MODE (op0);
18980
18981 cf = ct;
18982 ct = 0;
18983
18984 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18985 {
18986 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18987
18988 /* We may be reversing an unordered compare to a normal compare,
18989 which is not valid in general (we may convert a non-trapping
18990 condition into a trapping one); however, on i386 we currently
18991 emit all comparisons unordered. */
18992 code = reverse_condition_maybe_unordered (code);
18993 }
18994 else
18995 {
18996 code = reverse_condition (code);
18997 if (compare_code != UNKNOWN)
18998 compare_code = reverse_condition (compare_code);
18999 }
19000 }
19001
19002 if (compare_code != UNKNOWN)
19003 {
19004 /* notl op1 (if needed)
19005 sarl $31, op1
19006 andl (cf-ct), op1
19007 addl ct, op1
19008
19009 For x < 0 (resp. x <= -1) there will be no notl,
19010 so if possible swap the constants to get rid of the
19011 complement.
19012 True/false will be -1/0 while code below (store flag
19013 followed by decrement) is 0/-1, so the constants need
19014 to be exchanged once more. */
19015
19016 if (compare_code == GE || !cf)
19017 {
19018 code = reverse_condition (code);
19019 compare_code = LT;
19020 }
19021 else
19022 {
19023 HOST_WIDE_INT tmp = cf;
19024 cf = ct;
19025 ct = tmp;
19026 }
19027
19028 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19029 }
19030 else
19031 {
19032 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19033
19034 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19035 constm1_rtx,
19036 copy_rtx (out), 1, OPTAB_DIRECT);
19037 }
19038
19039 out = expand_simple_binop (mode, AND, copy_rtx (out),
19040 gen_int_mode (cf - ct, mode),
19041 copy_rtx (out), 1, OPTAB_DIRECT);
19042 if (ct)
19043 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19044 copy_rtx (out), 1, OPTAB_DIRECT);
19045 if (!rtx_equal_p (out, operands[0]))
19046 emit_move_insn (operands[0], copy_rtx (out));
19047
19048 return true;
19049 }
19050 }
19051
19052 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19053 {
19054 /* Try a few more things with specific constants and a variable. */
19055
19056 optab op;
19057 rtx var, orig_out, out, tmp;
19058
19059 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19060 return false;
19061
19062 /* If one of the two operands is an interesting constant, load a
19063 constant with the above and mask it in with a logical operation. */
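/* E.g. "cond ? 0 : var" is computed as "(cond ? 0 : -1) & var" and
   "cond ? -1 : var" as "(cond ? -1 : 0) | var", so the recursive call
   below only needs to materialize the all-zeros/all-ones value.  */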
19064
19065 if (CONST_INT_P (operands[2]))
19066 {
19067 var = operands[3];
19068 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19069 operands[3] = constm1_rtx, op = and_optab;
19070 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19071 operands[3] = const0_rtx, op = ior_optab;
19072 else
19073 return false;
19074 }
19075 else if (CONST_INT_P (operands[3]))
19076 {
19077 var = operands[2];
19078 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19079 operands[2] = constm1_rtx, op = and_optab;
19080 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19081 operands[2] = const0_rtx, op = ior_optab;
19082 else
19083 return false;
19084 }
19085 else
19086 return false;
19087
19088 orig_out = operands[0];
19089 tmp = gen_reg_rtx (mode);
19090 operands[0] = tmp;
19091
19092 /* Recurse to get the constant loaded. */
19093 if (ix86_expand_int_movcc (operands) == 0)
19094 return false;
19095
19096 /* Mask in the interesting variable. */
19097 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19098 OPTAB_WIDEN);
19099 if (!rtx_equal_p (out, orig_out))
19100 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19101
19102 return true;
19103 }
19104
19105 /*
19106 * For comparison with above,
19107 *
19108 * movl cf,dest
19109 * movl ct,tmp
19110 * cmpl op1,op2
19111 * cmovcc tmp,dest
19112 *
19113 * Size 15.
19114 */
19115
19116 if (! nonimmediate_operand (operands[2], mode))
19117 operands[2] = force_reg (mode, operands[2]);
19118 if (! nonimmediate_operand (operands[3], mode))
19119 operands[3] = force_reg (mode, operands[3]);
19120
19121 if (! register_operand (operands[2], VOIDmode)
19122 && (mode == QImode
19123 || ! register_operand (operands[3], VOIDmode)))
19124 operands[2] = force_reg (mode, operands[2]);
19125
19126 if (mode == QImode
19127 && ! register_operand (operands[3], VOIDmode))
19128 operands[3] = force_reg (mode, operands[3]);
19129
19130 emit_insn (compare_seq);
19131 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19132 gen_rtx_IF_THEN_ELSE (mode,
19133 compare_op, operands[2],
19134 operands[3])));
19135 return true;
19136 }
19137
19138 /* Swap, force into registers, or otherwise massage the two operands
19139 to an sse comparison with a mask result. Thus we differ a bit from
19140 ix86_prepare_fp_compare_args which expects to produce a flags result.
19141
19142 The DEST operand exists to help determine whether to commute commutative
19143 operators. The POP0/POP1 operands are updated in place. The new
19144 comparison code is returned, or UNKNOWN if not implementable. */
19145
19146 static enum rtx_code
19147 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19148 rtx *pop0, rtx *pop1)
19149 {
19150 rtx tmp;
19151
19152 switch (code)
19153 {
19154 case LTGT:
19155 case UNEQ:
19156 /* AVX supports all the needed comparisons. */
19157 if (TARGET_AVX)
19158 break;
19159 /* We have no LTGT as an operator. We could implement it with
19160 NE & ORDERED, but this requires an extra temporary. It's
19161 not clear that it's worth it. */
19162 return UNKNOWN;
19163
19164 case LT:
19165 case LE:
19166 case UNGT:
19167 case UNGE:
19168 /* These are supported directly. */
19169 break;
19170
19171 case EQ:
19172 case NE:
19173 case UNORDERED:
19174 case ORDERED:
19175 /* AVX has 3 operand comparisons, no need to swap anything. */
19176 if (TARGET_AVX)
19177 break;
19178 /* For commutative operators, try to canonicalize the destination
19179 operand to be first in the comparison - this helps reload to
19180 avoid extra moves. */
19181 if (!dest || !rtx_equal_p (dest, *pop1))
19182 break;
19183 /* FALLTHRU */
19184
19185 case GE:
19186 case GT:
19187 case UNLE:
19188 case UNLT:
19189 /* These are not supported directly before AVX, and furthermore
19190 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19191 comparison operands to transform into something that is
19192 supported. */
19193 tmp = *pop0;
19194 *pop0 = *pop1;
19195 *pop1 = tmp;
19196 code = swap_condition (code);
19197 break;
19198
19199 default:
19200 gcc_unreachable ();
19201 }
19202
19203 return code;
19204 }
19205
19206 /* Detect conditional moves that exactly match min/max operational
19207 semantics. Note that this is IEEE safe, as long as we don't
19208 interchange the operands.
19209
19210 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19211 and TRUE if the operation is successful and instructions are emitted. */
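/* The SSE min/max instructions return their second operand when the
   operands are unordered, and also when they compare equal (e.g. -0.0
   vs. +0.0), which is why the operand order established by the
   comparison must not be changed.  */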
19212
19213 static bool
19214 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19215 rtx cmp_op1, rtx if_true, rtx if_false)
19216 {
19217 enum machine_mode mode;
19218 bool is_min;
19219 rtx tmp;
19220
19221 if (code == LT)
19222 ;
19223 else if (code == UNGE)
19224 {
19225 tmp = if_true;
19226 if_true = if_false;
19227 if_false = tmp;
19228 }
19229 else
19230 return false;
19231
19232 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19233 is_min = true;
19234 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19235 is_min = false;
19236 else
19237 return false;
19238
19239 mode = GET_MODE (dest);
19240
19241 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19242 but MODE may be a vector mode and thus not appropriate. */
19243 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19244 {
19245 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19246 rtvec v;
19247
19248 if_true = force_reg (mode, if_true);
19249 v = gen_rtvec (2, if_true, if_false);
19250 tmp = gen_rtx_UNSPEC (mode, v, u);
19251 }
19252 else
19253 {
19254 code = is_min ? SMIN : SMAX;
19255 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19256 }
19257
19258 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19259 return true;
19260 }
19261
19262 /* Expand an sse vector comparison. Return the register with the result. */
19263
19264 static rtx
19265 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19266 rtx op_true, rtx op_false)
19267 {
19268 enum machine_mode mode = GET_MODE (dest);
19269 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19270 rtx x;
19271
19272 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19273 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19274 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19275
19276 if (optimize
19277 || reg_overlap_mentioned_p (dest, op_true)
19278 || reg_overlap_mentioned_p (dest, op_false))
19279 dest = gen_reg_rtx (mode);
19280
19281 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19282 if (cmp_mode != mode)
19283 {
19284 x = force_reg (cmp_mode, x);
19285 convert_move (dest, x, false);
19286 }
19287 else
19288 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19289
19290 return dest;
19291 }
19292
19293 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19294 operations. This is used for both scalar and vector conditional moves. */
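/* When no blend instruction is available the general pattern is
     dest = (cmp & op_true) | (~cmp & op_false)
   with CMP being an all-ones/all-zeros per-element mask; the special
   cases below simply drop whichever term is known to be zero or
   redundant.  */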
19295
19296 static void
19297 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19298 {
19299 enum machine_mode mode = GET_MODE (dest);
19300 rtx t2, t3, x;
19301
19302 if (vector_all_ones_operand (op_true, mode)
19303 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19304 {
19305 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19306 }
19307 else if (op_false == CONST0_RTX (mode))
19308 {
19309 op_true = force_reg (mode, op_true);
19310 x = gen_rtx_AND (mode, cmp, op_true);
19311 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19312 }
19313 else if (op_true == CONST0_RTX (mode))
19314 {
19315 op_false = force_reg (mode, op_false);
19316 x = gen_rtx_NOT (mode, cmp);
19317 x = gen_rtx_AND (mode, x, op_false);
19318 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19319 }
19320 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19321 {
19322 op_false = force_reg (mode, op_false);
19323 x = gen_rtx_IOR (mode, cmp, op_false);
19324 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19325 }
19326 else if (TARGET_XOP)
19327 {
19328 op_true = force_reg (mode, op_true);
19329
19330 if (!nonimmediate_operand (op_false, mode))
19331 op_false = force_reg (mode, op_false);
19332
19333 emit_insn (gen_rtx_SET (mode, dest,
19334 gen_rtx_IF_THEN_ELSE (mode, cmp,
19335 op_true,
19336 op_false)));
19337 }
19338 else
19339 {
19340 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19341
19342 if (!nonimmediate_operand (op_true, mode))
19343 op_true = force_reg (mode, op_true);
19344
19345 op_false = force_reg (mode, op_false);
19346
19347 switch (mode)
19348 {
19349 case V4SFmode:
19350 if (TARGET_SSE4_1)
19351 gen = gen_sse4_1_blendvps;
19352 break;
19353 case V2DFmode:
19354 if (TARGET_SSE4_1)
19355 gen = gen_sse4_1_blendvpd;
19356 break;
19357 case V16QImode:
19358 case V8HImode:
19359 case V4SImode:
19360 case V2DImode:
19361 if (TARGET_SSE4_1)
19362 {
19363 gen = gen_sse4_1_pblendvb;
19364 dest = gen_lowpart (V16QImode, dest);
19365 op_false = gen_lowpart (V16QImode, op_false);
19366 op_true = gen_lowpart (V16QImode, op_true);
19367 cmp = gen_lowpart (V16QImode, cmp);
19368 }
19369 break;
19370 case V8SFmode:
19371 if (TARGET_AVX)
19372 gen = gen_avx_blendvps256;
19373 break;
19374 case V4DFmode:
19375 if (TARGET_AVX)
19376 gen = gen_avx_blendvpd256;
19377 break;
19378 case V32QImode:
19379 case V16HImode:
19380 case V8SImode:
19381 case V4DImode:
19382 if (TARGET_AVX2)
19383 {
19384 gen = gen_avx2_pblendvb;
19385 dest = gen_lowpart (V32QImode, dest);
19386 op_false = gen_lowpart (V32QImode, op_false);
19387 op_true = gen_lowpart (V32QImode, op_true);
19388 cmp = gen_lowpart (V32QImode, cmp);
19389 }
19390 break;
19391 default:
19392 break;
19393 }
19394
19395 if (gen != NULL)
19396 emit_insn (gen (dest, op_false, op_true, cmp));
19397 else
19398 {
19399 op_true = force_reg (mode, op_true);
19400
19401 t2 = gen_reg_rtx (mode);
19402 if (optimize)
19403 t3 = gen_reg_rtx (mode);
19404 else
19405 t3 = dest;
19406
19407 x = gen_rtx_AND (mode, op_true, cmp);
19408 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19409
19410 x = gen_rtx_NOT (mode, cmp);
19411 x = gen_rtx_AND (mode, x, op_false);
19412 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19413
19414 x = gen_rtx_IOR (mode, t3, t2);
19415 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19416 }
19417 }
19418 }
19419
19420 /* Expand a floating-point conditional move. Return true if successful. */
19421
19422 bool
19423 ix86_expand_fp_movcc (rtx operands[])
19424 {
19425 enum machine_mode mode = GET_MODE (operands[0]);
19426 enum rtx_code code = GET_CODE (operands[1]);
19427 rtx tmp, compare_op;
19428 rtx op0 = XEXP (operands[1], 0);
19429 rtx op1 = XEXP (operands[1], 1);
19430
19431 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19432 {
19433 enum machine_mode cmode;
19434
19435 /* Since we've no cmove for sse registers, don't force bad register
19436 allocation just to gain access to it. Deny movcc when the
19437 comparison mode doesn't match the move mode. */
19438 cmode = GET_MODE (op0);
19439 if (cmode == VOIDmode)
19440 cmode = GET_MODE (op1);
19441 if (cmode != mode)
19442 return false;
19443
19444 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19445 if (code == UNKNOWN)
19446 return false;
19447
19448 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19449 operands[2], operands[3]))
19450 return true;
19451
19452 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19453 operands[2], operands[3]);
19454 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19455 return true;
19456 }
19457
19458 /* The floating point conditional move instructions don't directly
19459 support conditions resulting from a signed integer comparison. */
19460
19461 compare_op = ix86_expand_compare (code, op0, op1);
19462 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19463 {
19464 tmp = gen_reg_rtx (QImode);
19465 ix86_expand_setcc (tmp, code, op0, op1);
19466
19467 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19468 }
19469
19470 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19471 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19472 operands[2], operands[3])));
19473
19474 return true;
19475 }
19476
19477 /* Expand a floating-point vector conditional move; a vcond operation
19478 rather than a movcc operation. */
19479
19480 bool
19481 ix86_expand_fp_vcond (rtx operands[])
19482 {
19483 enum rtx_code code = GET_CODE (operands[3]);
19484 rtx cmp;
19485
19486 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19487 &operands[4], &operands[5]);
19488 if (code == UNKNOWN)
19489 {
19490 rtx temp;
19491 switch (GET_CODE (operands[3]))
19492 {
19493 case LTGT:
19494 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19495 operands[5], operands[0], operands[0]);
19496 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19497 operands[5], operands[1], operands[2]);
19498 code = AND;
19499 break;
19500 case UNEQ:
19501 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19502 operands[5], operands[0], operands[0]);
19503 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19504 operands[5], operands[1], operands[2]);
19505 code = IOR;
19506 break;
19507 default:
19508 gcc_unreachable ();
19509 }
19510 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19511 OPTAB_DIRECT);
19512 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19513 return true;
19514 }
19515
19516 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19517 operands[5], operands[1], operands[2]))
19518 return true;
19519
19520 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19521 operands[1], operands[2]);
19522 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19523 return true;
19524 }
19525
19526 /* Expand a signed/unsigned integral vector conditional move. */
19527
19528 bool
19529 ix86_expand_int_vcond (rtx operands[])
19530 {
19531 enum machine_mode data_mode = GET_MODE (operands[0]);
19532 enum machine_mode mode = GET_MODE (operands[4]);
19533 enum rtx_code code = GET_CODE (operands[3]);
19534 bool negate = false;
19535 rtx x, cop0, cop1;
19536
19537 cop0 = operands[4];
19538 cop1 = operands[5];
19539
19540 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19541 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19542 if ((code == LT || code == GE)
19543 && data_mode == mode
19544 && cop1 == CONST0_RTX (mode)
19545 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19546 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19547 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19548 && (GET_MODE_SIZE (data_mode) == 16
19549 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19550 {
19551 rtx negop = operands[2 - (code == LT)];
19552 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19553 if (negop == CONST1_RTX (data_mode))
19554 {
19555 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19556 operands[0], 1, OPTAB_DIRECT);
19557 if (res != operands[0])
19558 emit_move_insn (operands[0], res);
19559 return true;
19560 }
19561 else if (GET_MODE_INNER (data_mode) != DImode
19562 && vector_all_ones_operand (negop, data_mode))
19563 {
19564 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19565 operands[0], 0, OPTAB_DIRECT);
19566 if (res != operands[0])
19567 emit_move_insn (operands[0], res);
19568 return true;
19569 }
19570 }
19571
19572 if (!nonimmediate_operand (cop1, mode))
19573 cop1 = force_reg (mode, cop1);
19574 if (!general_operand (operands[1], data_mode))
19575 operands[1] = force_reg (data_mode, operands[1]);
19576 if (!general_operand (operands[2], data_mode))
19577 operands[2] = force_reg (data_mode, operands[2]);
19578
19579 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19580 if (TARGET_XOP
19581 && (mode == V16QImode || mode == V8HImode
19582 || mode == V4SImode || mode == V2DImode))
19583 ;
19584 else
19585 {
19586 /* Canonicalize the comparison to EQ, GT, GTU. */
19587 switch (code)
19588 {
19589 case EQ:
19590 case GT:
19591 case GTU:
19592 break;
19593
19594 case NE:
19595 case LE:
19596 case LEU:
19597 code = reverse_condition (code);
19598 negate = true;
19599 break;
19600
19601 case GE:
19602 case GEU:
19603 code = reverse_condition (code);
19604 negate = true;
19605 /* FALLTHRU */
19606
19607 case LT:
19608 case LTU:
19609 code = swap_condition (code);
19610 x = cop0, cop0 = cop1, cop1 = x;
19611 break;
19612
19613 default:
19614 gcc_unreachable ();
19615 }
19616
19617 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19618 if (mode == V2DImode)
19619 {
19620 switch (code)
19621 {
19622 case EQ:
19623 /* SSE4.1 supports EQ. */
19624 if (!TARGET_SSE4_1)
19625 return false;
19626 break;
19627
19628 case GT:
19629 case GTU:
19630 /* SSE4.2 supports GT/GTU. */
19631 if (!TARGET_SSE4_2)
19632 return false;
19633 break;
19634
19635 default:
19636 gcc_unreachable ();
19637 }
19638 }
19639
19640 /* Unsigned parallel compare is not supported by the hardware.
19641 Play some tricks to turn this into a signed comparison
19642 against 0. */
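/* For the dword/qword elements "a GTU b" is turned into the signed test
   "(a - INT_MIN) GT (b - INT_MIN)", i.e. both operands get their sign
   bit flipped; for the byte/word elements it becomes
   "(a minus b, unsigned saturating) NE 0", implemented as an EQ compare
   whose mask is negated afterwards.  */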
19643 if (code == GTU)
19644 {
19645 cop0 = force_reg (mode, cop0);
19646
19647 switch (mode)
19648 {
19649 case V8SImode:
19650 case V4DImode:
19651 case V4SImode:
19652 case V2DImode:
19653 {
19654 rtx t1, t2, mask;
19655 rtx (*gen_sub3) (rtx, rtx, rtx);
19656
19657 switch (mode)
19658 {
19659 case V8SImode: gen_sub3 = gen_subv8si3; break;
19660 case V4DImode: gen_sub3 = gen_subv4di3; break;
19661 case V4SImode: gen_sub3 = gen_subv4si3; break;
19662 case V2DImode: gen_sub3 = gen_subv2di3; break;
19663 default:
19664 gcc_unreachable ();
19665 }
19666 /* Subtract (-(INT MAX) - 1) from both operands to make
19667 them signed. */
19668 mask = ix86_build_signbit_mask (mode, true, false);
19669 t1 = gen_reg_rtx (mode);
19670 emit_insn (gen_sub3 (t1, cop0, mask));
19671
19672 t2 = gen_reg_rtx (mode);
19673 emit_insn (gen_sub3 (t2, cop1, mask));
19674
19675 cop0 = t1;
19676 cop1 = t2;
19677 code = GT;
19678 }
19679 break;
19680
19681 case V32QImode:
19682 case V16HImode:
19683 case V16QImode:
19684 case V8HImode:
19685 /* Perform a parallel unsigned saturating subtraction. */
19686 x = gen_reg_rtx (mode);
19687 emit_insn (gen_rtx_SET (VOIDmode, x,
19688 gen_rtx_US_MINUS (mode, cop0, cop1)));
19689
19690 cop0 = x;
19691 cop1 = CONST0_RTX (mode);
19692 code = EQ;
19693 negate = !negate;
19694 break;
19695
19696 default:
19697 gcc_unreachable ();
19698 }
19699 }
19700 }
19701
19702 /* Allow the comparison to be done in one mode, but the movcc to
19703 happen in another mode. */
19704 if (data_mode == mode)
19705 {
19706 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19707 operands[1+negate], operands[2-negate]);
19708 }
19709 else
19710 {
19711 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19712 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19713 code, cop0, cop1,
19714 operands[1+negate], operands[2-negate]);
19715 x = gen_lowpart (data_mode, x);
19716 }
19717
19718 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19719 operands[2-negate]);
19720 return true;
19721 }
19722
19723 /* Expand a variable vector permutation. */
19724
19725 void
19726 ix86_expand_vec_perm (rtx operands[])
19727 {
19728 rtx target = operands[0];
19729 rtx op0 = operands[1];
19730 rtx op1 = operands[2];
19731 rtx mask = operands[3];
19732 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19733 enum machine_mode mode = GET_MODE (op0);
19734 enum machine_mode maskmode = GET_MODE (mask);
19735 int w, e, i;
19736 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19737
19738 /* Number of elements in the vector. */
19739 w = GET_MODE_NUNITS (mode);
19740 e = GET_MODE_UNIT_SIZE (mode);
19741 gcc_assert (w <= 32);
19742
19743 if (TARGET_AVX2)
19744 {
19745 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19746 {
19747 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19748 a constant shuffle operand. With a tiny bit of effort we can
19749 use VPERMD instead. A re-interpretation stall for V4DFmode is
19750 unfortunate but there's no avoiding it.
19751 Similarly, for V16HImode we don't have instructions for variable
19752 shuffling, while for V32QImode we can, after preparing suitable
19753 masks, use vpshufb; vpshufb; vpermq; vpor. */
19754
19755 if (mode == V16HImode)
19756 {
19757 maskmode = mode = V32QImode;
19758 w = 32;
19759 e = 1;
19760 }
19761 else
19762 {
19763 maskmode = mode = V8SImode;
19764 w = 8;
19765 e = 4;
19766 }
19767 t1 = gen_reg_rtx (maskmode);
19768
19769 /* Replicate the low bits of the V4DImode mask into V8SImode:
19770 mask = { A B C D }
19771 t1 = { A A B B C C D D }. */
19772 for (i = 0; i < w / 2; ++i)
19773 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19774 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19775 vt = force_reg (maskmode, vt);
19776 mask = gen_lowpart (maskmode, mask);
19777 if (maskmode == V8SImode)
19778 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19779 else
19780 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19781
19782 /* Multiply the shuffle indices by two. */
19783 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19784 OPTAB_DIRECT);
19785
19786 /* Add one to the odd shuffle indices:
19787 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19788 for (i = 0; i < w / 2; ++i)
19789 {
19790 vec[i * 2] = const0_rtx;
19791 vec[i * 2 + 1] = const1_rtx;
19792 }
19793 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19794 vt = force_const_mem (maskmode, vt);
19795 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19796 OPTAB_DIRECT);
19797
19798 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19799 operands[3] = mask = t1;
19800 target = gen_lowpart (mode, target);
19801 op0 = gen_lowpart (mode, op0);
19802 op1 = gen_lowpart (mode, op1);
19803 }
19804
19805 switch (mode)
19806 {
19807 case V8SImode:
19808 /* The VPERMD and VPERMPS instructions already properly ignore
19809 the high bits of the shuffle elements. No need for us to
19810 perform an AND ourselves. */
19811 if (one_operand_shuffle)
19812 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19813 else
19814 {
19815 t1 = gen_reg_rtx (V8SImode);
19816 t2 = gen_reg_rtx (V8SImode);
19817 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19818 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19819 goto merge_two;
19820 }
19821 return;
19822
19823 case V8SFmode:
19824 mask = gen_lowpart (V8SFmode, mask);
19825 if (one_operand_shuffle)
19826 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19827 else
19828 {
19829 t1 = gen_reg_rtx (V8SFmode);
19830 t2 = gen_reg_rtx (V8SFmode);
19831 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19832 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19833 goto merge_two;
19834 }
19835 return;
19836
19837 case V4SImode:
19838 /* By combining the two 128-bit input vectors into one 256-bit
19839 input vector, we can use VPERMD and VPERMPS for the full
19840 two-operand shuffle. */
19841 t1 = gen_reg_rtx (V8SImode);
19842 t2 = gen_reg_rtx (V8SImode);
19843 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19844 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19845 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19846 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19847 return;
19848
19849 case V4SFmode:
19850 t1 = gen_reg_rtx (V8SFmode);
19851 t2 = gen_reg_rtx (V8SFmode);
19852 mask = gen_lowpart (V4SFmode, mask);
19853 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19854 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19855 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19856 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19857 return;
19858
19859 case V32QImode:
19860 t1 = gen_reg_rtx (V32QImode);
19861 t2 = gen_reg_rtx (V32QImode);
19862 t3 = gen_reg_rtx (V32QImode);
19863 vt2 = GEN_INT (128);
19864 for (i = 0; i < 32; i++)
19865 vec[i] = vt2;
19866 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19867 vt = force_reg (V32QImode, vt);
19868 for (i = 0; i < 32; i++)
19869 vec[i] = i < 16 ? vt2 : const0_rtx;
19870 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19871 vt2 = force_reg (V32QImode, vt2);
19872 /* From mask create two adjusted masks, which contain the same
19873 bits as mask in the low 7 bits of each vector element.
19874 The first mask will have the most significant bit clear
19875 if it requests element from the same 128-bit lane
19876 and MSB set if it requests element from the other 128-bit lane.
19877 The second mask will have the opposite values of the MSB,
19878 and additionally will have its 128-bit lanes swapped.
19879 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19880 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19881 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19882 stands for the other 12 bytes. */
19883 /* The bit that says whether an element is from the same lane or the
19884 other lane is bit 4, so shift it up by 3 to the MSB position. */
19885 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19886 gen_lowpart (V4DImode, mask),
19887 GEN_INT (3)));
19888 /* Clear MSB bits from the mask just in case it had them set. */
19889 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19890 /* After this t1 will have MSB set for elements from other lane. */
19891 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19892 /* Clear bits other than MSB. */
19893 emit_insn (gen_andv32qi3 (t1, t1, vt));
19894 /* Or in the lower bits from mask into t3. */
19895 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19896 /* And invert MSB bits in t1, so MSB is set for elements from the same
19897 lane. */
19898 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19899 /* Swap 128-bit lanes in t3. */
19900 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19901 gen_lowpart (V4DImode, t3),
19902 const2_rtx, GEN_INT (3),
19903 const0_rtx, const1_rtx));
19904 /* And or in the lower bits from mask into t1. */
19905 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19906 if (one_operand_shuffle)
19907 {
19908 /* Each of these shuffles will put 0s in places where an
19909 element from the other 128-bit lane is needed; otherwise it
19910 will shuffle in the requested value. */
19911 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19912 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19913 /* For t3 the 128-bit lanes are swapped again. */
19914 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19915 gen_lowpart (V4DImode, t3),
19916 const2_rtx, GEN_INT (3),
19917 const0_rtx, const1_rtx));
19918 /* And oring both together leads to the result. */
19919 emit_insn (gen_iorv32qi3 (target, t1, t3));
19920 return;
19921 }
19922
19923 t4 = gen_reg_rtx (V32QImode);
19924 /* Similar to the one_operand_shuffle code above,
19925 just repeated twice, once for each operand. The merge_two:
19926 code will merge the two results together. */
19927 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19928 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19929 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19930 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19931 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19932 gen_lowpart (V4DImode, t4),
19933 const2_rtx, GEN_INT (3),
19934 const0_rtx, const1_rtx));
19935 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19936 gen_lowpart (V4DImode, t3),
19937 const2_rtx, GEN_INT (3),
19938 const0_rtx, const1_rtx));
19939 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19940 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19941 t1 = t4;
19942 t2 = t3;
19943 goto merge_two;
19944
19945 default:
19946 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19947 break;
19948 }
19949 }
19950
19951 if (TARGET_XOP)
19952 {
19953 /* The XOP VPPERM insn supports three inputs. By ignoring the
19954 one_operand_shuffle special case, we avoid creating another
19955 set of constant vectors in memory. */
19956 one_operand_shuffle = false;
19957
19958 /* mask = mask & {2*w-1, ...} */
19959 vt = GEN_INT (2*w - 1);
19960 }
19961 else
19962 {
19963 /* mask = mask & {w-1, ...} */
19964 vt = GEN_INT (w - 1);
19965 }
19966
19967 for (i = 0; i < w; i++)
19968 vec[i] = vt;
19969 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19970 mask = expand_simple_binop (maskmode, AND, mask, vt,
19971 NULL_RTX, 0, OPTAB_DIRECT);
19972
19973 /* For non-QImode operations, convert the word permutation control
19974 into a byte permutation control. */
19975 if (mode != V16QImode)
19976 {
19977 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19978 GEN_INT (exact_log2 (e)),
19979 NULL_RTX, 0, OPTAB_DIRECT);
19980
19981 /* Convert mask to vector of chars. */
19982 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19983
19984 /* Replicate each of the input bytes into byte positions:
19985 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19986 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19987 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19988 for (i = 0; i < 16; ++i)
19989 vec[i] = GEN_INT (i/e * e);
19990 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19991 vt = force_const_mem (V16QImode, vt);
19992 if (TARGET_XOP)
19993 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19994 else
19995 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19996
19997 /* Convert it into the byte positions by doing
19998 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19999 for (i = 0; i < 16; ++i)
20000 vec[i] = GEN_INT (i % e);
20001 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20002 vt = force_const_mem (V16QImode, vt);
20003 emit_insn (gen_addv16qi3 (mask, mask, vt));
20004 }
20005
20006 /* The actual shuffle operations all operate on V16QImode. */
20007 op0 = gen_lowpart (V16QImode, op0);
20008 op1 = gen_lowpart (V16QImode, op1);
20009 target = gen_lowpart (V16QImode, target);
20010
20011 if (TARGET_XOP)
20012 {
20013 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20014 }
20015 else if (one_operand_shuffle)
20016 {
20017 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20018 }
20019 else
20020 {
20021 rtx xops[6];
20022 bool ok;
20023
20024 /* Shuffle the two input vectors independently. */
20025 t1 = gen_reg_rtx (V16QImode);
20026 t2 = gen_reg_rtx (V16QImode);
20027 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20028 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20029
20030 merge_two:
20031 /* Then merge them together. The key is whether any given control
20032 element contained a bit set that indicates the second word. */
20033 mask = operands[3];
20034 vt = GEN_INT (w);
20035 if (maskmode == V2DImode && !TARGET_SSE4_1)
20036 {
20037 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20038 more shuffle to convert the V2DI input mask into a V4SI
20039 input mask, at which point the masking done by
20040 expand_int_vcond will work as desired. */
20041 rtx t3 = gen_reg_rtx (V4SImode);
20042 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20043 const0_rtx, const0_rtx,
20044 const2_rtx, const2_rtx));
20045 mask = t3;
20046 maskmode = V4SImode;
20047 e = w = 4;
20048 }
20049
20050 for (i = 0; i < w; i++)
20051 vec[i] = vt;
20052 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20053 vt = force_reg (maskmode, vt);
20054 mask = expand_simple_binop (maskmode, AND, mask, vt,
20055 NULL_RTX, 0, OPTAB_DIRECT);
20056
20057 xops[0] = gen_lowpart (mode, operands[0]);
20058 xops[1] = gen_lowpart (mode, t2);
20059 xops[2] = gen_lowpart (mode, t1);
20060 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20061 xops[4] = mask;
20062 xops[5] = vt;
20063 ok = ix86_expand_int_vcond (xops);
20064 gcc_assert (ok);
20065 }
20066 }
20067
20068 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20069 true if we should do zero extension, else sign extension. HIGH_P is
20070 true if we want the N/2 high elements, else the low elements. */
20071
20072 void
20073 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20074 {
20075 enum machine_mode imode = GET_MODE (operands[1]);
20076 rtx tmp, dest;
20077
20078 if (TARGET_SSE4_1)
20079 {
20080 rtx (*unpack)(rtx, rtx);
20081 rtx (*extract)(rtx, rtx) = NULL;
20082 enum machine_mode halfmode = BLKmode;
20083
20084 switch (imode)
20085 {
20086 case V32QImode:
20087 if (unsigned_p)
20088 unpack = gen_avx2_zero_extendv16qiv16hi2;
20089 else
20090 unpack = gen_avx2_sign_extendv16qiv16hi2;
20091 halfmode = V16QImode;
20092 extract
20093 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20094 break;
20095 case V16HImode:
20096 if (unsigned_p)
20097 unpack = gen_avx2_zero_extendv8hiv8si2;
20098 else
20099 unpack = gen_avx2_sign_extendv8hiv8si2;
20100 halfmode = V8HImode;
20101 extract
20102 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20103 break;
20104 case V8SImode:
20105 if (unsigned_p)
20106 unpack = gen_avx2_zero_extendv4siv4di2;
20107 else
20108 unpack = gen_avx2_sign_extendv4siv4di2;
20109 halfmode = V4SImode;
20110 extract
20111 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20112 break;
20113 case V16QImode:
20114 if (unsigned_p)
20115 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20116 else
20117 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20118 break;
20119 case V8HImode:
20120 if (unsigned_p)
20121 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20122 else
20123 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20124 break;
20125 case V4SImode:
20126 if (unsigned_p)
20127 unpack = gen_sse4_1_zero_extendv2siv2di2;
20128 else
20129 unpack = gen_sse4_1_sign_extendv2siv2di2;
20130 break;
20131 default:
20132 gcc_unreachable ();
20133 }
20134
20135 if (GET_MODE_SIZE (imode) == 32)
20136 {
20137 tmp = gen_reg_rtx (halfmode);
20138 emit_insn (extract (tmp, operands[1]));
20139 }
20140 else if (high_p)
20141 {
20142 /* Shift higher 8 bytes to lower 8 bytes. */
20143 tmp = gen_reg_rtx (imode);
20144 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20145 gen_lowpart (V1TImode, operands[1]),
20146 GEN_INT (64)));
20147 }
20148 else
20149 tmp = operands[1];
20150
20151 emit_insn (unpack (operands[0], tmp));
20152 }
20153 else
20154 {
20155 rtx (*unpack)(rtx, rtx, rtx);
20156
20157 switch (imode)
20158 {
20159 case V16QImode:
20160 if (high_p)
20161 unpack = gen_vec_interleave_highv16qi;
20162 else
20163 unpack = gen_vec_interleave_lowv16qi;
20164 break;
20165 case V8HImode:
20166 if (high_p)
20167 unpack = gen_vec_interleave_highv8hi;
20168 else
20169 unpack = gen_vec_interleave_lowv8hi;
20170 break;
20171 case V4SImode:
20172 if (high_p)
20173 unpack = gen_vec_interleave_highv4si;
20174 else
20175 unpack = gen_vec_interleave_lowv4si;
20176 break;
20177 default:
20178 gcc_unreachable ();
20179 }
20180
20181 dest = gen_lowpart (imode, operands[0]);
20182
20183 if (unsigned_p)
20184 tmp = force_reg (imode, CONST0_RTX (imode));
20185 else
20186 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20187 operands[1], pc_rtx, pc_rtx);
20188
20189 emit_insn (unpack (dest, operands[1], tmp));
20190 }
20191 }
20192
20193 /* Expand conditional increment or decrement using adc/sbb instructions.
20194 The default case using setcc followed by a conditional move can be
20195 done by generic code. */
20196 bool
20197 ix86_expand_int_addcc (rtx operands[])
20198 {
20199 enum rtx_code code = GET_CODE (operands[1]);
20200 rtx flags;
20201 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20202 rtx compare_op;
20203 rtx val = const0_rtx;
20204 bool fpcmp = false;
20205 enum machine_mode mode;
20206 rtx op0 = XEXP (operands[1], 0);
20207 rtx op1 = XEXP (operands[1], 1);
20208
20209 if (operands[3] != const1_rtx
20210 && operands[3] != constm1_rtx)
20211 return false;
20212 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20213 return false;
20214 code = GET_CODE (compare_op);
20215
20216 flags = XEXP (compare_op, 0);
20217
20218 if (GET_MODE (flags) == CCFPmode
20219 || GET_MODE (flags) == CCFPUmode)
20220 {
20221 fpcmp = true;
20222 code = ix86_fp_compare_code_to_integer (code);
20223 }
20224
20225 if (code != LTU)
20226 {
20227 val = constm1_rtx;
20228 if (fpcmp)
20229 PUT_CODE (compare_op,
20230 reverse_condition_maybe_unordered
20231 (GET_CODE (compare_op)));
20232 else
20233 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20234 }
20235
20236 mode = GET_MODE (operands[0]);
20237
20238 /* Construct either adc or sbb insn. */
20239 if ((code == LTU) == (operands[3] == constm1_rtx))
20240 {
20241 switch (mode)
20242 {
20243 case QImode:
20244 insn = gen_subqi3_carry;
20245 break;
20246 case HImode:
20247 insn = gen_subhi3_carry;
20248 break;
20249 case SImode:
20250 insn = gen_subsi3_carry;
20251 break;
20252 case DImode:
20253 insn = gen_subdi3_carry;
20254 break;
20255 default:
20256 gcc_unreachable ();
20257 }
20258 }
20259 else
20260 {
20261 switch (mode)
20262 {
20263 case QImode:
20264 insn = gen_addqi3_carry;
20265 break;
20266 case HImode:
20267 insn = gen_addhi3_carry;
20268 break;
20269 case SImode:
20270 insn = gen_addsi3_carry;
20271 break;
20272 case DImode:
20273 insn = gen_adddi3_carry;
20274 break;
20275 default:
20276 gcc_unreachable ();
20277 }
20278 }
20279 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20280
20281 return true;
20282 }
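/* For instance, a conditional increment such as "x += (unsigned) a < b"
   typically ends up as something like (register assignments illustrative):

	cmpl	%esi, %edi	# sets CF iff a < b (unsigned)
	adcl	$0, %eax	# x += CF

   i.e. the setcc/cmov sequence is replaced by a single adc.  */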
20283
20284
20285 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20286 but works for floating-point parameters and non-offsettable memories.
20287 For pushes, it returns just stack offsets; the values will be saved
20288 in the right order. At most four parts are generated. */
20289
20290 static int
20291 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20292 {
20293 int size;
20294
20295 if (!TARGET_64BIT)
20296 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20297 else
20298 size = (GET_MODE_SIZE (mode) + 4) / 8;
20299
20300 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20301 gcc_assert (size >= 2 && size <= 4);
20302
20303 /* Optimize constant pool references to immediates. This is used by fp
20304 moves, which force all constants to memory to allow combining. */
20305 if (MEM_P (operand) && MEM_READONLY_P (operand))
20306 {
20307 rtx tmp = maybe_get_pool_constant (operand);
20308 if (tmp)
20309 operand = tmp;
20310 }
20311
20312 if (MEM_P (operand) && !offsettable_memref_p (operand))
20313 {
20314 /* The only non-offsettable memories we handle are pushes. */
20315 int ok = push_operand (operand, VOIDmode);
20316
20317 gcc_assert (ok);
20318
20319 operand = copy_rtx (operand);
20320 PUT_MODE (operand, word_mode);
20321 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20322 return size;
20323 }
20324
20325 if (GET_CODE (operand) == CONST_VECTOR)
20326 {
20327 enum machine_mode imode = int_mode_for_mode (mode);
20328 /* Caution: if we looked through a constant pool memory above,
20329 the operand may actually have a different mode now. That's
20330 ok, since we want to pun this all the way back to an integer. */
20331 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20332 gcc_assert (operand != NULL);
20333 mode = imode;
20334 }
20335
20336 if (!TARGET_64BIT)
20337 {
20338 if (mode == DImode)
20339 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20340 else
20341 {
20342 int i;
20343
20344 if (REG_P (operand))
20345 {
20346 gcc_assert (reload_completed);
20347 for (i = 0; i < size; i++)
20348 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20349 }
20350 else if (offsettable_memref_p (operand))
20351 {
20352 operand = adjust_address (operand, SImode, 0);
20353 parts[0] = operand;
20354 for (i = 1; i < size; i++)
20355 parts[i] = adjust_address (operand, SImode, 4 * i);
20356 }
20357 else if (GET_CODE (operand) == CONST_DOUBLE)
20358 {
20359 REAL_VALUE_TYPE r;
20360 long l[4];
20361
20362 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20363 switch (mode)
20364 {
20365 case TFmode:
20366 real_to_target (l, &r, mode);
20367 parts[3] = gen_int_mode (l[3], SImode);
20368 parts[2] = gen_int_mode (l[2], SImode);
20369 break;
20370 case XFmode:
20371 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20372 parts[2] = gen_int_mode (l[2], SImode);
20373 break;
20374 case DFmode:
20375 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20376 break;
20377 default:
20378 gcc_unreachable ();
20379 }
20380 parts[1] = gen_int_mode (l[1], SImode);
20381 parts[0] = gen_int_mode (l[0], SImode);
20382 }
20383 else
20384 gcc_unreachable ();
20385 }
20386 }
20387 else
20388 {
20389 if (mode == TImode)
20390 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20391 if (mode == XFmode || mode == TFmode)
20392 {
20393 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20394 if (REG_P (operand))
20395 {
20396 gcc_assert (reload_completed);
20397 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20398 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20399 }
20400 else if (offsettable_memref_p (operand))
20401 {
20402 operand = adjust_address (operand, DImode, 0);
20403 parts[0] = operand;
20404 parts[1] = adjust_address (operand, upper_mode, 8);
20405 }
20406 else if (GET_CODE (operand) == CONST_DOUBLE)
20407 {
20408 REAL_VALUE_TYPE r;
20409 long l[4];
20410
20411 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20412 real_to_target (l, &r, mode);
20413
20414 /* Avoid a shift by 32 so as not to trigger a warning on 32-bit hosts. */
20415 if (HOST_BITS_PER_WIDE_INT >= 64)
20416 parts[0]
20417 = gen_int_mode
20418 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20419 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20420 DImode);
20421 else
20422 parts[0] = immed_double_const (l[0], l[1], DImode);
20423
20424 if (upper_mode == SImode)
20425 parts[1] = gen_int_mode (l[2], SImode);
20426 else if (HOST_BITS_PER_WIDE_INT >= 64)
20427 parts[1]
20428 = gen_int_mode
20429 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20430 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20431 DImode);
20432 else
20433 parts[1] = immed_double_const (l[2], l[3], DImode);
20434 }
20435 else
20436 gcc_unreachable ();
20437 }
20438 }
20439
20440 return size;
20441 }
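/* Example: on a 32-bit target a DFmode CONST_DOUBLE is returned as two
   SImode immediates holding the low and high 32 bits of its target
   representation; an XFmode value yields three parts, TFmode four.  */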
20442
20443 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20444 The value is split into parts: operands 2 onward receive the
20445 destination parts in the correct order, operands 6 onward the
20446 corresponding source parts, and the required moves are emitted. */
20447
20448 void
20449 ix86_split_long_move (rtx operands[])
20450 {
20451 rtx part[2][4];
20452 int nparts, i, j;
20453 int push = 0;
20454 int collisions = 0;
20455 enum machine_mode mode = GET_MODE (operands[0]);
20456 bool collisionparts[4];
20457
20458 /* The DFmode expanders may ask us to move a double.
20459 For a 64-bit target this is a single move. By hiding that fact
20460 here we simplify the i386.md splitters. */
20461 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20462 {
20463 /* Optimize constant pool references to immediates. This is used by
20464 fp moves, which force all constants to memory to allow combining. */
20465
20466 if (MEM_P (operands[1])
20467 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20468 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20469 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20470 if (push_operand (operands[0], VOIDmode))
20471 {
20472 operands[0] = copy_rtx (operands[0]);
20473 PUT_MODE (operands[0], word_mode);
20474 }
20475 else
20476 operands[0] = gen_lowpart (DImode, operands[0]);
20477 operands[1] = gen_lowpart (DImode, operands[1]);
20478 emit_move_insn (operands[0], operands[1]);
20479 return;
20480 }
20481
20482 /* The only non-offsettable memory we handle is push. */
20483 if (push_operand (operands[0], VOIDmode))
20484 push = 1;
20485 else
20486 gcc_assert (!MEM_P (operands[0])
20487 || offsettable_memref_p (operands[0]));
20488
20489 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20490 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20491
20492 /* When emitting a push, take care with source operands on the stack. */
20493 if (push && MEM_P (operands[1])
20494 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20495 {
20496 rtx src_base = XEXP (part[1][nparts - 1], 0);
20497
20498 /* Compensate for the stack decrement by 4. */
20499 if (!TARGET_64BIT && nparts == 3
20500 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20501 src_base = plus_constant (src_base, 4);
20502
20503 /* src_base refers to the stack pointer and is
20504 automatically decreased by emitted push. */
20505 for (i = 0; i < nparts; i++)
20506 part[1][i] = change_address (part[1][i],
20507 GET_MODE (part[1][i]), src_base);
20508 }
20509
20510 /* We need to do copy in the right order in case an address register
20511 of the source overlaps the destination. */
20512 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20513 {
20514 rtx tmp;
20515
20516 for (i = 0; i < nparts; i++)
20517 {
20518 collisionparts[i]
20519 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20520 if (collisionparts[i])
20521 collisions++;
20522 }
20523
20524 /* Collision in the middle part can be handled by reordering. */
20525 if (collisions == 1 && nparts == 3 && collisionparts [1])
20526 {
20527 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20528 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20529 }
20530 else if (collisions == 1
20531 && nparts == 4
20532 && (collisionparts [1] || collisionparts [2]))
20533 {
20534 if (collisionparts [1])
20535 {
20536 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20537 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20538 }
20539 else
20540 {
20541 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20542 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20543 }
20544 }
20545
20546 /* If there are more collisions, we can't handle them by reordering.
20547 Do an lea into the last part and use only one colliding move. */
20548 else if (collisions > 1)
20549 {
20550 rtx base;
20551
20552 collisions = 1;
20553
20554 base = part[0][nparts - 1];
20555
20556 /* Handle the case when the last part isn't valid for lea.
20557 Happens in 64-bit mode storing the 12-byte XFmode. */
20558 if (GET_MODE (base) != Pmode)
20559 base = gen_rtx_REG (Pmode, REGNO (base));
20560
20561 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20562 part[1][0] = replace_equiv_address (part[1][0], base);
20563 for (i = 1; i < nparts; i++)
20564 {
20565 tmp = plus_constant (base, UNITS_PER_WORD * i);
20566 part[1][i] = replace_equiv_address (part[1][i], tmp);
20567 }
20568 }
20569 }
20570
20571 if (push)
20572 {
20573 if (!TARGET_64BIT)
20574 {
20575 if (nparts == 3)
20576 {
20577 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20578 emit_insn (gen_addsi3 (stack_pointer_rtx,
20579 stack_pointer_rtx, GEN_INT (-4)));
20580 emit_move_insn (part[0][2], part[1][2]);
20581 }
20582 else if (nparts == 4)
20583 {
20584 emit_move_insn (part[0][3], part[1][3]);
20585 emit_move_insn (part[0][2], part[1][2]);
20586 }
20587 }
20588 else
20589 {
20590 /* In 64-bit mode we don't have a 32-bit push available. If the operand
20591 is a register, that is OK - we just use the larger counterpart. We
20592 also retype memory operands - this comes from the attempt to avoid a
20593 REX prefix when moving the second half of a TFmode value. */
20594 if (GET_MODE (part[1][1]) == SImode)
20595 {
20596 switch (GET_CODE (part[1][1]))
20597 {
20598 case MEM:
20599 part[1][1] = adjust_address (part[1][1], DImode, 0);
20600 break;
20601
20602 case REG:
20603 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20604 break;
20605
20606 default:
20607 gcc_unreachable ();
20608 }
20609
20610 if (GET_MODE (part[1][0]) == SImode)
20611 part[1][0] = part[1][1];
20612 }
20613 }
20614 emit_move_insn (part[0][1], part[1][1]);
20615 emit_move_insn (part[0][0], part[1][0]);
20616 return;
20617 }
20618
20619 /* Choose the correct order so as not to overwrite the source before it is copied. */
20620 if ((REG_P (part[0][0])
20621 && REG_P (part[1][1])
20622 && (REGNO (part[0][0]) == REGNO (part[1][1])
20623 || (nparts == 3
20624 && REGNO (part[0][0]) == REGNO (part[1][2]))
20625 || (nparts == 4
20626 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20627 || (collisions > 0
20628 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20629 {
20630 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20631 {
20632 operands[2 + i] = part[0][j];
20633 operands[6 + i] = part[1][j];
20634 }
20635 }
20636 else
20637 {
20638 for (i = 0; i < nparts; i++)
20639 {
20640 operands[2 + i] = part[0][i];
20641 operands[6 + i] = part[1][i];
20642 }
20643 }
20644
20645 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20646 if (optimize_insn_for_size_p ())
20647 {
20648 for (j = 0; j < nparts - 1; j++)
20649 if (CONST_INT_P (operands[6 + j])
20650 && operands[6 + j] != const0_rtx
20651 && REG_P (operands[2 + j]))
20652 for (i = j; i < nparts - 1; i++)
20653 if (CONST_INT_P (operands[7 + i])
20654 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20655 operands[7 + i] = operands[2 + j];
20656 }
20657
20658 for (i = 0; i < nparts; i++)
20659 emit_move_insn (operands[2 + i], operands[6 + i]);
20660
20661 return;
20662 }
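/* For example, a 32-bit DImode register-to-register move is split here into
   two SImode moves; when the low destination register overlaps a register
   used in the source address (or the high source half), the moves are
   emitted in reverse order so the source is not clobbered before it is
   read.  */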
20663
20664 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20665 left shift by a constant, either using a single shift or
20666 a sequence of add instructions. */
20667
20668 static void
20669 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20670 {
20671 rtx (*insn)(rtx, rtx, rtx);
20672
20673 if (count == 1
20674 || (count * ix86_cost->add <= ix86_cost->shift_const
20675 && !optimize_insn_for_size_p ()))
20676 {
20677 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20678 while (count-- > 0)
20679 emit_insn (insn (operand, operand, operand));
20680 }
20681 else
20682 {
20683 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20684 emit_insn (insn (operand, operand, GEN_INT (count)));
20685 }
20686 }
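/* E.g. a left shift by 1 of one of the halves is emitted as
   "addl %eax, %eax" rather than "sall $1, %eax" (register choice
   illustrative); larger constant shifts use repeated adds only when the
   cost model says the adds are no more expensive than a single constant
   shift and we are not optimizing for size.  */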
20687
20688 void
20689 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20690 {
20691 rtx (*gen_ashl3)(rtx, rtx, rtx);
20692 rtx (*gen_shld)(rtx, rtx, rtx);
20693 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20694
20695 rtx low[2], high[2];
20696 int count;
20697
20698 if (CONST_INT_P (operands[2]))
20699 {
20700 split_double_mode (mode, operands, 2, low, high);
20701 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20702
20703 if (count >= half_width)
20704 {
20705 emit_move_insn (high[0], low[1]);
20706 emit_move_insn (low[0], const0_rtx);
20707
20708 if (count > half_width)
20709 ix86_expand_ashl_const (high[0], count - half_width, mode);
20710 }
20711 else
20712 {
20713 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20714
20715 if (!rtx_equal_p (operands[0], operands[1]))
20716 emit_move_insn (operands[0], operands[1]);
20717
20718 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20719 ix86_expand_ashl_const (low[0], count, mode);
20720 }
20721 return;
20722 }
20723
20724 split_double_mode (mode, operands, 1, low, high);
20725
20726 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20727
20728 if (operands[1] == const1_rtx)
20729 {
20730 /* Assuming we've chosen QImode-capable registers, 1 << N
20731 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20732 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20733 {
20734 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20735
20736 ix86_expand_clear (low[0]);
20737 ix86_expand_clear (high[0]);
20738 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20739
20740 d = gen_lowpart (QImode, low[0]);
20741 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20742 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20743 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20744
20745 d = gen_lowpart (QImode, high[0]);
20746 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20747 s = gen_rtx_NE (QImode, flags, const0_rtx);
20748 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20749 }
20750
20751 /* Otherwise, we can get the same results by manually performing
20752 a bit extract operation on bit 5/6, and then performing the two
20753 shifts. The two methods of getting 0/1 into low/high are exactly
20754 the same size. Avoiding the shift in the bit extract case helps
20755 pentium4 a bit; no one else seems to care much either way. */
20756 else
20757 {
20758 enum machine_mode half_mode;
20759 rtx (*gen_lshr3)(rtx, rtx, rtx);
20760 rtx (*gen_and3)(rtx, rtx, rtx);
20761 rtx (*gen_xor3)(rtx, rtx, rtx);
20762 HOST_WIDE_INT bits;
20763 rtx x;
20764
20765 if (mode == DImode)
20766 {
20767 half_mode = SImode;
20768 gen_lshr3 = gen_lshrsi3;
20769 gen_and3 = gen_andsi3;
20770 gen_xor3 = gen_xorsi3;
20771 bits = 5;
20772 }
20773 else
20774 {
20775 half_mode = DImode;
20776 gen_lshr3 = gen_lshrdi3;
20777 gen_and3 = gen_anddi3;
20778 gen_xor3 = gen_xordi3;
20779 bits = 6;
20780 }
20781
20782 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20783 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20784 else
20785 x = gen_lowpart (half_mode, operands[2]);
20786 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20787
20788 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20789 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20790 emit_move_insn (low[0], high[0]);
20791 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20792 }
20793
20794 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20795 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20796 return;
20797 }
20798
20799 if (operands[1] == constm1_rtx)
20800 {
20801 /* For -1 << N, we can avoid the shld instruction, because we
20802 know that we're shifting 0...31/63 ones into a -1. */
20803 emit_move_insn (low[0], constm1_rtx);
20804 if (optimize_insn_for_size_p ())
20805 emit_move_insn (high[0], low[0]);
20806 else
20807 emit_move_insn (high[0], constm1_rtx);
20808 }
20809 else
20810 {
20811 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20812
20813 if (!rtx_equal_p (operands[0], operands[1]))
20814 emit_move_insn (operands[0], operands[1]);
20815
20816 split_double_mode (mode, operands, 1, low, high);
20817 emit_insn (gen_shld (high[0], low[0], operands[2]));
20818 }
20819
20820 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20821
20822 if (TARGET_CMOVE && scratch)
20823 {
20824 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20825 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20826
20827 ix86_expand_clear (scratch);
20828 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20829 }
20830 else
20831 {
20832 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20833 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20834
20835 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20836 }
20837 }
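/* For a variable DImode shift on ia32 with the value in %edx:%eax and the
   count in %cl (registers illustrative), the code above emits roughly:

	shldl	%cl, %eax, %edx
	sall	%cl, %eax

   followed by a fixup for counts that may be >= 32: a cmov pair using the
   cleared scratch register when cmov is available, otherwise a
   conditional-jump sequence.  */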
20838
20839 void
20840 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20841 {
20842 rtx (*gen_ashr3)(rtx, rtx, rtx)
20843 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20844 rtx (*gen_shrd)(rtx, rtx, rtx);
20845 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20846
20847 rtx low[2], high[2];
20848 int count;
20849
20850 if (CONST_INT_P (operands[2]))
20851 {
20852 split_double_mode (mode, operands, 2, low, high);
20853 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20854
20855 if (count == GET_MODE_BITSIZE (mode) - 1)
20856 {
20857 emit_move_insn (high[0], high[1]);
20858 emit_insn (gen_ashr3 (high[0], high[0],
20859 GEN_INT (half_width - 1)));
20860 emit_move_insn (low[0], high[0]);
20861
20862 }
20863 else if (count >= half_width)
20864 {
20865 emit_move_insn (low[0], high[1]);
20866 emit_move_insn (high[0], low[0]);
20867 emit_insn (gen_ashr3 (high[0], high[0],
20868 GEN_INT (half_width - 1)));
20869
20870 if (count > half_width)
20871 emit_insn (gen_ashr3 (low[0], low[0],
20872 GEN_INT (count - half_width)));
20873 }
20874 else
20875 {
20876 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20877
20878 if (!rtx_equal_p (operands[0], operands[1]))
20879 emit_move_insn (operands[0], operands[1]);
20880
20881 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20882 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20883 }
20884 }
20885 else
20886 {
20887 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20888
20889 if (!rtx_equal_p (operands[0], operands[1]))
20890 emit_move_insn (operands[0], operands[1]);
20891
20892 split_double_mode (mode, operands, 1, low, high);
20893
20894 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20895 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20896
20897 if (TARGET_CMOVE && scratch)
20898 {
20899 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20900 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20901
20902 emit_move_insn (scratch, high[0]);
20903 emit_insn (gen_ashr3 (scratch, scratch,
20904 GEN_INT (half_width - 1)));
20905 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20906 scratch));
20907 }
20908 else
20909 {
20910 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20911 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20912
20913 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20914 }
20915 }
20916 }
20917
20918 void
20919 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20920 {
20921 rtx (*gen_lshr3)(rtx, rtx, rtx)
20922 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20923 rtx (*gen_shrd)(rtx, rtx, rtx);
20924 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20925
20926 rtx low[2], high[2];
20927 int count;
20928
20929 if (CONST_INT_P (operands[2]))
20930 {
20931 split_double_mode (mode, operands, 2, low, high);
20932 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20933
20934 if (count >= half_width)
20935 {
20936 emit_move_insn (low[0], high[1]);
20937 ix86_expand_clear (high[0]);
20938
20939 if (count > half_width)
20940 emit_insn (gen_lshr3 (low[0], low[0],
20941 GEN_INT (count - half_width)));
20942 }
20943 else
20944 {
20945 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20946
20947 if (!rtx_equal_p (operands[0], operands[1]))
20948 emit_move_insn (operands[0], operands[1]);
20949
20950 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20951 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20952 }
20953 }
20954 else
20955 {
20956 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20957
20958 if (!rtx_equal_p (operands[0], operands[1]))
20959 emit_move_insn (operands[0], operands[1]);
20960
20961 split_double_mode (mode, operands, 1, low, high);
20962
20963 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20964 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20965
20966 if (TARGET_CMOVE && scratch)
20967 {
20968 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20969 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20970
20971 ix86_expand_clear (scratch);
20972 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20973 scratch));
20974 }
20975 else
20976 {
20977 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20978 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20979
20980 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20981 }
20982 }
20983 }
20984
20985 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
20986 static void
20987 predict_jump (int prob)
20988 {
20989 rtx insn = get_last_insn ();
20990 gcc_assert (JUMP_P (insn));
20991 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20992 }
20993
20994 /* Helper function for the string operations below. Test whether the bits
20995 of VARIABLE selected by VALUE are zero; if so, jump to the returned label. */
20996 static rtx
20997 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20998 {
20999 rtx label = gen_label_rtx ();
21000 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21001 if (GET_MODE (variable) == DImode)
21002 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21003 else
21004 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21005 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21006 1, label);
21007 if (epilogue)
21008 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21009 else
21010 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21011 return label;
21012 }
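/* E.g. ix86_expand_aligntest (count, 4, true) emits an AND of COUNT with 4
   followed by a compare-and-jump that later passes usually combine into
   "testl $4, %ecx; je .Lskip" (register illustrative); the caller places
   the 4-byte copy/store between this point and the returned label.  */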
21013
21014 /* Decrease COUNTREG by VALUE. */
21015 static void
21016 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21017 {
21018 rtx (*gen_add)(rtx, rtx, rtx)
21019 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21020
21021 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21022 }
21023
21024 /* Zero extend EXP, which may be SImode, into a Pmode register. */
21025 rtx
21026 ix86_zero_extend_to_Pmode (rtx exp)
21027 {
21028 rtx r;
21029 if (GET_MODE (exp) == VOIDmode)
21030 return force_reg (Pmode, exp);
21031 if (GET_MODE (exp) == Pmode)
21032 return copy_to_mode_reg (Pmode, exp);
21033 r = gen_reg_rtx (Pmode);
21034 emit_insn (gen_zero_extendsidi2 (r, exp));
21035 return r;
21036 }
21037
21038 /* Divide COUNTREG by SCALE. */
21039 static rtx
21040 scale_counter (rtx countreg, int scale)
21041 {
21042 rtx sc;
21043
21044 if (scale == 1)
21045 return countreg;
21046 if (CONST_INT_P (countreg))
21047 return GEN_INT (INTVAL (countreg) / scale);
21048 gcc_assert (REG_P (countreg));
21049
21050 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21051 GEN_INT (exact_log2 (scale)),
21052 NULL, 1, OPTAB_DIRECT);
21053 return sc;
21054 }
21055
21056 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21057 DImode for constant loop counts. */
21058
21059 static enum machine_mode
21060 counter_mode (rtx count_exp)
21061 {
21062 if (GET_MODE (count_exp) != VOIDmode)
21063 return GET_MODE (count_exp);
21064 if (!CONST_INT_P (count_exp))
21065 return Pmode;
21066 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21067 return DImode;
21068 return SImode;
21069 }
21070
21071 /* When SRCPTR is non-NULL, output a simple loop that moves the memory
21072 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21073 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21074 the equivalent loop that sets memory to VALUE (assumed to be in MODE).
21075
21076 The size is rounded down to a whole number of chunks moved at once.
21077 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
21078
21079
21080 static void
21081 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21082 rtx destptr, rtx srcptr, rtx value,
21083 rtx count, enum machine_mode mode, int unroll,
21084 int expected_size)
21085 {
21086 rtx out_label, top_label, iter, tmp;
21087 enum machine_mode iter_mode = counter_mode (count);
21088 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21089 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21090 rtx size;
21091 rtx x_addr;
21092 rtx y_addr;
21093 int i;
21094
21095 top_label = gen_label_rtx ();
21096 out_label = gen_label_rtx ();
21097 iter = gen_reg_rtx (iter_mode);
21098
21099 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21100 NULL, 1, OPTAB_DIRECT);
21101 /* Those two should combine. */
21102 if (piece_size == const1_rtx)
21103 {
21104 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21105 true, out_label);
21106 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21107 }
21108 emit_move_insn (iter, const0_rtx);
21109
21110 emit_label (top_label);
21111
21112 tmp = convert_modes (Pmode, iter_mode, iter, true);
21113 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21114 destmem = change_address (destmem, mode, x_addr);
21115
21116 if (srcmem)
21117 {
21118 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21119 srcmem = change_address (srcmem, mode, y_addr);
21120
21121 /* When unrolling for chips that reorder memory reads and writes,
21122 we can save registers by using a single temporary.
21123 Also, using 4 temporaries is overkill in 32-bit mode. */
21124 if (!TARGET_64BIT && 0)
21125 {
21126 for (i = 0; i < unroll; i++)
21127 {
21128 if (i)
21129 {
21130 destmem =
21131 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21132 srcmem =
21133 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21134 }
21135 emit_move_insn (destmem, srcmem);
21136 }
21137 }
21138 else
21139 {
21140 rtx tmpreg[4];
21141 gcc_assert (unroll <= 4);
21142 for (i = 0; i < unroll; i++)
21143 {
21144 tmpreg[i] = gen_reg_rtx (mode);
21145 if (i)
21146 {
21147 srcmem =
21148 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21149 }
21150 emit_move_insn (tmpreg[i], srcmem);
21151 }
21152 for (i = 0; i < unroll; i++)
21153 {
21154 if (i)
21155 {
21156 destmem =
21157 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21158 }
21159 emit_move_insn (destmem, tmpreg[i]);
21160 }
21161 }
21162 }
21163 else
21164 for (i = 0; i < unroll; i++)
21165 {
21166 if (i)
21167 destmem =
21168 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21169 emit_move_insn (destmem, value);
21170 }
21171
21172 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21173 true, OPTAB_LIB_WIDEN);
21174 if (tmp != iter)
21175 emit_move_insn (iter, tmp);
21176
21177 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21178 true, top_label);
21179 if (expected_size != -1)
21180 {
21181 expected_size /= GET_MODE_SIZE (mode) * unroll;
21182 if (expected_size == 0)
21183 predict_jump (0);
21184 else if (expected_size > REG_BR_PROB_BASE)
21185 predict_jump (REG_BR_PROB_BASE - 1);
21186 else
21187 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21188 }
21189 else
21190 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21191 iter = ix86_zero_extend_to_Pmode (iter);
21192 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21193 true, OPTAB_LIB_WIDEN);
21194 if (tmp != destptr)
21195 emit_move_insn (destptr, tmp);
21196 if (srcptr)
21197 {
21198 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21199 true, OPTAB_LIB_WIDEN);
21200 if (tmp != srcptr)
21201 emit_move_insn (srcptr, tmp);
21202 }
21203 emit_label (out_label);
21204 }
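/* The generated loop has roughly this shape (pseudo code; piece_size is
   GET_MODE_SIZE (mode) * unroll):

	size = count & ~(piece_size - 1);
	iter = 0;
     top:
	copy/set piece_size bytes at dest + iter (and src + iter);
	iter += piece_size;
	if (iter < size) goto top;
	dest += iter;  src += iter;
 */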
21205
21206 /* Output a "rep; mov" instruction.
21207 Arguments have the same meaning as for the previous function. */
21208 static void
21209 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21210 rtx destptr, rtx srcptr,
21211 rtx count,
21212 enum machine_mode mode)
21213 {
21214 rtx destexp;
21215 rtx srcexp;
21216 rtx countreg;
21217 HOST_WIDE_INT rounded_count;
21218
21219 /* If the size is known and divisible by 4, rep movsl is shorter than rep movsb. */
21220 if (mode == QImode && CONST_INT_P (count)
21221 && !(INTVAL (count) & 3))
21222 mode = SImode;
21223
21224 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21225 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21226 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21227 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21228 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21229 if (mode != QImode)
21230 {
21231 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21232 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21233 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21234 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21235 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21236 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21237 }
21238 else
21239 {
21240 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21241 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21242 }
21243 if (CONST_INT_P (count))
21244 {
21245 rounded_count = (INTVAL (count)
21246 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21247 destmem = shallow_copy_rtx (destmem);
21248 srcmem = shallow_copy_rtx (srcmem);
21249 set_mem_size (destmem, rounded_count);
21250 set_mem_size (srcmem, rounded_count);
21251 }
21252 else
21253 {
21254 if (MEM_SIZE_KNOWN_P (destmem))
21255 clear_mem_size (destmem);
21256 if (MEM_SIZE_KNOWN_P (srcmem))
21257 clear_mem_size (srcmem);
21258 }
21259 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21260 destexp, srcexp));
21261 }
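/* E.g. for a known count of 32 and SImode chunks this degenerates to
   loading 8 into %ecx and issuing "rep movsl"; DESTEXP/SRCEXP describe the
   final values of %edi/%esi so that the RTL correctly models the pointer
   updates performed by the string instruction.  */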
21262
21263 /* Output a "rep; stos" instruction.
21264 Arguments have the same meaning as for the previous function. */
21265 static void
21266 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21267 rtx count, enum machine_mode mode,
21268 rtx orig_value)
21269 {
21270 rtx destexp;
21271 rtx countreg;
21272 HOST_WIDE_INT rounded_count;
21273
21274 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21275 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21276 value = force_reg (mode, gen_lowpart (mode, value));
21277 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21278 if (mode != QImode)
21279 {
21280 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21281 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21282 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21283 }
21284 else
21285 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21286 if (orig_value == const0_rtx && CONST_INT_P (count))
21287 {
21288 rounded_count = (INTVAL (count)
21289 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21290 destmem = shallow_copy_rtx (destmem);
21291 set_mem_size (destmem, rounded_count);
21292 }
21293 else if (MEM_SIZE_KNOWN_P (destmem))
21294 clear_mem_size (destmem);
21295 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21296 }
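/* Similarly, clearing a block whose size is a known multiple of 4 with
   SImode chunks typically becomes "rep stosl" with the byte count / 4 in
   %ecx and the caller-replicated value in %eax (registers illustrative).  */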
21297
21298 static void
21299 emit_strmov (rtx destmem, rtx srcmem,
21300 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21301 {
21302 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21303 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21304 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21305 }
21306
21307 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21308 static void
21309 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21310 rtx destptr, rtx srcptr, rtx count, int max_size)
21311 {
21312 rtx src, dest;
21313 if (CONST_INT_P (count))
21314 {
21315 HOST_WIDE_INT countval = INTVAL (count);
21316 int offset = 0;
21317
21318 if ((countval & 0x10) && max_size > 16)
21319 {
21320 if (TARGET_64BIT)
21321 {
21322 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21323 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21324 }
21325 else
21326 gcc_unreachable ();
21327 offset += 16;
21328 }
21329 if ((countval & 0x08) && max_size > 8)
21330 {
21331 if (TARGET_64BIT)
21332 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21333 else
21334 {
21335 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21336 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21337 }
21338 offset += 8;
21339 }
21340 if ((countval & 0x04) && max_size > 4)
21341 {
21342 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21343 offset += 4;
21344 }
21345 if ((countval & 0x02) && max_size > 2)
21346 {
21347 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21348 offset += 2;
21349 }
21350 if ((countval & 0x01) && max_size > 1)
21351 {
21352 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21353 offset += 1;
21354 }
21355 return;
21356 }
21357 if (max_size > 8)
21358 {
21359 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21360 count, 1, OPTAB_DIRECT);
21361 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21362 count, QImode, 1, 4);
21363 return;
21364 }
21365
21366 /* When single-instruction stringops are available, we can cheaply advance
21367 the dest and src pointers. Otherwise we save code size by maintaining an
21368 offset (zero is readily available from the preceding rep operation) and
21369 using x86 addressing modes. */
21370 if (TARGET_SINGLE_STRINGOP)
21371 {
21372 if (max_size > 4)
21373 {
21374 rtx label = ix86_expand_aligntest (count, 4, true);
21375 src = change_address (srcmem, SImode, srcptr);
21376 dest = change_address (destmem, SImode, destptr);
21377 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21378 emit_label (label);
21379 LABEL_NUSES (label) = 1;
21380 }
21381 if (max_size > 2)
21382 {
21383 rtx label = ix86_expand_aligntest (count, 2, true);
21384 src = change_address (srcmem, HImode, srcptr);
21385 dest = change_address (destmem, HImode, destptr);
21386 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21387 emit_label (label);
21388 LABEL_NUSES (label) = 1;
21389 }
21390 if (max_size > 1)
21391 {
21392 rtx label = ix86_expand_aligntest (count, 1, true);
21393 src = change_address (srcmem, QImode, srcptr);
21394 dest = change_address (destmem, QImode, destptr);
21395 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21396 emit_label (label);
21397 LABEL_NUSES (label) = 1;
21398 }
21399 }
21400 else
21401 {
21402 rtx offset = force_reg (Pmode, const0_rtx);
21403 rtx tmp;
21404
21405 if (max_size > 4)
21406 {
21407 rtx label = ix86_expand_aligntest (count, 4, true);
21408 src = change_address (srcmem, SImode, srcptr);
21409 dest = change_address (destmem, SImode, destptr);
21410 emit_move_insn (dest, src);
21411 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21412 true, OPTAB_LIB_WIDEN);
21413 if (tmp != offset)
21414 emit_move_insn (offset, tmp);
21415 emit_label (label);
21416 LABEL_NUSES (label) = 1;
21417 }
21418 if (max_size > 2)
21419 {
21420 rtx label = ix86_expand_aligntest (count, 2, true);
21421 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21422 src = change_address (srcmem, HImode, tmp);
21423 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21424 dest = change_address (destmem, HImode, tmp);
21425 emit_move_insn (dest, src);
21426 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21427 true, OPTAB_LIB_WIDEN);
21428 if (tmp != offset)
21429 emit_move_insn (offset, tmp);
21430 emit_label (label);
21431 LABEL_NUSES (label) = 1;
21432 }
21433 if (max_size > 1)
21434 {
21435 rtx label = ix86_expand_aligntest (count, 1, true);
21436 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21437 src = change_address (srcmem, QImode, tmp);
21438 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21439 dest = change_address (destmem, QImode, tmp);
21440 emit_move_insn (dest, src);
21441 emit_label (label);
21442 LABEL_NUSES (label) = 1;
21443 }
21444 }
21445 }
21446
21447 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21448 static void
21449 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21450 rtx count, int max_size)
21451 {
21452 count =
21453 expand_simple_binop (counter_mode (count), AND, count,
21454 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21455 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21456 gen_lowpart (QImode, value), count, QImode,
21457 1, max_size / 2);
21458 }
21459
21460 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21461 static void
21462 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21463 {
21464 rtx dest;
21465
21466 if (CONST_INT_P (count))
21467 {
21468 HOST_WIDE_INT countval = INTVAL (count);
21469 int offset = 0;
21470
21471 if ((countval & 0x10) && max_size > 16)
21472 {
21473 if (TARGET_64BIT)
21474 {
21475 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21476 emit_insn (gen_strset (destptr, dest, value));
21477 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21478 emit_insn (gen_strset (destptr, dest, value));
21479 }
21480 else
21481 gcc_unreachable ();
21482 offset += 16;
21483 }
21484 if ((countval & 0x08) && max_size > 8)
21485 {
21486 if (TARGET_64BIT)
21487 {
21488 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21489 emit_insn (gen_strset (destptr, dest, value));
21490 }
21491 else
21492 {
21493 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21494 emit_insn (gen_strset (destptr, dest, value));
21495 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21496 emit_insn (gen_strset (destptr, dest, value));
21497 }
21498 offset += 8;
21499 }
21500 if ((countval & 0x04) && max_size > 4)
21501 {
21502 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21503 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21504 offset += 4;
21505 }
21506 if ((countval & 0x02) && max_size > 2)
21507 {
21508 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21509 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21510 offset += 2;
21511 }
21512 if ((countval & 0x01) && max_size > 1)
21513 {
21514 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21515 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21516 offset += 1;
21517 }
21518 return;
21519 }
21520 if (max_size > 32)
21521 {
21522 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21523 return;
21524 }
21525 if (max_size > 16)
21526 {
21527 rtx label = ix86_expand_aligntest (count, 16, true);
21528 if (TARGET_64BIT)
21529 {
21530 dest = change_address (destmem, DImode, destptr);
21531 emit_insn (gen_strset (destptr, dest, value));
21532 emit_insn (gen_strset (destptr, dest, value));
21533 }
21534 else
21535 {
21536 dest = change_address (destmem, SImode, destptr);
21537 emit_insn (gen_strset (destptr, dest, value));
21538 emit_insn (gen_strset (destptr, dest, value));
21539 emit_insn (gen_strset (destptr, dest, value));
21540 emit_insn (gen_strset (destptr, dest, value));
21541 }
21542 emit_label (label);
21543 LABEL_NUSES (label) = 1;
21544 }
21545 if (max_size > 8)
21546 {
21547 rtx label = ix86_expand_aligntest (count, 8, true);
21548 if (TARGET_64BIT)
21549 {
21550 dest = change_address (destmem, DImode, destptr);
21551 emit_insn (gen_strset (destptr, dest, value));
21552 }
21553 else
21554 {
21555 dest = change_address (destmem, SImode, destptr);
21556 emit_insn (gen_strset (destptr, dest, value));
21557 emit_insn (gen_strset (destptr, dest, value));
21558 }
21559 emit_label (label);
21560 LABEL_NUSES (label) = 1;
21561 }
21562 if (max_size > 4)
21563 {
21564 rtx label = ix86_expand_aligntest (count, 4, true);
21565 dest = change_address (destmem, SImode, destptr);
21566 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21567 emit_label (label);
21568 LABEL_NUSES (label) = 1;
21569 }
21570 if (max_size > 2)
21571 {
21572 rtx label = ix86_expand_aligntest (count, 2, true);
21573 dest = change_address (destmem, HImode, destptr);
21574 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21575 emit_label (label);
21576 LABEL_NUSES (label) = 1;
21577 }
21578 if (max_size > 1)
21579 {
21580 rtx label = ix86_expand_aligntest (count, 1, true);
21581 dest = change_address (destmem, QImode, destptr);
21582 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21583 emit_label (label);
21584 LABEL_NUSES (label) = 1;
21585 }
21586 }
21587
21588 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21589 by ALIGN, to DESIRED_ALIGNMENT. */
21590 static void
21591 expand_movmem_prologue (rtx destmem, rtx srcmem,
21592 rtx destptr, rtx srcptr, rtx count,
21593 int align, int desired_alignment)
21594 {
21595 if (align <= 1 && desired_alignment > 1)
21596 {
21597 rtx label = ix86_expand_aligntest (destptr, 1, false);
21598 srcmem = change_address (srcmem, QImode, srcptr);
21599 destmem = change_address (destmem, QImode, destptr);
21600 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21601 ix86_adjust_counter (count, 1);
21602 emit_label (label);
21603 LABEL_NUSES (label) = 1;
21604 }
21605 if (align <= 2 && desired_alignment > 2)
21606 {
21607 rtx label = ix86_expand_aligntest (destptr, 2, false);
21608 srcmem = change_address (srcmem, HImode, srcptr);
21609 destmem = change_address (destmem, HImode, destptr);
21610 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21611 ix86_adjust_counter (count, 2);
21612 emit_label (label);
21613 LABEL_NUSES (label) = 1;
21614 }
21615 if (align <= 4 && desired_alignment > 4)
21616 {
21617 rtx label = ix86_expand_aligntest (destptr, 4, false);
21618 srcmem = change_address (srcmem, SImode, srcptr);
21619 destmem = change_address (destmem, SImode, destptr);
21620 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21621 ix86_adjust_counter (count, 4);
21622 emit_label (label);
21623 LABEL_NUSES (label) = 1;
21624 }
21625 gcc_assert (desired_alignment <= 8);
21626 }
21627
21628 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21629 ALIGN_BYTES is how many bytes need to be copied. */
21630 static rtx
21631 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21632 int desired_align, int align_bytes)
21633 {
21634 rtx src = *srcp;
21635 rtx orig_dst = dst;
21636 rtx orig_src = src;
21637 int off = 0;
21638 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21639 if (src_align_bytes >= 0)
21640 src_align_bytes = desired_align - src_align_bytes;
21641 if (align_bytes & 1)
21642 {
21643 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21644 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21645 off = 1;
21646 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21647 }
21648 if (align_bytes & 2)
21649 {
21650 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21651 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21652 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21653 set_mem_align (dst, 2 * BITS_PER_UNIT);
21654 if (src_align_bytes >= 0
21655 && (src_align_bytes & 1) == (align_bytes & 1)
21656 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21657 set_mem_align (src, 2 * BITS_PER_UNIT);
21658 off = 2;
21659 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21660 }
21661 if (align_bytes & 4)
21662 {
21663 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21664 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21665 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21666 set_mem_align (dst, 4 * BITS_PER_UNIT);
21667 if (src_align_bytes >= 0)
21668 {
21669 unsigned int src_align = 0;
21670 if ((src_align_bytes & 3) == (align_bytes & 3))
21671 src_align = 4;
21672 else if ((src_align_bytes & 1) == (align_bytes & 1))
21673 src_align = 2;
21674 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21675 set_mem_align (src, src_align * BITS_PER_UNIT);
21676 }
21677 off = 4;
21678 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21679 }
21680 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21681 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21682 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21683 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21684 if (src_align_bytes >= 0)
21685 {
21686 unsigned int src_align = 0;
21687 if ((src_align_bytes & 7) == (align_bytes & 7))
21688 src_align = 8;
21689 else if ((src_align_bytes & 3) == (align_bytes & 3))
21690 src_align = 4;
21691 else if ((src_align_bytes & 1) == (align_bytes & 1))
21692 src_align = 2;
21693 if (src_align > (unsigned int) desired_align)
21694 src_align = desired_align;
21695 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21696 set_mem_align (src, src_align * BITS_PER_UNIT);
21697 }
21698 if (MEM_SIZE_KNOWN_P (orig_dst))
21699 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21700 if (MEM_SIZE_KNOWN_P (orig_src))
21701 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21702 *srcp = src;
21703 return dst;
21704 }
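/* Worked example: with desired_align == 4 and align_bytes == 3 the code
   above emits one QImode and one HImode copy, advancing DESTREG/SRCREG by
   3 bytes, after which the remaining block is known to be 4-byte aligned
   (and the recorded source alignment is raised when it happens to match).  */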
21705
21706 /* Store enough bytes at DEST to align DEST, known to be aligned by ALIGN,
21707 to DESIRED_ALIGNMENT. */
21708 static void
21709 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21710 int align, int desired_alignment)
21711 {
21712 if (align <= 1 && desired_alignment > 1)
21713 {
21714 rtx label = ix86_expand_aligntest (destptr, 1, false);
21715 destmem = change_address (destmem, QImode, destptr);
21716 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21717 ix86_adjust_counter (count, 1);
21718 emit_label (label);
21719 LABEL_NUSES (label) = 1;
21720 }
21721 if (align <= 2 && desired_alignment > 2)
21722 {
21723 rtx label = ix86_expand_aligntest (destptr, 2, false);
21724 destmem = change_address (destmem, HImode, destptr);
21725 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21726 ix86_adjust_counter (count, 2);
21727 emit_label (label);
21728 LABEL_NUSES (label) = 1;
21729 }
21730 if (align <= 4 && desired_alignment > 4)
21731 {
21732 rtx label = ix86_expand_aligntest (destptr, 4, false);
21733 destmem = change_address (destmem, SImode, destptr);
21734 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21735 ix86_adjust_counter (count, 4);
21736 emit_label (label);
21737 LABEL_NUSES (label) = 1;
21738 }
21739 gcc_assert (desired_alignment <= 8);
21740 }
21741
21742 /* Store enough bytes at DST, known to be aligned by ALIGN, to align it to
21743 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21744 static rtx
21745 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21746 int desired_align, int align_bytes)
21747 {
21748 int off = 0;
21749 rtx orig_dst = dst;
21750 if (align_bytes & 1)
21751 {
21752 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21753 off = 1;
21754 emit_insn (gen_strset (destreg, dst,
21755 gen_lowpart (QImode, value)));
21756 }
21757 if (align_bytes & 2)
21758 {
21759 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21760 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21761 set_mem_align (dst, 2 * BITS_PER_UNIT);
21762 off = 2;
21763 emit_insn (gen_strset (destreg, dst,
21764 gen_lowpart (HImode, value)));
21765 }
21766 if (align_bytes & 4)
21767 {
21768 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21769 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21770 set_mem_align (dst, 4 * BITS_PER_UNIT);
21771 off = 4;
21772 emit_insn (gen_strset (destreg, dst,
21773 gen_lowpart (SImode, value)));
21774 }
21775 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21776 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21777 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21778 if (MEM_SIZE_KNOWN_P (orig_dst))
21779 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21780 return dst;
21781 }
21782
21783 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21784 static enum stringop_alg
21785 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21786 int *dynamic_check)
21787 {
21788 const struct stringop_algs * algs;
21789 bool optimize_for_speed;
21790 /* Algorithms using the rep prefix want at least edi and ecx;
21791 additionally, memset wants eax and memcpy wants esi. Don't
21792 consider such algorithms if the user has appropriated those
21793 registers for their own purposes. */
21794 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21795 || (memset
21796 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21797
21798 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21799 || (alg != rep_prefix_1_byte \
21800 && alg != rep_prefix_4_byte \
21801 && alg != rep_prefix_8_byte))
21802 const struct processor_costs *cost;
21803
21804 /* Even if the string operation call is cold, we still might spend a lot
21805 of time processing large blocks. */
21806 if (optimize_function_for_size_p (cfun)
21807 || (optimize_insn_for_size_p ()
21808 && expected_size != -1 && expected_size < 256))
21809 optimize_for_speed = false;
21810 else
21811 optimize_for_speed = true;
21812
21813 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21814
21815 *dynamic_check = -1;
21816 if (memset)
21817 algs = &cost->memset[TARGET_64BIT != 0];
21818 else
21819 algs = &cost->memcpy[TARGET_64BIT != 0];
21820 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21821 return ix86_stringop_alg;
21822 /* rep; movq or rep; movl is the smallest variant. */
21823 else if (!optimize_for_speed)
21824 {
21825 if (!count || (count & 3))
21826 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21827 else
21828 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21829 }
21830 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
21832 else if (expected_size != -1 && expected_size < 4)
21833 return loop_1_byte;
21834 else if (expected_size != -1)
21835 {
21836 unsigned int i;
21837 enum stringop_alg alg = libcall;
21838 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21839 {
21840 /* We get here if the algorithms that were not libcall-based
21841 were rep-prefix based and we are unable to use rep prefixes
21842 based on global register usage. Break out of the loop and
21843 use the heuristic below. */
21844 if (algs->size[i].max == 0)
21845 break;
21846 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21847 {
21848 enum stringop_alg candidate = algs->size[i].alg;
21849
21850 if (candidate != libcall && ALG_USABLE_P (candidate))
21851 alg = candidate;
21852 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21853 last non-libcall inline algorithm. */
21854 if (TARGET_INLINE_ALL_STRINGOPS)
21855 {
21856 /* When the current size is best to be copied by a libcall,
21857 but we are still forced to inline, run the heuristic below
21858 that will pick code for medium sized blocks. */
21859 if (alg != libcall)
21860 return alg;
21861 break;
21862 }
21863 else if (ALG_USABLE_P (candidate))
21864 return candidate;
21865 }
21866 }
21867 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21868 }
21869 /* When asked to inline the call anyway, try to pick a meaningful choice.
21870 We look for the maximal size of block that is faster to copy by hand,
21871 and handle blocks of at most that size, guessing that the average size
21872 will be roughly half of the block.
21873
21874 If this turns out to be bad, we might simply specify the preferred
21875 choice in ix86_costs. */
21876 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21877 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21878 {
21879 int max = -1;
21880 enum stringop_alg alg;
21881 int i;
21882 bool any_alg_usable_p = true;
21883
21884 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21885 {
21886 enum stringop_alg candidate = algs->size[i].alg;
21887 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21888
21889 if (candidate != libcall && candidate
21890 && ALG_USABLE_P (candidate))
21891 max = algs->size[i].max;
21892 }
21893 /* If there aren't any usable algorithms, then recursing on
21894 smaller sizes isn't going to find anything. Just return the
21895 simple byte-at-a-time copy loop. */
21896 if (!any_alg_usable_p)
21897 {
21898 /* Pick something reasonable. */
21899 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21900 *dynamic_check = 128;
21901 return loop_1_byte;
21902 }
21903 if (max == -1)
21904 max = 4096;
21905 alg = decide_alg (count, max / 2, memset, dynamic_check);
21906 gcc_assert (*dynamic_check == -1);
21907 gcc_assert (alg != libcall);
21908 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21909 *dynamic_check = max;
21910 return alg;
21911 }
21912 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21913 #undef ALG_USABLE_P
21914 }
21915
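/* For illustration only (hypothetical numbers; the real tables live in the
   per-processor cost structures): an entry such as

     static const struct stringop_algs example_memcpy
       = {libcall, {{24, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}};

   would make decide_alg pick loop for expected sizes up to 24 bytes,
   rep_prefix_4_byte for sizes up to 8192 bytes, and fall back to the
   libcall (or to the TARGET_INLINE_ALL_STRINGOPS heuristic above) for
   larger or unknown sizes.  */
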
21916 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21917 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21918 static int
21919 decide_alignment (int align,
21920 enum stringop_alg alg,
21921 int expected_size)
21922 {
21923 int desired_align = 0;
21924 switch (alg)
21925 {
21926 case no_stringop:
21927 gcc_unreachable ();
21928 case loop:
21929 case unrolled_loop:
21930 desired_align = GET_MODE_SIZE (Pmode);
21931 break;
21932 case rep_prefix_8_byte:
21933 desired_align = 8;
21934 break;
21935 case rep_prefix_4_byte:
21936 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21937 copying a whole cacheline at once. */
21938 if (TARGET_PENTIUMPRO)
21939 desired_align = 8;
21940 else
21941 desired_align = 4;
21942 break;
21943 case rep_prefix_1_byte:
21944 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21945 copying a whole cacheline at once. */
21946 if (TARGET_PENTIUMPRO)
21947 desired_align = 8;
21948 else
21949 desired_align = 1;
21950 break;
21951 case loop_1_byte:
21952 desired_align = 1;
21953 break;
21954 case libcall:
21955 return 0;
21956 }
21957
21958 if (optimize_size)
21959 desired_align = 1;
21960 if (desired_align < align)
21961 desired_align = align;
21962 if (expected_size != -1 && expected_size < 4)
21963 desired_align = align;
21964 return desired_align;
21965 }
21966
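/* For example, with ALIGN == 1, ALG == unrolled_loop and 64-bit Pmode,
   DESIRED_ALIGN comes out as GET_MODE_SIZE (Pmode) == 8, unless
   optimize_size or a tiny known EXPECTED_SIZE drops it back to ALIGN.  */
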
21967 /* Return the smallest power of 2 greater than VAL. */
21968 static int
21969 smallest_pow2_greater_than (int val)
21970 {
21971 int ret = 1;
21972 while (ret <= val)
21973 ret <<= 1;
21974 return ret;
21975 }
21976
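/* For example, smallest_pow2_greater_than (15) == 16 and
   smallest_pow2_greater_than (16) == 32; a value that is already a power
   of two is bumped to the next one.  */
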
21977 /* Expand string move (memcpy) operation. Use i386 string operations
21978 when profitable. expand_setmem contains similar code. The code
21979 depends upon architecture, block size and alignment, but always has
21980 the same overall structure:
21981
21982 1) Prologue guard: a conditional that jumps to the epilogue for small
21983 blocks that can be handled by the epilogue alone. This is faster,
21984 but also needed for correctness, since the prologue assumes the block
21985 is larger than the desired alignment.
21986
21987 Optional dynamic check for size and libcall for large
21988 blocks is emitted here too, with -minline-stringops-dynamically.
21989
21990 2) Prologue: copy first few bytes in order to get destination
21991 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21992 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21993 copied. We emit either a jump tree on power of two sized
21994 blocks, or a byte loop.
21995
21996 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21997 with specified algorithm.
21998
21999 4) Epilogue: code copying tail of the block that is too small to be
22000 handled by main body (or up to size guarded by prologue guard). */
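
/* Schematically, ignoring target-specific details, the emitted code has
   roughly this shape (a sketch, not a literal expansion):

     1)  if (count < epilogue_size_needed) goto epilogue;
         optionally: if (count >= dynamic_check) { library call; done; }
     2)  copy up to DESIRED_ALIGN - ALIGN bytes to align the destination;
     3)  main loop copying SIZE_NEEDED bytes per iteration;
     4)  epilogue: copy the remaining count & (epilogue_size_needed - 1)
         bytes.  */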
22001
22002 bool
22003 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22004 rtx expected_align_exp, rtx expected_size_exp)
22005 {
22006 rtx destreg;
22007 rtx srcreg;
22008 rtx label = NULL;
22009 rtx tmp;
22010 rtx jump_around_label = NULL;
22011 HOST_WIDE_INT align = 1;
22012 unsigned HOST_WIDE_INT count = 0;
22013 HOST_WIDE_INT expected_size = -1;
22014 int size_needed = 0, epilogue_size_needed;
22015 int desired_align = 0, align_bytes = 0;
22016 enum stringop_alg alg;
22017 int dynamic_check;
22018 bool need_zero_guard = false;
22019
22020 if (CONST_INT_P (align_exp))
22021 align = INTVAL (align_exp);
22022 /* i386 can do misaligned access at a reasonably increased cost. */
22023 if (CONST_INT_P (expected_align_exp)
22024 && INTVAL (expected_align_exp) > align)
22025 align = INTVAL (expected_align_exp);
22026 /* ALIGN is the minimum of destination and source alignment, but we care here
22027 just about destination alignment. */
22028 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22029 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22030
22031 if (CONST_INT_P (count_exp))
22032 count = expected_size = INTVAL (count_exp);
22033 if (CONST_INT_P (expected_size_exp) && count == 0)
22034 expected_size = INTVAL (expected_size_exp);
22035
22036 /* Make sure we don't need to care about overflow later on. */
22037 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22038 return false;
22039
22040 /* Step 0: Decide on preferred algorithm, desired alignment and
22041 size of chunks to be copied by main loop. */
22042
22043 alg = decide_alg (count, expected_size, false, &dynamic_check);
22044 desired_align = decide_alignment (align, alg, expected_size);
22045
22046 if (!TARGET_ALIGN_STRINGOPS)
22047 align = desired_align;
22048
22049 if (alg == libcall)
22050 return false;
22051 gcc_assert (alg != no_stringop);
22052 if (!count)
22053 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22054 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22055 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22056 switch (alg)
22057 {
22058 case libcall:
22059 case no_stringop:
22060 gcc_unreachable ();
22061 case loop:
22062 need_zero_guard = true;
22063 size_needed = GET_MODE_SIZE (Pmode);
22064 break;
22065 case unrolled_loop:
22066 need_zero_guard = true;
22067 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
22068 break;
22069 case rep_prefix_8_byte:
22070 size_needed = 8;
22071 break;
22072 case rep_prefix_4_byte:
22073 size_needed = 4;
22074 break;
22075 case rep_prefix_1_byte:
22076 size_needed = 1;
22077 break;
22078 case loop_1_byte:
22079 need_zero_guard = true;
22080 size_needed = 1;
22081 break;
22082 }
22083
22084 epilogue_size_needed = size_needed;
22085
22086 /* Step 1: Prologue guard. */
22087
22088 /* Alignment code needs count to be in register. */
22089 if (CONST_INT_P (count_exp) && desired_align > align)
22090 {
22091 if (INTVAL (count_exp) > desired_align
22092 && INTVAL (count_exp) > size_needed)
22093 {
22094 align_bytes
22095 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22096 if (align_bytes <= 0)
22097 align_bytes = 0;
22098 else
22099 align_bytes = desired_align - align_bytes;
22100 }
22101 if (align_bytes == 0)
22102 count_exp = force_reg (counter_mode (count_exp), count_exp);
22103 }
22104 gcc_assert (desired_align >= 1 && align >= 1);
22105
22106 /* Ensure that alignment prologue won't copy past end of block. */
22107 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22108 {
22109 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22110 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22111 Make sure it is a power of 2. */
22112 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22113
22114 if (count)
22115 {
22116 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22117 {
22118 /* If main algorithm works on QImode, no epilogue is needed.
22119 For small sizes just don't align anything. */
22120 if (size_needed == 1)
22121 desired_align = align;
22122 else
22123 goto epilogue;
22124 }
22125 }
22126 else
22127 {
22128 label = gen_label_rtx ();
22129 emit_cmp_and_jump_insns (count_exp,
22130 GEN_INT (epilogue_size_needed),
22131 LTU, 0, counter_mode (count_exp), 1, label);
22132 if (expected_size == -1 || expected_size < epilogue_size_needed)
22133 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22134 else
22135 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22136 }
22137 }
22138
22139 /* Emit code to decide on runtime whether library call or inline should be
22140 used. */
22141 if (dynamic_check != -1)
22142 {
22143 if (CONST_INT_P (count_exp))
22144 {
22145 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22146 {
22147 emit_block_move_via_libcall (dst, src, count_exp, false);
22148 count_exp = const0_rtx;
22149 goto epilogue;
22150 }
22151 }
22152 else
22153 {
22154 rtx hot_label = gen_label_rtx ();
22155 jump_around_label = gen_label_rtx ();
22156 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22157 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22158 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22159 emit_block_move_via_libcall (dst, src, count_exp, false);
22160 emit_jump (jump_around_label);
22161 emit_label (hot_label);
22162 }
22163 }
22164
22165 /* Step 2: Alignment prologue. */
22166
22167 if (desired_align > align)
22168 {
22169 if (align_bytes == 0)
22170 {
22171 /* Except for the first move in the epilogue, we no longer know the
22172 constant offset in aliasing info. It doesn't seem worth
22173 the pain to maintain it for the first move, so throw away
22174 the info early. */
22175 src = change_address (src, BLKmode, srcreg);
22176 dst = change_address (dst, BLKmode, destreg);
22177 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22178 desired_align);
22179 }
22180 else
22181 {
22182 /* If we know how many bytes need to be stored before dst is
22183 sufficiently aligned, maintain aliasing info accurately. */
22184 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22185 desired_align, align_bytes);
22186 count_exp = plus_constant (count_exp, -align_bytes);
22187 count -= align_bytes;
22188 }
22189 if (need_zero_guard
22190 && (count < (unsigned HOST_WIDE_INT) size_needed
22191 || (align_bytes == 0
22192 && count < ((unsigned HOST_WIDE_INT) size_needed
22193 + desired_align - align))))
22194 {
22195 /* It is possible that we copied enough so the main loop will not
22196 execute. */
22197 gcc_assert (size_needed > 1);
22198 if (label == NULL_RTX)
22199 label = gen_label_rtx ();
22200 emit_cmp_and_jump_insns (count_exp,
22201 GEN_INT (size_needed),
22202 LTU, 0, counter_mode (count_exp), 1, label);
22203 if (expected_size == -1
22204 || expected_size < (desired_align - align) / 2 + size_needed)
22205 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22206 else
22207 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22208 }
22209 }
22210 if (label && size_needed == 1)
22211 {
22212 emit_label (label);
22213 LABEL_NUSES (label) = 1;
22214 label = NULL;
22215 epilogue_size_needed = 1;
22216 }
22217 else if (label == NULL_RTX)
22218 epilogue_size_needed = size_needed;
22219
22220 /* Step 3: Main loop. */
22221
22222 switch (alg)
22223 {
22224 case libcall:
22225 case no_stringop:
22226 gcc_unreachable ();
22227 case loop_1_byte:
22228 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22229 count_exp, QImode, 1, expected_size);
22230 break;
22231 case loop:
22232 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22233 count_exp, Pmode, 1, expected_size);
22234 break;
22235 case unrolled_loop:
22236 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22237 registers for 4 temporaries anyway. */
22238 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22239 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22240 expected_size);
22241 break;
22242 case rep_prefix_8_byte:
22243 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22244 DImode);
22245 break;
22246 case rep_prefix_4_byte:
22247 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22248 SImode);
22249 break;
22250 case rep_prefix_1_byte:
22251 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22252 QImode);
22253 break;
22254 }
22255 /* Adjust properly the offset of src and dest memory for aliasing. */
22256 if (CONST_INT_P (count_exp))
22257 {
22258 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22259 (count / size_needed) * size_needed);
22260 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22261 (count / size_needed) * size_needed);
22262 }
22263 else
22264 {
22265 src = change_address (src, BLKmode, srcreg);
22266 dst = change_address (dst, BLKmode, destreg);
22267 }
22268
22269 /* Step 4: Epilogue to copy the remaining bytes. */
22270 epilogue:
22271 if (label)
22272 {
22273 /* When the main loop is done, COUNT_EXP might hold original count,
22274 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22275 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22276 bytes. Compensate if needed. */
22277
22278 if (size_needed < epilogue_size_needed)
22279 {
22280 tmp =
22281 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22282 GEN_INT (size_needed - 1), count_exp, 1,
22283 OPTAB_DIRECT);
22284 if (tmp != count_exp)
22285 emit_move_insn (count_exp, tmp);
22286 }
22287 emit_label (label);
22288 LABEL_NUSES (label) = 1;
22289 }
22290
22291 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22292 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22293 epilogue_size_needed);
22294 if (jump_around_label)
22295 emit_label (jump_around_label);
22296 return true;
22297 }
22298
22299 /* Helper function for memset. For a QImode value 0xXY produce
22300 0xXYXYXYXY of the width specified by MODE. This is essentially
22301 a * 0x01010101, but we can do slightly better than
22302 synth_mult by unwinding the sequence by hand on CPUs with
22303 slow multiply. */
22304 static rtx
22305 promote_duplicated_reg (enum machine_mode mode, rtx val)
22306 {
22307 enum machine_mode valmode = GET_MODE (val);
22308 rtx tmp;
22309 int nops = mode == DImode ? 3 : 2;
22310
22311 gcc_assert (mode == SImode || mode == DImode);
22312 if (val == const0_rtx)
22313 return copy_to_mode_reg (mode, const0_rtx);
22314 if (CONST_INT_P (val))
22315 {
22316 HOST_WIDE_INT v = INTVAL (val) & 255;
22317
22318 v |= v << 8;
22319 v |= v << 16;
22320 if (mode == DImode)
22321 v |= (v << 16) << 16;
22322 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22323 }
22324
22325 if (valmode == VOIDmode)
22326 valmode = QImode;
22327 if (valmode != QImode)
22328 val = gen_lowpart (QImode, val);
22329 if (mode == QImode)
22330 return val;
22331 if (!TARGET_PARTIAL_REG_STALL)
22332 nops--;
22333 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22334 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22335 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22336 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22337 {
22338 rtx reg = convert_modes (mode, QImode, val, true);
22339 tmp = promote_duplicated_reg (mode, const1_rtx);
22340 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22341 OPTAB_DIRECT);
22342 }
22343 else
22344 {
22345 rtx reg = convert_modes (mode, QImode, val, true);
22346
22347 if (!TARGET_PARTIAL_REG_STALL)
22348 if (mode == SImode)
22349 emit_insn (gen_movsi_insv_1 (reg, reg));
22350 else
22351 emit_insn (gen_movdi_insv_1 (reg, reg));
22352 else
22353 {
22354 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22355 NULL, 1, OPTAB_DIRECT);
22356 reg =
22357 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22358 }
22359 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22360 NULL, 1, OPTAB_DIRECT);
22361 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22362 if (mode == SImode)
22363 return reg;
22364 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22365 NULL, 1, OPTAB_DIRECT);
22366 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22367 return reg;
22368 }
22369 }
22370
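/* For example, promoting the constant 0x41 to SImode computes
   v = 0x41; v |= v << 8; v |= v << 16; giving 0x41414141, while a
   non-constant QImode value is replicated at run time by the multiply
   or shift/IOR sequence above, with the same result.  */
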
22371 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22372 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22373 alignment from ALIGN to DESIRED_ALIGN. */
22374 static rtx
22375 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22376 {
22377 rtx promoted_val;
22378
22379 if (TARGET_64BIT
22380 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22381 promoted_val = promote_duplicated_reg (DImode, val);
22382 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22383 promoted_val = promote_duplicated_reg (SImode, val);
22384 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22385 promoted_val = promote_duplicated_reg (HImode, val);
22386 else
22387 promoted_val = val;
22388
22389 return promoted_val;
22390 }
22391
22392 /* Expand string clear operation (bzero). Use i386 string operations when
22393 profitable. See expand_movmem comment for explanation of individual
22394 steps performed. */
22395 bool
22396 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22397 rtx expected_align_exp, rtx expected_size_exp)
22398 {
22399 rtx destreg;
22400 rtx label = NULL;
22401 rtx tmp;
22402 rtx jump_around_label = NULL;
22403 HOST_WIDE_INT align = 1;
22404 unsigned HOST_WIDE_INT count = 0;
22405 HOST_WIDE_INT expected_size = -1;
22406 int size_needed = 0, epilogue_size_needed;
22407 int desired_align = 0, align_bytes = 0;
22408 enum stringop_alg alg;
22409 rtx promoted_val = NULL;
22410 bool force_loopy_epilogue = false;
22411 int dynamic_check;
22412 bool need_zero_guard = false;
22413
22414 if (CONST_INT_P (align_exp))
22415 align = INTVAL (align_exp);
22416 /* i386 can do misaligned access at a reasonably increased cost. */
22417 if (CONST_INT_P (expected_align_exp)
22418 && INTVAL (expected_align_exp) > align)
22419 align = INTVAL (expected_align_exp);
22420 if (CONST_INT_P (count_exp))
22421 count = expected_size = INTVAL (count_exp);
22422 if (CONST_INT_P (expected_size_exp) && count == 0)
22423 expected_size = INTVAL (expected_size_exp);
22424
22425 /* Make sure we don't need to care about overflow later on. */
22426 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22427 return false;
22428
22429 /* Step 0: Decide on preferred algorithm, desired alignment and
22430 size of chunks to be copied by main loop. */
22431
22432 alg = decide_alg (count, expected_size, true, &dynamic_check);
22433 desired_align = decide_alignment (align, alg, expected_size);
22434
22435 if (!TARGET_ALIGN_STRINGOPS)
22436 align = desired_align;
22437
22438 if (alg == libcall)
22439 return false;
22440 gcc_assert (alg != no_stringop);
22441 if (!count)
22442 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22443 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22444 switch (alg)
22445 {
22446 case libcall:
22447 case no_stringop:
22448 gcc_unreachable ();
22449 case loop:
22450 need_zero_guard = true;
22451 size_needed = GET_MODE_SIZE (Pmode);
22452 break;
22453 case unrolled_loop:
22454 need_zero_guard = true;
22455 size_needed = GET_MODE_SIZE (Pmode) * 4;
22456 break;
22457 case rep_prefix_8_byte:
22458 size_needed = 8;
22459 break;
22460 case rep_prefix_4_byte:
22461 size_needed = 4;
22462 break;
22463 case rep_prefix_1_byte:
22464 size_needed = 1;
22465 break;
22466 case loop_1_byte:
22467 need_zero_guard = true;
22468 size_needed = 1;
22469 break;
22470 }
22471 epilogue_size_needed = size_needed;
22472
22473 /* Step 1: Prologue guard. */
22474
22475 /* Alignment code needs count to be in register. */
22476 if (CONST_INT_P (count_exp) && desired_align > align)
22477 {
22478 if (INTVAL (count_exp) > desired_align
22479 && INTVAL (count_exp) > size_needed)
22480 {
22481 align_bytes
22482 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22483 if (align_bytes <= 0)
22484 align_bytes = 0;
22485 else
22486 align_bytes = desired_align - align_bytes;
22487 }
22488 if (align_bytes == 0)
22489 {
22490 enum machine_mode mode = SImode;
22491 if (TARGET_64BIT && (count & ~0xffffffff))
22492 mode = DImode;
22493 count_exp = force_reg (mode, count_exp);
22494 }
22495 }
22496 /* Do the cheap promotion to allow better CSE across the
22497 main loop and epilogue (i.e. one load of the big constant in
22498 front of all the code). */
22499 if (CONST_INT_P (val_exp))
22500 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22501 desired_align, align);
22502 /* Ensure that alignment prologue won't copy past end of block. */
22503 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22504 {
22505 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22506 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22507 Make sure it is a power of 2. */
22508 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22509
22510 /* To improve performance of small blocks, we jump around the VAL
22511 promoting code. This means that if the promoted VAL is not constant,
22512 we might not use it in the epilogue and have to use the byte
22513 loop variant. */
22514 if (epilogue_size_needed > 2 && !promoted_val)
22515 force_loopy_epilogue = true;
22516 if (count)
22517 {
22518 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22519 {
22520 /* If main algorithm works on QImode, no epilogue is needed.
22521 For small sizes just don't align anything. */
22522 if (size_needed == 1)
22523 desired_align = align;
22524 else
22525 goto epilogue;
22526 }
22527 }
22528 else
22529 {
22530 label = gen_label_rtx ();
22531 emit_cmp_and_jump_insns (count_exp,
22532 GEN_INT (epilogue_size_needed),
22533 LTU, 0, counter_mode (count_exp), 1, label);
22534 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22535 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22536 else
22537 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22538 }
22539 }
22540 if (dynamic_check != -1)
22541 {
22542 rtx hot_label = gen_label_rtx ();
22543 jump_around_label = gen_label_rtx ();
22544 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22545 LEU, 0, counter_mode (count_exp), 1, hot_label);
22546 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22547 set_storage_via_libcall (dst, count_exp, val_exp, false);
22548 emit_jump (jump_around_label);
22549 emit_label (hot_label);
22550 }
22551
22552 /* Step 2: Alignment prologue. */
22553
22554 /* Do the expensive promotion once we branched off the small blocks. */
22555 if (!promoted_val)
22556 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22557 desired_align, align);
22558 gcc_assert (desired_align >= 1 && align >= 1);
22559
22560 if (desired_align > align)
22561 {
22562 if (align_bytes == 0)
22563 {
22564 /* Except for the first move in the epilogue, we no longer know the
22565 constant offset in aliasing info. It doesn't seem worth
22566 the pain to maintain it for the first move, so throw away
22567 the info early. */
22568 dst = change_address (dst, BLKmode, destreg);
22569 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22570 desired_align);
22571 }
22572 else
22573 {
22574 /* If we know how many bytes need to be stored before dst is
22575 sufficiently aligned, maintain aliasing info accurately. */
22576 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22577 desired_align, align_bytes);
22578 count_exp = plus_constant (count_exp, -align_bytes);
22579 count -= align_bytes;
22580 }
22581 if (need_zero_guard
22582 && (count < (unsigned HOST_WIDE_INT) size_needed
22583 || (align_bytes == 0
22584 && count < ((unsigned HOST_WIDE_INT) size_needed
22585 + desired_align - align))))
22586 {
22587 /* It is possible that we copied enough so the main loop will not
22588 execute. */
22589 gcc_assert (size_needed > 1);
22590 if (label == NULL_RTX)
22591 label = gen_label_rtx ();
22592 emit_cmp_and_jump_insns (count_exp,
22593 GEN_INT (size_needed),
22594 LTU, 0, counter_mode (count_exp), 1, label);
22595 if (expected_size == -1
22596 || expected_size < (desired_align - align) / 2 + size_needed)
22597 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22598 else
22599 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22600 }
22601 }
22602 if (label && size_needed == 1)
22603 {
22604 emit_label (label);
22605 LABEL_NUSES (label) = 1;
22606 label = NULL;
22607 promoted_val = val_exp;
22608 epilogue_size_needed = 1;
22609 }
22610 else if (label == NULL_RTX)
22611 epilogue_size_needed = size_needed;
22612
22613 /* Step 3: Main loop. */
22614
22615 switch (alg)
22616 {
22617 case libcall:
22618 case no_stringop:
22619 gcc_unreachable ();
22620 case loop_1_byte:
22621 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22622 count_exp, QImode, 1, expected_size);
22623 break;
22624 case loop:
22625 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22626 count_exp, Pmode, 1, expected_size);
22627 break;
22628 case unrolled_loop:
22629 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22630 count_exp, Pmode, 4, expected_size);
22631 break;
22632 case rep_prefix_8_byte:
22633 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22634 DImode, val_exp);
22635 break;
22636 case rep_prefix_4_byte:
22637 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22638 SImode, val_exp);
22639 break;
22640 case rep_prefix_1_byte:
22641 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22642 QImode, val_exp);
22643 break;
22644 }
22645 /* Adjust properly the offset of src and dest memory for aliasing. */
22646 if (CONST_INT_P (count_exp))
22647 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22648 (count / size_needed) * size_needed);
22649 else
22650 dst = change_address (dst, BLKmode, destreg);
22651
22652 /* Step 4: Epilogue to copy the remaining bytes. */
22653
22654 if (label)
22655 {
22656 /* When the main loop is done, COUNT_EXP might hold original count,
22657 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22658 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22659 bytes. Compensate if needed. */
22660
22661 if (size_needed < epilogue_size_needed)
22662 {
22663 tmp =
22664 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22665 GEN_INT (size_needed - 1), count_exp, 1,
22666 OPTAB_DIRECT);
22667 if (tmp != count_exp)
22668 emit_move_insn (count_exp, tmp);
22669 }
22670 emit_label (label);
22671 LABEL_NUSES (label) = 1;
22672 }
22673 epilogue:
22674 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22675 {
22676 if (force_loopy_epilogue)
22677 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22678 epilogue_size_needed);
22679 else
22680 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22681 epilogue_size_needed);
22682 }
22683 if (jump_around_label)
22684 emit_label (jump_around_label);
22685 return true;
22686 }
22687
22688 /* Expand the appropriate insns for doing strlen if not just doing
22689 repnz; scasb
22690
22691 out = result, initialized with the start address
22692 align_rtx = alignment of the address.
22693 scratch = scratch register, initialized with the start address when
22694 not aligned, otherwise undefined
22695
22696 This is just the body. It needs the initializations mentioned above and
22697 some address computing at the end. These things are done in i386.md. */
22698
22699 static void
22700 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22701 {
22702 int align;
22703 rtx tmp;
22704 rtx align_2_label = NULL_RTX;
22705 rtx align_3_label = NULL_RTX;
22706 rtx align_4_label = gen_label_rtx ();
22707 rtx end_0_label = gen_label_rtx ();
22708 rtx mem;
22709 rtx tmpreg = gen_reg_rtx (SImode);
22710 rtx scratch = gen_reg_rtx (SImode);
22711 rtx cmp;
22712
22713 align = 0;
22714 if (CONST_INT_P (align_rtx))
22715 align = INTVAL (align_rtx);
22716
22717 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22718
22719 /* Is there a known alignment and is it less than 4? */
22720 if (align < 4)
22721 {
22722 rtx scratch1 = gen_reg_rtx (Pmode);
22723 emit_move_insn (scratch1, out);
22724 /* Is there a known alignment and is it not 2? */
22725 if (align != 2)
22726 {
22727 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22728 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22729
22730 /* Leave just the 3 lower bits. */
22731 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22732 NULL_RTX, 0, OPTAB_WIDEN);
22733
22734 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22735 Pmode, 1, align_4_label);
22736 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22737 Pmode, 1, align_2_label);
22738 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22739 Pmode, 1, align_3_label);
22740 }
22741 else
22742 {
22743 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22744 check whether it is aligned to a 4-byte boundary. */
22745
22746 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22747 NULL_RTX, 0, OPTAB_WIDEN);
22748
22749 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22750 Pmode, 1, align_4_label);
22751 }
22752
22753 mem = change_address (src, QImode, out);
22754
22755 /* Now compare the bytes. */
22756
22757 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22758 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22759 QImode, 1, end_0_label);
22760
22761 /* Increment the address. */
22762 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22763
22764 /* Not needed with an alignment of 2 */
22765 if (align != 2)
22766 {
22767 emit_label (align_2_label);
22768
22769 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22770 end_0_label);
22771
22772 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22773
22774 emit_label (align_3_label);
22775 }
22776
22777 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22778 end_0_label);
22779
22780 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22781 }
22782
22783 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22784 align this loop; that only makes the program larger and does not
22785 help speed. */
22786 emit_label (align_4_label);
22787
22788 mem = change_address (src, SImode, out);
22789 emit_move_insn (scratch, mem);
22790 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22791
22792 /* This formula yields a nonzero result iff one of the bytes is zero.
22793 This saves three branches inside the loop and many cycles. */
22794
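/* Worked example of the test (SCRATCH - 0x01010101) & ~SCRATCH & 0x80808080,
   here for SCRATCH = 0x12003456, which has a zero in its second-highest
   byte:

     scratch + 0xfefefeff = 0x10ff3355
     & ~scratch           = 0x10ff3355 & 0xedffcba9 = 0x00ff0301
     & 0x80808080         = 0x00800000   nonzero, so a zero byte exists

   whereas for SCRATCH = 0x12343456 the final AND yields 0 and the loop
   continues.  */
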
22795 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22796 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22797 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22798 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22799 gen_int_mode (0x80808080, SImode)));
22800 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22801 align_4_label);
22802
22803 if (TARGET_CMOVE)
22804 {
22805 rtx reg = gen_reg_rtx (SImode);
22806 rtx reg2 = gen_reg_rtx (Pmode);
22807 emit_move_insn (reg, tmpreg);
22808 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22809
22810 /* If zero is not in the first two bytes, move two bytes forward. */
22811 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22812 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22813 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22814 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22815 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22816 reg,
22817 tmpreg)));
22818 /* Emit lea manually to avoid clobbering of flags. */
22819 emit_insn (gen_rtx_SET (SImode, reg2,
22820 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22821
22822 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22823 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22824 emit_insn (gen_rtx_SET (VOIDmode, out,
22825 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22826 reg2,
22827 out)));
22828 }
22829 else
22830 {
22831 rtx end_2_label = gen_label_rtx ();
22832 /* Is zero in the first two bytes? */
22833
22834 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22835 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22836 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22837 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22838 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22839 pc_rtx);
22840 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22841 JUMP_LABEL (tmp) = end_2_label;
22842
22843 /* Not in the first two. Move two bytes forward. */
22844 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22845 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22846
22847 emit_label (end_2_label);
22848
22849 }
22850
22851 /* Avoid branch in fixing the byte. */
22852 tmpreg = gen_lowpart (QImode, tmpreg);
22853 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22854 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22855 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22856 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22857
22858 emit_label (end_0_label);
22859 }
22860
22861 /* Expand strlen. */
22862
22863 bool
22864 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22865 {
22866 rtx addr, scratch1, scratch2, scratch3, scratch4;
22867
22868 /* The generic case of the strlen expander is long. Avoid expanding
22869 it unless TARGET_INLINE_ALL_STRINGOPS. */
22870
22871 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22872 && !TARGET_INLINE_ALL_STRINGOPS
22873 && !optimize_insn_for_size_p ()
22874 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22875 return false;
22876
22877 addr = force_reg (Pmode, XEXP (src, 0));
22878 scratch1 = gen_reg_rtx (Pmode);
22879
22880 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22881 && !optimize_insn_for_size_p ())
22882 {
22883 /* Well it seems that some optimizer does not combine a call like
22884 foo(strlen(bar), strlen(bar));
22885 when the move and the subtraction are done here. It does calculate
22886 the length just once when these instructions are done inside of
22887 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22888 often used and I use one fewer register for the lifetime of
22889 output_strlen_unroll() this is better. */
22890
22891 emit_move_insn (out, addr);
22892
22893 ix86_expand_strlensi_unroll_1 (out, src, align);
22894
22895 /* strlensi_unroll_1 returns the address of the zero at the end of
22896 the string, like memchr(), so compute the length by subtracting
22897 the start address. */
22898 emit_insn (ix86_gen_sub3 (out, out, addr));
22899 }
22900 else
22901 {
22902 rtx unspec;
22903
22904 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22905 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22906 return false;
22907
22908 scratch2 = gen_reg_rtx (Pmode);
22909 scratch3 = gen_reg_rtx (Pmode);
22910 scratch4 = force_reg (Pmode, constm1_rtx);
22911
22912 emit_move_insn (scratch3, addr);
22913 eoschar = force_reg (QImode, eoschar);
22914
22915 src = replace_equiv_address_nv (src, scratch3);
22916
22917 /* If .md starts supporting :P, this can be done in .md. */
22918 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22919 scratch4), UNSPEC_SCAS);
22920 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22921 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22922 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22923 }
22924 return true;
22925 }
22926
22927 /* For a given symbol (function), construct code to compute the address of
22928 its PLT entry in the large x86-64 PIC model. */
22929 rtx
22930 construct_plt_address (rtx symbol)
22931 {
22932 rtx tmp = gen_reg_rtx (Pmode);
22933 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22934
22935 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22936 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22937
22938 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22939 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22940 return tmp;
22941 }
22942
22943 rtx
22944 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22945 rtx callarg2,
22946 rtx pop, bool sibcall)
22947 {
22948 /* We need to represent that SI and DI registers are clobbered
22949 by SYSV calls. */
22950 static int clobbered_registers[] = {
22951 XMM6_REG, XMM7_REG, XMM8_REG,
22952 XMM9_REG, XMM10_REG, XMM11_REG,
22953 XMM12_REG, XMM13_REG, XMM14_REG,
22954 XMM15_REG, SI_REG, DI_REG
22955 };
22956 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22957 rtx use = NULL, call;
22958 unsigned int vec_len;
22959
22960 if (pop == const0_rtx)
22961 pop = NULL;
22962 gcc_assert (!TARGET_64BIT || !pop);
22963
22964 if (TARGET_MACHO && !TARGET_64BIT)
22965 {
22966 #if TARGET_MACHO
22967 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22968 fnaddr = machopic_indirect_call_target (fnaddr);
22969 #endif
22970 }
22971 else
22972 {
22973 /* Static functions and indirect calls don't need the pic register. */
22974 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22975 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22976 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22977 use_reg (&use, pic_offset_table_rtx);
22978 }
22979
22980 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22981 {
22982 rtx al = gen_rtx_REG (QImode, AX_REG);
22983 emit_move_insn (al, callarg2);
22984 use_reg (&use, al);
22985 }
22986
22987 if (ix86_cmodel == CM_LARGE_PIC
22988 && MEM_P (fnaddr)
22989 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22990 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22991 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22992 else if (sibcall
22993 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
22994 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
22995 {
22996 fnaddr = XEXP (fnaddr, 0);
22997 if (GET_MODE (fnaddr) != word_mode)
22998 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
22999 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23000 }
23001
23002 vec_len = 0;
23003 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23004 if (retval)
23005 call = gen_rtx_SET (VOIDmode, retval, call);
23006 vec[vec_len++] = call;
23007
23008 if (pop)
23009 {
23010 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23011 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23012 vec[vec_len++] = pop;
23013 }
23014
23015 if (TARGET_64BIT_MS_ABI
23016 && (!callarg2 || INTVAL (callarg2) != -2))
23017 {
23018 unsigned i;
23019
23020 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23021 UNSPEC_MS_TO_SYSV_CALL);
23022
23023 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23024 vec[vec_len++]
23025 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23026 ? TImode : DImode,
23027 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23028 ? TImode : DImode,
23029 clobbered_registers[i]));
23030 }
23031
23032 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23033 if (TARGET_VZEROUPPER)
23034 {
23035 int avx256;
23036 if (cfun->machine->callee_pass_avx256_p)
23037 {
23038 if (cfun->machine->callee_return_avx256_p)
23039 avx256 = callee_return_pass_avx256;
23040 else
23041 avx256 = callee_pass_avx256;
23042 }
23043 else if (cfun->machine->callee_return_avx256_p)
23044 avx256 = callee_return_avx256;
23045 else
23046 avx256 = call_no_avx256;
23047
23048 if (reload_completed)
23049 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23050 else
23051 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23052 gen_rtvec (1, GEN_INT (avx256)),
23053 UNSPEC_CALL_NEEDS_VZEROUPPER);
23054 }
23055
23056 if (vec_len > 1)
23057 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23058 call = emit_call_insn (call);
23059 if (use)
23060 CALL_INSN_FUNCTION_USAGE (call) = use;
23061
23062 return call;
23063 }
23064
23065 void
23066 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23067 {
23068 rtx pat = PATTERN (insn);
23069 rtvec vec = XVEC (pat, 0);
23070 int len = GET_NUM_ELEM (vec) - 1;
23071
23072 /* Strip off the last entry of the parallel. */
23073 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23074 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23075 if (len == 1)
23076 pat = RTVEC_ELT (vec, 0);
23077 else
23078 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23079
23080 emit_insn (gen_avx_vzeroupper (vzeroupper));
23081 emit_call_insn (pat);
23082 }
23083
23084 /* Output the assembly for a call instruction. */
23085
23086 const char *
23087 ix86_output_call_insn (rtx insn, rtx call_op)
23088 {
23089 bool direct_p = constant_call_address_operand (call_op, Pmode);
23090 bool seh_nop_p = false;
23091 const char *xasm;
23092
23093 if (SIBLING_CALL_P (insn))
23094 {
23095 if (direct_p)
23096 xasm = "jmp\t%P0";
23097 /* SEH epilogue detection requires the indirect branch case
23098 to include REX.W. */
23099 else if (TARGET_SEH)
23100 xasm = "rex.W jmp %A0";
23101 else
23102 xasm = "jmp\t%A0";
23103
23104 output_asm_insn (xasm, &call_op);
23105 return "";
23106 }
23107
23108 /* SEH unwinding can require an extra nop to be emitted in several
23109 circumstances. Determine if we have one of those. */
23110 if (TARGET_SEH)
23111 {
23112 rtx i;
23113
23114 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23115 {
23116 /* If we get to another real insn, we don't need the nop. */
23117 if (INSN_P (i))
23118 break;
23119
23120 /* If we get to the epilogue note, prevent a catch region from
23121 being adjacent to the standard epilogue sequence. If non-call
23122 exceptions are enabled, we'll have done this during epilogue emission. */
23123 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23124 && !flag_non_call_exceptions
23125 && !can_throw_internal (insn))
23126 {
23127 seh_nop_p = true;
23128 break;
23129 }
23130 }
23131
23132 /* If we didn't find a real insn following the call, prevent the
23133 unwinder from looking into the next function. */
23134 if (i == NULL)
23135 seh_nop_p = true;
23136 }
23137
23138 if (direct_p)
23139 xasm = "call\t%P0";
23140 else
23141 xasm = "call\t%A0";
23142
23143 output_asm_insn (xasm, &call_op);
23144
23145 if (seh_nop_p)
23146 return "nop";
23147
23148 return "";
23149 }
23150 \f
23151 /* Clear stack slot assignments remembered from previous functions.
23152 This is called from INIT_EXPANDERS once before RTL is emitted for each
23153 function. */
23154
23155 static struct machine_function *
23156 ix86_init_machine_status (void)
23157 {
23158 struct machine_function *f;
23159
23160 f = ggc_alloc_cleared_machine_function ();
23161 f->use_fast_prologue_epilogue_nregs = -1;
23162 f->tls_descriptor_call_expanded_p = 0;
23163 f->call_abi = ix86_abi;
23164
23165 return f;
23166 }
23167
23168 /* Return a MEM corresponding to a stack slot with mode MODE.
23169 Allocate a new slot if necessary.
23170
23171 The RTL for a function can have several slots available: N is
23172 which slot to use. */
23173
23174 rtx
23175 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23176 {
23177 struct stack_local_entry *s;
23178
23179 gcc_assert (n < MAX_386_STACK_LOCALS);
23180
23181 /* Virtual slot is valid only before vregs are instantiated. */
23182 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23183
23184 for (s = ix86_stack_locals; s; s = s->next)
23185 if (s->mode == mode && s->n == n)
23186 return validize_mem (copy_rtx (s->rtl));
23187
23188 s = ggc_alloc_stack_local_entry ();
23189 s->n = n;
23190 s->mode = mode;
23191 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23192
23193 s->next = ix86_stack_locals;
23194 ix86_stack_locals = s;
23195 return validize_mem (s->rtl);
23196 }
23197 \f
23198 /* Calculate the length of the memory address in the instruction encoding.
23199 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23200 or other prefixes. */
23201
23202 int
23203 memory_address_length (rtx addr)
23204 {
23205 struct ix86_address parts;
23206 rtx base, index, disp;
23207 int len;
23208 int ok;
23209
23210 if (GET_CODE (addr) == PRE_DEC
23211 || GET_CODE (addr) == POST_INC
23212 || GET_CODE (addr) == PRE_MODIFY
23213 || GET_CODE (addr) == POST_MODIFY)
23214 return 0;
23215
23216 ok = ix86_decompose_address (addr, &parts);
23217 gcc_assert (ok);
23218
23219 if (parts.base && GET_CODE (parts.base) == SUBREG)
23220 parts.base = SUBREG_REG (parts.base);
23221 if (parts.index && GET_CODE (parts.index) == SUBREG)
23222 parts.index = SUBREG_REG (parts.index);
23223
23224 base = parts.base;
23225 index = parts.index;
23226 disp = parts.disp;
23227
23228 /* Add length of addr32 prefix. */
23229 len = (GET_CODE (addr) == ZERO_EXTEND
23230 || GET_CODE (addr) == AND);
23231
23232 /* Rule of thumb:
23233 - esp as the base always wants an index,
23234 - ebp as the base always wants a displacement,
23235 - r12 as the base always wants an index,
23236 - r13 as the base always wants a displacement. */
23237
23238 /* Register Indirect. */
23239 if (base && !index && !disp)
23240 {
23241 /* esp (for its index) and ebp (for its displacement) need
23242 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23243 code. */
23244 if (REG_P (addr)
23245 && (addr == arg_pointer_rtx
23246 || addr == frame_pointer_rtx
23247 || REGNO (addr) == SP_REG
23248 || REGNO (addr) == BP_REG
23249 || REGNO (addr) == R12_REG
23250 || REGNO (addr) == R13_REG))
23251 len = 1;
23252 }
23253
23254 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23255 is not disp32, but disp32(%rip), so for disp32
23256 SIB byte is needed, unless print_operand_address
23257 optimizes it into disp32(%rip) or (%rip) is implied
23258 by UNSPEC. */
23259 else if (disp && !base && !index)
23260 {
23261 len = 4;
23262 if (TARGET_64BIT)
23263 {
23264 rtx symbol = disp;
23265
23266 if (GET_CODE (disp) == CONST)
23267 symbol = XEXP (disp, 0);
23268 if (GET_CODE (symbol) == PLUS
23269 && CONST_INT_P (XEXP (symbol, 1)))
23270 symbol = XEXP (symbol, 0);
23271
23272 if (GET_CODE (symbol) != LABEL_REF
23273 && (GET_CODE (symbol) != SYMBOL_REF
23274 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23275 && (GET_CODE (symbol) != UNSPEC
23276 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23277 && XINT (symbol, 1) != UNSPEC_PCREL
23278 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23279 len += 1;
23280 }
23281 }
23282
23283 else
23284 {
23285 /* Find the length of the displacement constant. */
23286 if (disp)
23287 {
23288 if (base && satisfies_constraint_K (disp))
23289 len = 1;
23290 else
23291 len = 4;
23292 }
23293 /* ebp always wants a displacement. Similarly r13. */
23294 else if (base && REG_P (base)
23295 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23296 len = 1;
23297
23298 /* An index requires the two-byte modrm form.... */
23299 if (index
23300 /* ...like esp (or r12), which always wants an index. */
23301 || base == arg_pointer_rtx
23302 || base == frame_pointer_rtx
23303 || (base && REG_P (base)
23304 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23305 len += 1;
23306 }
23307
23308 switch (parts.seg)
23309 {
23310 case SEG_FS:
23311 case SEG_GS:
23312 len += 1;
23313 break;
23314 default:
23315 break;
23316 }
23317
23318 return len;
23319 }
23320
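/* A few examples of the returned value (bytes beyond the opcode and the
   mandatory modrm byte), assuming 32-bit addressing and no addr32 prefix:

     (%eax)          0
     (%esp)          1   SIB byte
     8(%ebp)         1   disp8
     8(%eax,%ebx,4)  2   SIB byte + disp8
     1000(%eax)      4   disp32
     symbol          4   disp32; plus 1 in 64-bit mode when a SIB byte is
                         needed because the operand cannot use %rip.  */
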
23321 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23322 is set, expect that the insn has an 8-bit immediate alternative. */
23323 int
23324 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23325 {
23326 int len = 0;
23327 int i;
23328 extract_insn_cached (insn);
23329 for (i = recog_data.n_operands - 1; i >= 0; --i)
23330 if (CONSTANT_P (recog_data.operand[i]))
23331 {
23332 enum attr_mode mode = get_attr_mode (insn);
23333
23334 gcc_assert (!len);
23335 if (shortform && CONST_INT_P (recog_data.operand[i]))
23336 {
23337 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23338 switch (mode)
23339 {
23340 case MODE_QI:
23341 len = 1;
23342 continue;
23343 case MODE_HI:
23344 ival = trunc_int_for_mode (ival, HImode);
23345 break;
23346 case MODE_SI:
23347 ival = trunc_int_for_mode (ival, SImode);
23348 break;
23349 default:
23350 break;
23351 }
23352 if (IN_RANGE (ival, -128, 127))
23353 {
23354 len = 1;
23355 continue;
23356 }
23357 }
23358 switch (mode)
23359 {
23360 case MODE_QI:
23361 len = 1;
23362 break;
23363 case MODE_HI:
23364 len = 2;
23365 break;
23366 case MODE_SI:
23367 len = 4;
23368 break;
23369 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23370 case MODE_DI:
23371 len = 4;
23372 break;
23373 default:
23374 fatal_insn ("unknown insn mode", insn);
23375 }
23376 }
23377 return len;
23378 }
23379 /* Compute default value for "length_address" attribute. */
23380 int
23381 ix86_attr_length_address_default (rtx insn)
23382 {
23383 int i;
23384
23385 if (get_attr_type (insn) == TYPE_LEA)
23386 {
23387 rtx set = PATTERN (insn), addr;
23388
23389 if (GET_CODE (set) == PARALLEL)
23390 set = XVECEXP (set, 0, 0);
23391
23392 gcc_assert (GET_CODE (set) == SET);
23393
23394 addr = SET_SRC (set);
23395 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23396 {
23397 if (GET_CODE (addr) == ZERO_EXTEND)
23398 addr = XEXP (addr, 0);
23399 if (GET_CODE (addr) == SUBREG)
23400 addr = SUBREG_REG (addr);
23401 }
23402
23403 return memory_address_length (addr);
23404 }
23405
23406 extract_insn_cached (insn);
23407 for (i = recog_data.n_operands - 1; i >= 0; --i)
23408 if (MEM_P (recog_data.operand[i]))
23409 {
23410 constrain_operands_cached (reload_completed);
23411 if (which_alternative != -1)
23412 {
23413 const char *constraints = recog_data.constraints[i];
23414 int alt = which_alternative;
23415
23416 while (*constraints == '=' || *constraints == '+')
23417 constraints++;
23418 while (alt-- > 0)
23419 while (*constraints++ != ',')
23420 ;
23421 /* Skip ignored operands. */
23422 if (*constraints == 'X')
23423 continue;
23424 }
23425 return memory_address_length (XEXP (recog_data.operand[i], 0));
23426 }
23427 return 0;
23428 }
23429
23430 /* Compute default value for "length_vex" attribute. It includes
23431 2 or 3 byte VEX prefix and 1 opcode byte. */
23432
23433 int
23434 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23435 {
23436 int i;
23437
23438 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
23439 requires the 3-byte VEX prefix. */
23440 if (!has_0f_opcode || has_vex_w)
23441 return 3 + 1;
23442
23443 /* We can always use 2 byte VEX prefix in 32bit. */
23444 if (!TARGET_64BIT)
23445 return 2 + 1;
23446
23447 extract_insn_cached (insn);
23448
23449 for (i = recog_data.n_operands - 1; i >= 0; --i)
23450 if (REG_P (recog_data.operand[i]))
23451 {
23452 /* REX.W bit uses 3 byte VEX prefix. */
23453 if (GET_MODE (recog_data.operand[i]) == DImode
23454 && GENERAL_REG_P (recog_data.operand[i]))
23455 return 3 + 1;
23456 }
23457 else
23458 {
23459 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23460 if (MEM_P (recog_data.operand[i])
23461 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23462 return 3 + 1;
23463 }
23464
23465 return 2 + 1;
23466 }
23467 \f
23468 /* Return the maximum number of instructions a cpu can issue. */
23469
23470 static int
23471 ix86_issue_rate (void)
23472 {
23473 switch (ix86_tune)
23474 {
23475 case PROCESSOR_PENTIUM:
23476 case PROCESSOR_ATOM:
23477 case PROCESSOR_K6:
23478 return 2;
23479
23480 case PROCESSOR_PENTIUMPRO:
23481 case PROCESSOR_PENTIUM4:
23482 case PROCESSOR_CORE2_32:
23483 case PROCESSOR_CORE2_64:
23484 case PROCESSOR_COREI7_32:
23485 case PROCESSOR_COREI7_64:
23486 case PROCESSOR_ATHLON:
23487 case PROCESSOR_K8:
23488 case PROCESSOR_AMDFAM10:
23489 case PROCESSOR_NOCONA:
23490 case PROCESSOR_GENERIC32:
23491 case PROCESSOR_GENERIC64:
23492 case PROCESSOR_BDVER1:
23493 case PROCESSOR_BDVER2:
23494 case PROCESSOR_BTVER1:
23495 return 3;
23496
23497 default:
23498 return 1;
23499 }
23500 }
23501
23502 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23503 set by DEP_INSN and nothing else set by DEP_INSN. */
23504
23505 static bool
23506 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23507 {
23508 rtx set, set2;
23509
23510 /* Simplify the test for uninteresting insns. */
23511 if (insn_type != TYPE_SETCC
23512 && insn_type != TYPE_ICMOV
23513 && insn_type != TYPE_FCMOV
23514 && insn_type != TYPE_IBR)
23515 return false;
23516
23517 if ((set = single_set (dep_insn)) != 0)
23518 {
23519 set = SET_DEST (set);
23520 set2 = NULL_RTX;
23521 }
23522 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23523 && XVECLEN (PATTERN (dep_insn), 0) == 2
23524 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23525 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23526 {
23527 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23528 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23529 }
23530 else
23531 return false;
23532
23533 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23534 return false;
23535
23536 /* This test is true if the dependent insn reads the flags but
23537 not any other potentially set register. */
23538 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23539 return false;
23540
23541 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23542 return false;
23543
23544 return true;
23545 }
23546
23547 /* Return true iff USE_INSN has a memory address with operands set by
23548 SET_INSN. */
23549
23550 bool
23551 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23552 {
23553 int i;
23554 extract_insn_cached (use_insn);
23555 for (i = recog_data.n_operands - 1; i >= 0; --i)
23556 if (MEM_P (recog_data.operand[i]))
23557 {
23558 rtx addr = XEXP (recog_data.operand[i], 0);
23559 return modified_in_p (addr, set_insn) != 0;
23560 }
23561 return false;
23562 }
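
/* For example, on the original Pentium the address-generation stage needs
   its inputs one cycle early, so a sequence like

     addl $4, %eax
     movl (%eax), %edx

   hits an address generation interlock; ix86_adjust_cost below adds one
   cycle to the dependence cost when ix86_agi_dependent detects this.  */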
23563
23564 static int
23565 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23566 {
23567 enum attr_type insn_type, dep_insn_type;
23568 enum attr_memory memory;
23569 rtx set, set2;
23570 int dep_insn_code_number;
23571
23572 /* Anti and output dependencies have zero cost on all CPUs. */
23573 if (REG_NOTE_KIND (link) != 0)
23574 return 0;
23575
23576 dep_insn_code_number = recog_memoized (dep_insn);
23577
23578 /* If we can't recognize the insns, we can't really do anything. */
23579 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23580 return cost;
23581
23582 insn_type = get_attr_type (insn);
23583 dep_insn_type = get_attr_type (dep_insn);
23584
23585 switch (ix86_tune)
23586 {
23587 case PROCESSOR_PENTIUM:
23588 /* Address Generation Interlock adds a cycle of latency. */
23589 if (insn_type == TYPE_LEA)
23590 {
23591 rtx addr = PATTERN (insn);
23592
23593 if (GET_CODE (addr) == PARALLEL)
23594 addr = XVECEXP (addr, 0, 0);
23595
23596 gcc_assert (GET_CODE (addr) == SET);
23597
23598 addr = SET_SRC (addr);
23599 if (modified_in_p (addr, dep_insn))
23600 cost += 1;
23601 }
23602 else if (ix86_agi_dependent (dep_insn, insn))
23603 cost += 1;
23604
23605 /* ??? Compares pair with jump/setcc. */
23606 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23607 cost = 0;
23608
23609 /* Floating point stores require value to be ready one cycle earlier. */
23610 if (insn_type == TYPE_FMOV
23611 && get_attr_memory (insn) == MEMORY_STORE
23612 && !ix86_agi_dependent (dep_insn, insn))
23613 cost += 1;
23614 break;
23615
23616 case PROCESSOR_PENTIUMPRO:
23617 memory = get_attr_memory (insn);
23618
23619 /* INT->FP conversion is expensive. */
23620 if (get_attr_fp_int_src (dep_insn))
23621 cost += 5;
23622
23623 /* There is one cycle extra latency between an FP op and a store. */
23624 if (insn_type == TYPE_FMOV
23625 && (set = single_set (dep_insn)) != NULL_RTX
23626 && (set2 = single_set (insn)) != NULL_RTX
23627 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23628 && MEM_P (SET_DEST (set2)))
23629 cost += 1;
23630
23631 /* Show the ability of the reorder buffer to hide the latency of a load
23632 by executing it in parallel with the previous instruction when the
23633 previous instruction is not needed to compute the address. */
23634 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23635 && !ix86_agi_dependent (dep_insn, insn))
23636 {
23637 /* Claim moves to take one cycle, as the core can issue one load
23638 at a time and the next load can start a cycle later. */
23639 if (dep_insn_type == TYPE_IMOV
23640 || dep_insn_type == TYPE_FMOV)
23641 cost = 1;
23642 else if (cost > 1)
23643 cost--;
23644 }
23645 break;
23646
23647 case PROCESSOR_K6:
23648 memory = get_attr_memory (insn);
23649
23650 /* The esp dependency is resolved before the instruction is really
23651 finished. */
23652 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23653 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23654 return 1;
23655
23656 /* INT->FP conversion is expensive. */
23657 if (get_attr_fp_int_src (dep_insn))
23658 cost += 5;
23659
23660 /* Show the ability of the reorder buffer to hide the latency of a load
23661 by executing it in parallel with the previous instruction when the
23662 previous instruction is not needed to compute the address. */
23663 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23664 && !ix86_agi_dependent (dep_insn, insn))
23665 {
23666 /* Claim moves to take one cycle, as the core can issue one load
23667 at a time and the next load can start a cycle later. */
23668 if (dep_insn_type == TYPE_IMOV
23669 || dep_insn_type == TYPE_FMOV)
23670 cost = 1;
23671 else if (cost > 2)
23672 cost -= 2;
23673 else
23674 cost = 1;
23675 }
23676 break;
23677
23678 case PROCESSOR_ATHLON:
23679 case PROCESSOR_K8:
23680 case PROCESSOR_AMDFAM10:
23681 case PROCESSOR_BDVER1:
23682 case PROCESSOR_BDVER2:
23683 case PROCESSOR_BTVER1:
23684 case PROCESSOR_ATOM:
23685 case PROCESSOR_GENERIC32:
23686 case PROCESSOR_GENERIC64:
23687 memory = get_attr_memory (insn);
23688
23689 /* Show the ability of the reorder buffer to hide the latency of a load
23690 by executing it in parallel with the previous instruction when the
23691 previous instruction is not needed to compute the address. */
23692 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23693 && !ix86_agi_dependent (dep_insn, insn))
23694 {
23695 enum attr_unit unit = get_attr_unit (insn);
23696 int loadcost = 3;
23697
23698 /* Because of the difference between the length of integer and
23699 floating unit pipeline preparation stages, the memory operands
23700 for floating point are cheaper.
23701
23702 ??? For Athlon the difference is most probably 2. */
23703 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23704 loadcost = 3;
23705 else
23706 loadcost = TARGET_ATHLON ? 2 : 0;
23707
23708 if (cost >= loadcost)
23709 cost -= loadcost;
23710 else
23711 cost = 0;
23712 }
23713
23714 default:
23715 break;
23716 }
23717
23718 return cost;
23719 }
23720
23721 /* How many alternative schedules to try. This should be as wide as the
23722 scheduling freedom in the DFA, but no wider. Making this value too
23723 large results in extra work for the scheduler. */
23724
23725 static int
23726 ia32_multipass_dfa_lookahead (void)
23727 {
23728 switch (ix86_tune)
23729 {
23730 case PROCESSOR_PENTIUM:
23731 return 2;
23732
23733 case PROCESSOR_PENTIUMPRO:
23734 case PROCESSOR_K6:
23735 return 1;
23736
23737 case PROCESSOR_CORE2_32:
23738 case PROCESSOR_CORE2_64:
23739 case PROCESSOR_COREI7_32:
23740 case PROCESSOR_COREI7_64:
23741 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
23742 the number of instructions that can be issued on a cycle, i.e.,
23743 issue_rate. I wonder why tuning for many CPUs does not do this. */
23744 return ix86_issue_rate ();
23745
23746 default:
23747 return 0;
23748 }
23749 }
23750
23751 \f
23752
23753 /* Model decoder of Core 2/i7.
23754 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23755 track the instruction fetch block boundaries and make sure that long
23756 (9+ bytes) instructions are assigned to D0. */
23757
23758 /* Maximum length of an insn that can be handled by
23759 a secondary decoder unit. '8' for Core 2/i7. */
23760 static int core2i7_secondary_decoder_max_insn_size;
23761
23762 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23763 '16' for Core 2/i7. */
23764 static int core2i7_ifetch_block_size;
23765
23766 /* Maximum number of instructions decoder can handle per cycle.
23767 '6' for Core 2/i7. */
23768 static int core2i7_ifetch_block_max_insns;
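/* Illustrative example of how these limits interact (with the values set
   in ix86_sched_init_global below): given a 16-byte ifetch block and at
   most 6 insns per cycle, a ready insn of size SZ is masked out by
   core2i7_first_cycle_multipass_filter_ready_try when it is not the first
   insn of the cycle and SZ > 8, when ifetch_block_len + SZ > 16, or when
   6 insns have already been picked for this cycle.  */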
23769
23770 typedef struct ix86_first_cycle_multipass_data_ *
23771 ix86_first_cycle_multipass_data_t;
23772 typedef const struct ix86_first_cycle_multipass_data_ *
23773 const_ix86_first_cycle_multipass_data_t;
23774
23775 /* A variable to store target state across calls to max_issue within
23776 one cycle. */
23777 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23778 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23779
23780 /* Initialize DATA. */
23781 static void
23782 core2i7_first_cycle_multipass_init (void *_data)
23783 {
23784 ix86_first_cycle_multipass_data_t data
23785 = (ix86_first_cycle_multipass_data_t) _data;
23786
23787 data->ifetch_block_len = 0;
23788 data->ifetch_block_n_insns = 0;
23789 data->ready_try_change = NULL;
23790 data->ready_try_change_size = 0;
23791 }
23792
23793 /* Advancing the cycle; reset ifetch block counts. */
23794 static void
23795 core2i7_dfa_post_advance_cycle (void)
23796 {
23797 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23798
23799 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23800
23801 data->ifetch_block_len = 0;
23802 data->ifetch_block_n_insns = 0;
23803 }
23804
23805 static int min_insn_size (rtx);
23806
23807 /* Filter out insns from ready_try that the core will not be able to issue
23808 on current cycle due to decoder. */
23809 static void
23810 core2i7_first_cycle_multipass_filter_ready_try
23811 (const_ix86_first_cycle_multipass_data_t data,
23812 char *ready_try, int n_ready, bool first_cycle_insn_p)
23813 {
23814 while (n_ready--)
23815 {
23816 rtx insn;
23817 int insn_size;
23818
23819 if (ready_try[n_ready])
23820 continue;
23821
23822 insn = get_ready_element (n_ready);
23823 insn_size = min_insn_size (insn);
23824
23825 if (/* If this is too long an insn for a secondary decoder ... */
23826 (!first_cycle_insn_p
23827 && insn_size > core2i7_secondary_decoder_max_insn_size)
23828 /* ... or it would not fit into the ifetch block ... */
23829 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23830 /* ... or the decoder is full already ... */
23831 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23832 /* ... mask the insn out. */
23833 {
23834 ready_try[n_ready] = 1;
23835
23836 if (data->ready_try_change)
23837 SET_BIT (data->ready_try_change, n_ready);
23838 }
23839 }
23840 }
23841
23842 /* Prepare for a new round of multipass lookahead scheduling. */
23843 static void
23844 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23845 bool first_cycle_insn_p)
23846 {
23847 ix86_first_cycle_multipass_data_t data
23848 = (ix86_first_cycle_multipass_data_t) _data;
23849 const_ix86_first_cycle_multipass_data_t prev_data
23850 = ix86_first_cycle_multipass_data;
23851
23852 /* Restore the state from the end of the previous round. */
23853 data->ifetch_block_len = prev_data->ifetch_block_len;
23854 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23855
23856 /* Filter instructions that cannot be issued on current cycle due to
23857 decoder restrictions. */
23858 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23859 first_cycle_insn_p);
23860 }
23861
23862 /* INSN is being issued in current solution. Account for its impact on
23863 the decoder model. */
23864 static void
23865 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23866 rtx insn, const void *_prev_data)
23867 {
23868 ix86_first_cycle_multipass_data_t data
23869 = (ix86_first_cycle_multipass_data_t) _data;
23870 const_ix86_first_cycle_multipass_data_t prev_data
23871 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23872
23873 int insn_size = min_insn_size (insn);
23874
23875 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23876 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23877 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23878 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23879
23880 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23881 if (!data->ready_try_change)
23882 {
23883 data->ready_try_change = sbitmap_alloc (n_ready);
23884 data->ready_try_change_size = n_ready;
23885 }
23886 else if (data->ready_try_change_size < n_ready)
23887 {
23888 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23889 n_ready, 0);
23890 data->ready_try_change_size = n_ready;
23891 }
23892 sbitmap_zero (data->ready_try_change);
23893
23894 /* Filter out insns from ready_try that the core will not be able to issue
23895 on current cycle due to decoder. */
23896 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23897 false);
23898 }
23899
23900 /* Revert the effect on ready_try. */
23901 static void
23902 core2i7_first_cycle_multipass_backtrack (const void *_data,
23903 char *ready_try,
23904 int n_ready ATTRIBUTE_UNUSED)
23905 {
23906 const_ix86_first_cycle_multipass_data_t data
23907 = (const_ix86_first_cycle_multipass_data_t) _data;
23908 unsigned int i = 0;
23909 sbitmap_iterator sbi;
23910
23911 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23912 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23913 {
23914 ready_try[i] = 0;
23915 }
23916 }
23917
23918 /* Save the result of multipass lookahead scheduling for the next round. */
23919 static void
23920 core2i7_first_cycle_multipass_end (const void *_data)
23921 {
23922 const_ix86_first_cycle_multipass_data_t data
23923 = (const_ix86_first_cycle_multipass_data_t) _data;
23924 ix86_first_cycle_multipass_data_t next_data
23925 = ix86_first_cycle_multipass_data;
23926
23927 if (data != NULL)
23928 {
23929 next_data->ifetch_block_len = data->ifetch_block_len;
23930 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23931 }
23932 }
23933
23934 /* Deallocate target data. */
23935 static void
23936 core2i7_first_cycle_multipass_fini (void *_data)
23937 {
23938 ix86_first_cycle_multipass_data_t data
23939 = (ix86_first_cycle_multipass_data_t) _data;
23940
23941 if (data->ready_try_change)
23942 {
23943 sbitmap_free (data->ready_try_change);
23944 data->ready_try_change = NULL;
23945 data->ready_try_change_size = 0;
23946 }
23947 }
23948
23949 /* Prepare for scheduling pass. */
23950 static void
23951 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23952 int verbose ATTRIBUTE_UNUSED,
23953 int max_uid ATTRIBUTE_UNUSED)
23954 {
23955 /* Install scheduling hooks for current CPU. Some of these hooks are used
23956 in time-critical parts of the scheduler, so we only set them up when
23957 they are actually used. */
23958 switch (ix86_tune)
23959 {
23960 case PROCESSOR_CORE2_32:
23961 case PROCESSOR_CORE2_64:
23962 case PROCESSOR_COREI7_32:
23963 case PROCESSOR_COREI7_64:
23964 targetm.sched.dfa_post_advance_cycle
23965 = core2i7_dfa_post_advance_cycle;
23966 targetm.sched.first_cycle_multipass_init
23967 = core2i7_first_cycle_multipass_init;
23968 targetm.sched.first_cycle_multipass_begin
23969 = core2i7_first_cycle_multipass_begin;
23970 targetm.sched.first_cycle_multipass_issue
23971 = core2i7_first_cycle_multipass_issue;
23972 targetm.sched.first_cycle_multipass_backtrack
23973 = core2i7_first_cycle_multipass_backtrack;
23974 targetm.sched.first_cycle_multipass_end
23975 = core2i7_first_cycle_multipass_end;
23976 targetm.sched.first_cycle_multipass_fini
23977 = core2i7_first_cycle_multipass_fini;
23978
23979 /* Set decoder parameters. */
23980 core2i7_secondary_decoder_max_insn_size = 8;
23981 core2i7_ifetch_block_size = 16;
23982 core2i7_ifetch_block_max_insns = 6;
23983 break;
23984
23985 default:
23986 targetm.sched.dfa_post_advance_cycle = NULL;
23987 targetm.sched.first_cycle_multipass_init = NULL;
23988 targetm.sched.first_cycle_multipass_begin = NULL;
23989 targetm.sched.first_cycle_multipass_issue = NULL;
23990 targetm.sched.first_cycle_multipass_backtrack = NULL;
23991 targetm.sched.first_cycle_multipass_end = NULL;
23992 targetm.sched.first_cycle_multipass_fini = NULL;
23993 break;
23994 }
23995 }
23996
23997 \f
23998 /* Compute the alignment given to a constant that is being placed in memory.
23999 EXP is the constant and ALIGN is the alignment that the object would
24000 ordinarily have.
24001 The value of this function is used instead of that alignment to align
24002 the object. */
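/* For example, a double (DFmode) constant is given 64-bit alignment, a
   constant whose mode satisfies ALIGN_MODE_128 gets 128-bit alignment,
   and, unless optimizing for size, a string literal of 31 or more
   characters is aligned to a word boundary.  */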
24003
24004 int
24005 ix86_constant_alignment (tree exp, int align)
24006 {
24007 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24008 || TREE_CODE (exp) == INTEGER_CST)
24009 {
24010 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24011 return 64;
24012 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24013 return 128;
24014 }
24015 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24016 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24017 return BITS_PER_WORD;
24018
24019 return align;
24020 }
24021
24022 /* Compute the alignment for a static variable.
24023 TYPE is the data type, and ALIGN is the alignment that
24024 the object would ordinarily have. The value of this function is used
24025 instead of that alignment to align the object. */
24026
24027 int
24028 ix86_data_alignment (tree type, int align)
24029 {
24030 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24031
24032 if (AGGREGATE_TYPE_P (type)
24033 && TYPE_SIZE (type)
24034 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24035 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24036 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24037 && align < max_align)
24038 align = max_align;
24039
24040 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
24041 to a 16-byte boundary. */
24042 if (TARGET_64BIT)
24043 {
24044 if (AGGREGATE_TYPE_P (type)
24045 && TYPE_SIZE (type)
24046 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24047 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24048 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24049 return 128;
24050 }
24051
24052 if (TREE_CODE (type) == ARRAY_TYPE)
24053 {
24054 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24055 return 64;
24056 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24057 return 128;
24058 }
24059 else if (TREE_CODE (type) == COMPLEX_TYPE)
24060 {
24061
24062 if (TYPE_MODE (type) == DCmode && align < 64)
24063 return 64;
24064 if ((TYPE_MODE (type) == XCmode
24065 || TYPE_MODE (type) == TCmode) && align < 128)
24066 return 128;
24067 }
24068 else if ((TREE_CODE (type) == RECORD_TYPE
24069 || TREE_CODE (type) == UNION_TYPE
24070 || TREE_CODE (type) == QUAL_UNION_TYPE)
24071 && TYPE_FIELDS (type))
24072 {
24073 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24074 return 64;
24075 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24076 return 128;
24077 }
24078 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24079 || TREE_CODE (type) == INTEGER_TYPE)
24080 {
24081 if (TYPE_MODE (type) == DFmode && align < 64)
24082 return 64;
24083 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24084 return 128;
24085 }
24086
24087 return align;
24088 }
24089
24090 /* Compute the alignment for a local variable or a stack slot. EXP is
24091 the data type or decl itself, MODE is the widest mode available and
24092 ALIGN is the alignment that the object would ordinarily have. The
24093 value of this macro is used instead of that alignment to align the
24094 object. */
24095
24096 unsigned int
24097 ix86_local_alignment (tree exp, enum machine_mode mode,
24098 unsigned int align)
24099 {
24100 tree type, decl;
24101
24102 if (exp && DECL_P (exp))
24103 {
24104 type = TREE_TYPE (exp);
24105 decl = exp;
24106 }
24107 else
24108 {
24109 type = exp;
24110 decl = NULL;
24111 }
24112
24113 /* Don't do dynamic stack realignment for long long objects with
24114 -mpreferred-stack-boundary=2. */
24115 if (!TARGET_64BIT
24116 && align == 64
24117 && ix86_preferred_stack_boundary < 64
24118 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24119 && (!type || !TYPE_USER_ALIGN (type))
24120 && (!decl || !DECL_USER_ALIGN (decl)))
24121 align = 32;
24122
24123 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24124 register in MODE. We will return the largest alignment of XF
24125 and DF. */
24126 if (!type)
24127 {
24128 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24129 align = GET_MODE_ALIGNMENT (DFmode);
24130 return align;
24131 }
24132
24133 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
24134 to a 16-byte boundary. Exact wording is:
24135
24136 An array uses the same alignment as its elements, except that a local or
24137 global array variable of length at least 16 bytes or
24138 a C99 variable-length array variable always has alignment of at least 16 bytes.
24139
24140 This was added to allow use of aligned SSE instructions on arrays. This
24141 rule is meant for static storage (where the compiler cannot do the analysis
24142 by itself). We follow it for automatic variables only when convenient.
24143 We fully control everything in the function being compiled, and functions
24144 from other units cannot rely on the alignment.
24145
24146 Exclude the va_list type. It is the common case of a local array where
24147 we cannot benefit from the alignment. */
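/* For instance, when compiling 64-bit code with SSE and optimizing for
   speed, a local "double buf[4]" (32 bytes) is given 128-bit alignment
   here so aligned SSE loads and stores can be used on it, while a
   va_list object keeps its default alignment.  */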
24148 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24149 && TARGET_SSE)
24150 {
24151 if (AGGREGATE_TYPE_P (type)
24152 && (va_list_type_node == NULL_TREE
24153 || (TYPE_MAIN_VARIANT (type)
24154 != TYPE_MAIN_VARIANT (va_list_type_node)))
24155 && TYPE_SIZE (type)
24156 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24157 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24158 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24159 return 128;
24160 }
24161 if (TREE_CODE (type) == ARRAY_TYPE)
24162 {
24163 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24164 return 64;
24165 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24166 return 128;
24167 }
24168 else if (TREE_CODE (type) == COMPLEX_TYPE)
24169 {
24170 if (TYPE_MODE (type) == DCmode && align < 64)
24171 return 64;
24172 if ((TYPE_MODE (type) == XCmode
24173 || TYPE_MODE (type) == TCmode) && align < 128)
24174 return 128;
24175 }
24176 else if ((TREE_CODE (type) == RECORD_TYPE
24177 || TREE_CODE (type) == UNION_TYPE
24178 || TREE_CODE (type) == QUAL_UNION_TYPE)
24179 && TYPE_FIELDS (type))
24180 {
24181 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24182 return 64;
24183 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24184 return 128;
24185 }
24186 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24187 || TREE_CODE (type) == INTEGER_TYPE)
24188 {
24189
24190 if (TYPE_MODE (type) == DFmode && align < 64)
24191 return 64;
24192 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24193 return 128;
24194 }
24195 return align;
24196 }
24197
24198 /* Compute the minimum required alignment for dynamic stack realignment
24199 purposes for a local variable, parameter or a stack slot. EXP is
24200 the data type or decl itself, MODE is its mode and ALIGN is the
24201 alignment that the object would ordinarily have. */
24202
24203 unsigned int
24204 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24205 unsigned int align)
24206 {
24207 tree type, decl;
24208
24209 if (exp && DECL_P (exp))
24210 {
24211 type = TREE_TYPE (exp);
24212 decl = exp;
24213 }
24214 else
24215 {
24216 type = exp;
24217 decl = NULL;
24218 }
24219
24220 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24221 return align;
24222
24223 /* Don't do dynamic stack realignment for long long objects with
24224 -mpreferred-stack-boundary=2. */
24225 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24226 && (!type || !TYPE_USER_ALIGN (type))
24227 && (!decl || !DECL_USER_ALIGN (decl)))
24228 return 32;
24229
24230 return align;
24231 }
24232 \f
24233 /* Find a location for the static chain incoming to a nested function.
24234 This is a register, unless all free registers are used by arguments. */
24235
24236 static rtx
24237 ix86_static_chain (const_tree fndecl, bool incoming_p)
24238 {
24239 unsigned regno;
24240
24241 if (!DECL_STATIC_CHAIN (fndecl))
24242 return NULL;
24243
24244 if (TARGET_64BIT)
24245 {
24246 /* We always use R10 in 64-bit mode. */
24247 regno = R10_REG;
24248 }
24249 else
24250 {
24251 tree fntype;
24252 unsigned int ccvt;
24253
24254 /* By default in 32-bit mode we use ECX to pass the static chain. */
24255 regno = CX_REG;
24256
24257 fntype = TREE_TYPE (fndecl);
24258 ccvt = ix86_get_callcvt (fntype);
24259 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24260 {
24261 /* Fastcall functions use ecx/edx for arguments, which leaves
24262 us with EAX for the static chain.
24263 Thiscall functions use ecx for arguments, which also
24264 leaves us with EAX for the static chain. */
24265 regno = AX_REG;
24266 }
24267 else if (ix86_function_regparm (fntype, fndecl) == 3)
24268 {
24269 /* For regparm 3, we have no free call-clobbered registers in
24270 which to store the static chain. In order to implement this,
24271 we have the trampoline push the static chain to the stack.
24272 However, we can't push a value below the return address when
24273 we call the nested function directly, so we have to use an
24274 alternate entry point. For this we use ESI, and have the
24275 alternate entry point push ESI, so that things appear the
24276 same once we're executing the nested function. */
24277 if (incoming_p)
24278 {
24279 if (fndecl == current_function_decl)
24280 ix86_static_chain_on_stack = true;
24281 return gen_frame_mem (SImode,
24282 plus_constant (arg_pointer_rtx, -8));
24283 }
24284 regno = SI_REG;
24285 }
24286 }
24287
24288 return gen_rtx_REG (Pmode, regno);
24289 }
24290
24291 /* Emit RTL insns to initialize the variable parts of a trampoline.
24292 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24293 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24294 to be passed to the target function. */
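/* For reference, the 64-bit trampoline emitted below is, byte for byte:

     49 bb <imm64>   movabs $fnaddr, %r11   (41 bb <imm32> when ptr_mode == SImode)
     49 ba <imm64>   movabs $chain,  %r10   (41 ba <imm32> when ptr_mode == SImode)
     49 ff e3        jmp    *%r11
     90              nop                    (pads the final 32-bit store)

   The 32-bit variant is a mov-immediate (opcode b8/b9) or push-immediate
   (opcode 68) of the static chain, followed by "e9 <rel32>", a direct jmp
   to the target function.  */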
24295
24296 static void
24297 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24298 {
24299 rtx mem, fnaddr;
24300 int opcode;
24301 int offset = 0;
24302
24303 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24304
24305 if (TARGET_64BIT)
24306 {
24307 int size;
24308
24309 /* Load the function address into r11. Try to load the address
24310 using the shorter movl instead of movabs. We may want to support
24311 movq for kernel mode, but the kernel does not use trampolines at
24312 the moment. FNADDR is a 32-bit address and may not be in
24313 DImode when ptr_mode == SImode. Always use movl in this
24314 case. */
24315 if (ptr_mode == SImode
24316 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24317 {
24318 fnaddr = copy_to_mode_reg (Pmode, fnaddr);
24319
24320 mem = adjust_address (m_tramp, HImode, offset);
24321 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24322
24323 mem = adjust_address (m_tramp, SImode, offset + 2);
24324 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24325 offset += 6;
24326 }
24327 else
24328 {
24329 mem = adjust_address (m_tramp, HImode, offset);
24330 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24331
24332 mem = adjust_address (m_tramp, DImode, offset + 2);
24333 emit_move_insn (mem, fnaddr);
24334 offset += 10;
24335 }
24336
24337 /* Load static chain using movabs to r10. Use the shorter movl
24338 instead of movabs when ptr_mode == SImode. */
24339 if (ptr_mode == SImode)
24340 {
24341 opcode = 0xba41;
24342 size = 6;
24343 }
24344 else
24345 {
24346 opcode = 0xba49;
24347 size = 10;
24348 }
24349
24350 mem = adjust_address (m_tramp, HImode, offset);
24351 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24352
24353 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24354 emit_move_insn (mem, chain_value);
24355 offset += size;
24356
24357 /* Jump to r11; the last (unused) byte is a nop, only there to
24358 pad the write out to a single 32-bit store. */
24359 mem = adjust_address (m_tramp, SImode, offset);
24360 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24361 offset += 4;
24362 }
24363 else
24364 {
24365 rtx disp, chain;
24366
24367 /* Depending on the static chain location, either load a register
24368 with a constant, or push the constant to the stack. All of the
24369 instructions are the same size. */
24370 chain = ix86_static_chain (fndecl, true);
24371 if (REG_P (chain))
24372 {
24373 switch (REGNO (chain))
24374 {
24375 case AX_REG:
24376 opcode = 0xb8; break;
24377 case CX_REG:
24378 opcode = 0xb9; break;
24379 default:
24380 gcc_unreachable ();
24381 }
24382 }
24383 else
24384 opcode = 0x68;
24385
24386 mem = adjust_address (m_tramp, QImode, offset);
24387 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24388
24389 mem = adjust_address (m_tramp, SImode, offset + 1);
24390 emit_move_insn (mem, chain_value);
24391 offset += 5;
24392
24393 mem = adjust_address (m_tramp, QImode, offset);
24394 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24395
24396 mem = adjust_address (m_tramp, SImode, offset + 1);
24397
24398 /* Compute offset from the end of the jmp to the target function.
24399 In the case in which the trampoline stores the static chain on
24400 the stack, we need to skip the first insn which pushes the
24401 (call-saved) register static chain; this push is 1 byte. */
24402 offset += 5;
24403 disp = expand_binop (SImode, sub_optab, fnaddr,
24404 plus_constant (XEXP (m_tramp, 0),
24405 offset - (MEM_P (chain) ? 1 : 0)),
24406 NULL_RTX, 1, OPTAB_DIRECT);
24407 emit_move_insn (mem, disp);
24408 }
24409
24410 gcc_assert (offset <= TRAMPOLINE_SIZE);
24411
24412 #ifdef HAVE_ENABLE_EXECUTE_STACK
24413 #ifdef CHECK_EXECUTE_STACK_ENABLED
24414 if (CHECK_EXECUTE_STACK_ENABLED)
24415 #endif
24416 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24417 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24418 #endif
24419 }
24420 \f
24421 /* The following file contains several enumerations and data structures
24422 built from the definitions in i386-builtin-types.def. */
24423
24424 #include "i386-builtin-types.inc"
24425
24426 /* Table for the ix86 builtin non-function types. */
24427 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24428
24429 /* Retrieve an element from the above table, building some of
24430 the types lazily. */
24431
24432 static tree
24433 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24434 {
24435 unsigned int index;
24436 tree type, itype;
24437
24438 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24439
24440 type = ix86_builtin_type_tab[(int) tcode];
24441 if (type != NULL)
24442 return type;
24443
24444 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24445 if (tcode <= IX86_BT_LAST_VECT)
24446 {
24447 enum machine_mode mode;
24448
24449 index = tcode - IX86_BT_LAST_PRIM - 1;
24450 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24451 mode = ix86_builtin_type_vect_mode[index];
24452
24453 type = build_vector_type_for_mode (itype, mode);
24454 }
24455 else
24456 {
24457 int quals;
24458
24459 index = tcode - IX86_BT_LAST_VECT - 1;
24460 if (tcode <= IX86_BT_LAST_PTR)
24461 quals = TYPE_UNQUALIFIED;
24462 else
24463 quals = TYPE_QUAL_CONST;
24464
24465 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24466 if (quals != TYPE_UNQUALIFIED)
24467 itype = build_qualified_type (itype, quals);
24468
24469 type = build_pointer_type (itype);
24470 }
24471
24472 ix86_builtin_type_tab[(int) tcode] = type;
24473 return type;
24474 }
24475
24476 /* Table for the ix86 builtin function types. */
24477 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24478
24479 /* Retrieve an element from the above table, building some of
24480 the types lazily. */
24481
24482 static tree
24483 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24484 {
24485 tree type;
24486
24487 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24488
24489 type = ix86_builtin_func_type_tab[(int) tcode];
24490 if (type != NULL)
24491 return type;
24492
24493 if (tcode <= IX86_BT_LAST_FUNC)
24494 {
24495 unsigned start = ix86_builtin_func_start[(int) tcode];
24496 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24497 tree rtype, atype, args = void_list_node;
24498 unsigned i;
24499
24500 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24501 for (i = after - 1; i > start; --i)
24502 {
24503 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24504 args = tree_cons (NULL, atype, args);
24505 }
24506
24507 type = build_function_type (rtype, args);
24508 }
24509 else
24510 {
24511 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24512 enum ix86_builtin_func_type icode;
24513
24514 icode = ix86_builtin_func_alias_base[index];
24515 type = ix86_get_builtin_func_type (icode);
24516 }
24517
24518 ix86_builtin_func_type_tab[(int) tcode] = type;
24519 return type;
24520 }
24521
24522
24523 /* Codes for all the SSE/MMX builtins. */
24524 enum ix86_builtins
24525 {
24526 IX86_BUILTIN_ADDPS,
24527 IX86_BUILTIN_ADDSS,
24528 IX86_BUILTIN_DIVPS,
24529 IX86_BUILTIN_DIVSS,
24530 IX86_BUILTIN_MULPS,
24531 IX86_BUILTIN_MULSS,
24532 IX86_BUILTIN_SUBPS,
24533 IX86_BUILTIN_SUBSS,
24534
24535 IX86_BUILTIN_CMPEQPS,
24536 IX86_BUILTIN_CMPLTPS,
24537 IX86_BUILTIN_CMPLEPS,
24538 IX86_BUILTIN_CMPGTPS,
24539 IX86_BUILTIN_CMPGEPS,
24540 IX86_BUILTIN_CMPNEQPS,
24541 IX86_BUILTIN_CMPNLTPS,
24542 IX86_BUILTIN_CMPNLEPS,
24543 IX86_BUILTIN_CMPNGTPS,
24544 IX86_BUILTIN_CMPNGEPS,
24545 IX86_BUILTIN_CMPORDPS,
24546 IX86_BUILTIN_CMPUNORDPS,
24547 IX86_BUILTIN_CMPEQSS,
24548 IX86_BUILTIN_CMPLTSS,
24549 IX86_BUILTIN_CMPLESS,
24550 IX86_BUILTIN_CMPNEQSS,
24551 IX86_BUILTIN_CMPNLTSS,
24552 IX86_BUILTIN_CMPNLESS,
24553 IX86_BUILTIN_CMPNGTSS,
24554 IX86_BUILTIN_CMPNGESS,
24555 IX86_BUILTIN_CMPORDSS,
24556 IX86_BUILTIN_CMPUNORDSS,
24557
24558 IX86_BUILTIN_COMIEQSS,
24559 IX86_BUILTIN_COMILTSS,
24560 IX86_BUILTIN_COMILESS,
24561 IX86_BUILTIN_COMIGTSS,
24562 IX86_BUILTIN_COMIGESS,
24563 IX86_BUILTIN_COMINEQSS,
24564 IX86_BUILTIN_UCOMIEQSS,
24565 IX86_BUILTIN_UCOMILTSS,
24566 IX86_BUILTIN_UCOMILESS,
24567 IX86_BUILTIN_UCOMIGTSS,
24568 IX86_BUILTIN_UCOMIGESS,
24569 IX86_BUILTIN_UCOMINEQSS,
24570
24571 IX86_BUILTIN_CVTPI2PS,
24572 IX86_BUILTIN_CVTPS2PI,
24573 IX86_BUILTIN_CVTSI2SS,
24574 IX86_BUILTIN_CVTSI642SS,
24575 IX86_BUILTIN_CVTSS2SI,
24576 IX86_BUILTIN_CVTSS2SI64,
24577 IX86_BUILTIN_CVTTPS2PI,
24578 IX86_BUILTIN_CVTTSS2SI,
24579 IX86_BUILTIN_CVTTSS2SI64,
24580
24581 IX86_BUILTIN_MAXPS,
24582 IX86_BUILTIN_MAXSS,
24583 IX86_BUILTIN_MINPS,
24584 IX86_BUILTIN_MINSS,
24585
24586 IX86_BUILTIN_LOADUPS,
24587 IX86_BUILTIN_STOREUPS,
24588 IX86_BUILTIN_MOVSS,
24589
24590 IX86_BUILTIN_MOVHLPS,
24591 IX86_BUILTIN_MOVLHPS,
24592 IX86_BUILTIN_LOADHPS,
24593 IX86_BUILTIN_LOADLPS,
24594 IX86_BUILTIN_STOREHPS,
24595 IX86_BUILTIN_STORELPS,
24596
24597 IX86_BUILTIN_MASKMOVQ,
24598 IX86_BUILTIN_MOVMSKPS,
24599 IX86_BUILTIN_PMOVMSKB,
24600
24601 IX86_BUILTIN_MOVNTPS,
24602 IX86_BUILTIN_MOVNTQ,
24603
24604 IX86_BUILTIN_LOADDQU,
24605 IX86_BUILTIN_STOREDQU,
24606
24607 IX86_BUILTIN_PACKSSWB,
24608 IX86_BUILTIN_PACKSSDW,
24609 IX86_BUILTIN_PACKUSWB,
24610
24611 IX86_BUILTIN_PADDB,
24612 IX86_BUILTIN_PADDW,
24613 IX86_BUILTIN_PADDD,
24614 IX86_BUILTIN_PADDQ,
24615 IX86_BUILTIN_PADDSB,
24616 IX86_BUILTIN_PADDSW,
24617 IX86_BUILTIN_PADDUSB,
24618 IX86_BUILTIN_PADDUSW,
24619 IX86_BUILTIN_PSUBB,
24620 IX86_BUILTIN_PSUBW,
24621 IX86_BUILTIN_PSUBD,
24622 IX86_BUILTIN_PSUBQ,
24623 IX86_BUILTIN_PSUBSB,
24624 IX86_BUILTIN_PSUBSW,
24625 IX86_BUILTIN_PSUBUSB,
24626 IX86_BUILTIN_PSUBUSW,
24627
24628 IX86_BUILTIN_PAND,
24629 IX86_BUILTIN_PANDN,
24630 IX86_BUILTIN_POR,
24631 IX86_BUILTIN_PXOR,
24632
24633 IX86_BUILTIN_PAVGB,
24634 IX86_BUILTIN_PAVGW,
24635
24636 IX86_BUILTIN_PCMPEQB,
24637 IX86_BUILTIN_PCMPEQW,
24638 IX86_BUILTIN_PCMPEQD,
24639 IX86_BUILTIN_PCMPGTB,
24640 IX86_BUILTIN_PCMPGTW,
24641 IX86_BUILTIN_PCMPGTD,
24642
24643 IX86_BUILTIN_PMADDWD,
24644
24645 IX86_BUILTIN_PMAXSW,
24646 IX86_BUILTIN_PMAXUB,
24647 IX86_BUILTIN_PMINSW,
24648 IX86_BUILTIN_PMINUB,
24649
24650 IX86_BUILTIN_PMULHUW,
24651 IX86_BUILTIN_PMULHW,
24652 IX86_BUILTIN_PMULLW,
24653
24654 IX86_BUILTIN_PSADBW,
24655 IX86_BUILTIN_PSHUFW,
24656
24657 IX86_BUILTIN_PSLLW,
24658 IX86_BUILTIN_PSLLD,
24659 IX86_BUILTIN_PSLLQ,
24660 IX86_BUILTIN_PSRAW,
24661 IX86_BUILTIN_PSRAD,
24662 IX86_BUILTIN_PSRLW,
24663 IX86_BUILTIN_PSRLD,
24664 IX86_BUILTIN_PSRLQ,
24665 IX86_BUILTIN_PSLLWI,
24666 IX86_BUILTIN_PSLLDI,
24667 IX86_BUILTIN_PSLLQI,
24668 IX86_BUILTIN_PSRAWI,
24669 IX86_BUILTIN_PSRADI,
24670 IX86_BUILTIN_PSRLWI,
24671 IX86_BUILTIN_PSRLDI,
24672 IX86_BUILTIN_PSRLQI,
24673
24674 IX86_BUILTIN_PUNPCKHBW,
24675 IX86_BUILTIN_PUNPCKHWD,
24676 IX86_BUILTIN_PUNPCKHDQ,
24677 IX86_BUILTIN_PUNPCKLBW,
24678 IX86_BUILTIN_PUNPCKLWD,
24679 IX86_BUILTIN_PUNPCKLDQ,
24680
24681 IX86_BUILTIN_SHUFPS,
24682
24683 IX86_BUILTIN_RCPPS,
24684 IX86_BUILTIN_RCPSS,
24685 IX86_BUILTIN_RSQRTPS,
24686 IX86_BUILTIN_RSQRTPS_NR,
24687 IX86_BUILTIN_RSQRTSS,
24688 IX86_BUILTIN_RSQRTF,
24689 IX86_BUILTIN_SQRTPS,
24690 IX86_BUILTIN_SQRTPS_NR,
24691 IX86_BUILTIN_SQRTSS,
24692
24693 IX86_BUILTIN_UNPCKHPS,
24694 IX86_BUILTIN_UNPCKLPS,
24695
24696 IX86_BUILTIN_ANDPS,
24697 IX86_BUILTIN_ANDNPS,
24698 IX86_BUILTIN_ORPS,
24699 IX86_BUILTIN_XORPS,
24700
24701 IX86_BUILTIN_EMMS,
24702 IX86_BUILTIN_LDMXCSR,
24703 IX86_BUILTIN_STMXCSR,
24704 IX86_BUILTIN_SFENCE,
24705
24706 /* 3DNow! Original */
24707 IX86_BUILTIN_FEMMS,
24708 IX86_BUILTIN_PAVGUSB,
24709 IX86_BUILTIN_PF2ID,
24710 IX86_BUILTIN_PFACC,
24711 IX86_BUILTIN_PFADD,
24712 IX86_BUILTIN_PFCMPEQ,
24713 IX86_BUILTIN_PFCMPGE,
24714 IX86_BUILTIN_PFCMPGT,
24715 IX86_BUILTIN_PFMAX,
24716 IX86_BUILTIN_PFMIN,
24717 IX86_BUILTIN_PFMUL,
24718 IX86_BUILTIN_PFRCP,
24719 IX86_BUILTIN_PFRCPIT1,
24720 IX86_BUILTIN_PFRCPIT2,
24721 IX86_BUILTIN_PFRSQIT1,
24722 IX86_BUILTIN_PFRSQRT,
24723 IX86_BUILTIN_PFSUB,
24724 IX86_BUILTIN_PFSUBR,
24725 IX86_BUILTIN_PI2FD,
24726 IX86_BUILTIN_PMULHRW,
24727
24728 /* 3DNow! Athlon Extensions */
24729 IX86_BUILTIN_PF2IW,
24730 IX86_BUILTIN_PFNACC,
24731 IX86_BUILTIN_PFPNACC,
24732 IX86_BUILTIN_PI2FW,
24733 IX86_BUILTIN_PSWAPDSI,
24734 IX86_BUILTIN_PSWAPDSF,
24735
24736 /* SSE2 */
24737 IX86_BUILTIN_ADDPD,
24738 IX86_BUILTIN_ADDSD,
24739 IX86_BUILTIN_DIVPD,
24740 IX86_BUILTIN_DIVSD,
24741 IX86_BUILTIN_MULPD,
24742 IX86_BUILTIN_MULSD,
24743 IX86_BUILTIN_SUBPD,
24744 IX86_BUILTIN_SUBSD,
24745
24746 IX86_BUILTIN_CMPEQPD,
24747 IX86_BUILTIN_CMPLTPD,
24748 IX86_BUILTIN_CMPLEPD,
24749 IX86_BUILTIN_CMPGTPD,
24750 IX86_BUILTIN_CMPGEPD,
24751 IX86_BUILTIN_CMPNEQPD,
24752 IX86_BUILTIN_CMPNLTPD,
24753 IX86_BUILTIN_CMPNLEPD,
24754 IX86_BUILTIN_CMPNGTPD,
24755 IX86_BUILTIN_CMPNGEPD,
24756 IX86_BUILTIN_CMPORDPD,
24757 IX86_BUILTIN_CMPUNORDPD,
24758 IX86_BUILTIN_CMPEQSD,
24759 IX86_BUILTIN_CMPLTSD,
24760 IX86_BUILTIN_CMPLESD,
24761 IX86_BUILTIN_CMPNEQSD,
24762 IX86_BUILTIN_CMPNLTSD,
24763 IX86_BUILTIN_CMPNLESD,
24764 IX86_BUILTIN_CMPORDSD,
24765 IX86_BUILTIN_CMPUNORDSD,
24766
24767 IX86_BUILTIN_COMIEQSD,
24768 IX86_BUILTIN_COMILTSD,
24769 IX86_BUILTIN_COMILESD,
24770 IX86_BUILTIN_COMIGTSD,
24771 IX86_BUILTIN_COMIGESD,
24772 IX86_BUILTIN_COMINEQSD,
24773 IX86_BUILTIN_UCOMIEQSD,
24774 IX86_BUILTIN_UCOMILTSD,
24775 IX86_BUILTIN_UCOMILESD,
24776 IX86_BUILTIN_UCOMIGTSD,
24777 IX86_BUILTIN_UCOMIGESD,
24778 IX86_BUILTIN_UCOMINEQSD,
24779
24780 IX86_BUILTIN_MAXPD,
24781 IX86_BUILTIN_MAXSD,
24782 IX86_BUILTIN_MINPD,
24783 IX86_BUILTIN_MINSD,
24784
24785 IX86_BUILTIN_ANDPD,
24786 IX86_BUILTIN_ANDNPD,
24787 IX86_BUILTIN_ORPD,
24788 IX86_BUILTIN_XORPD,
24789
24790 IX86_BUILTIN_SQRTPD,
24791 IX86_BUILTIN_SQRTSD,
24792
24793 IX86_BUILTIN_UNPCKHPD,
24794 IX86_BUILTIN_UNPCKLPD,
24795
24796 IX86_BUILTIN_SHUFPD,
24797
24798 IX86_BUILTIN_LOADUPD,
24799 IX86_BUILTIN_STOREUPD,
24800 IX86_BUILTIN_MOVSD,
24801
24802 IX86_BUILTIN_LOADHPD,
24803 IX86_BUILTIN_LOADLPD,
24804
24805 IX86_BUILTIN_CVTDQ2PD,
24806 IX86_BUILTIN_CVTDQ2PS,
24807
24808 IX86_BUILTIN_CVTPD2DQ,
24809 IX86_BUILTIN_CVTPD2PI,
24810 IX86_BUILTIN_CVTPD2PS,
24811 IX86_BUILTIN_CVTTPD2DQ,
24812 IX86_BUILTIN_CVTTPD2PI,
24813
24814 IX86_BUILTIN_CVTPI2PD,
24815 IX86_BUILTIN_CVTSI2SD,
24816 IX86_BUILTIN_CVTSI642SD,
24817
24818 IX86_BUILTIN_CVTSD2SI,
24819 IX86_BUILTIN_CVTSD2SI64,
24820 IX86_BUILTIN_CVTSD2SS,
24821 IX86_BUILTIN_CVTSS2SD,
24822 IX86_BUILTIN_CVTTSD2SI,
24823 IX86_BUILTIN_CVTTSD2SI64,
24824
24825 IX86_BUILTIN_CVTPS2DQ,
24826 IX86_BUILTIN_CVTPS2PD,
24827 IX86_BUILTIN_CVTTPS2DQ,
24828
24829 IX86_BUILTIN_MOVNTI,
24830 IX86_BUILTIN_MOVNTI64,
24831 IX86_BUILTIN_MOVNTPD,
24832 IX86_BUILTIN_MOVNTDQ,
24833
24834 IX86_BUILTIN_MOVQ128,
24835
24836 /* SSE2 MMX */
24837 IX86_BUILTIN_MASKMOVDQU,
24838 IX86_BUILTIN_MOVMSKPD,
24839 IX86_BUILTIN_PMOVMSKB128,
24840
24841 IX86_BUILTIN_PACKSSWB128,
24842 IX86_BUILTIN_PACKSSDW128,
24843 IX86_BUILTIN_PACKUSWB128,
24844
24845 IX86_BUILTIN_PADDB128,
24846 IX86_BUILTIN_PADDW128,
24847 IX86_BUILTIN_PADDD128,
24848 IX86_BUILTIN_PADDQ128,
24849 IX86_BUILTIN_PADDSB128,
24850 IX86_BUILTIN_PADDSW128,
24851 IX86_BUILTIN_PADDUSB128,
24852 IX86_BUILTIN_PADDUSW128,
24853 IX86_BUILTIN_PSUBB128,
24854 IX86_BUILTIN_PSUBW128,
24855 IX86_BUILTIN_PSUBD128,
24856 IX86_BUILTIN_PSUBQ128,
24857 IX86_BUILTIN_PSUBSB128,
24858 IX86_BUILTIN_PSUBSW128,
24859 IX86_BUILTIN_PSUBUSB128,
24860 IX86_BUILTIN_PSUBUSW128,
24861
24862 IX86_BUILTIN_PAND128,
24863 IX86_BUILTIN_PANDN128,
24864 IX86_BUILTIN_POR128,
24865 IX86_BUILTIN_PXOR128,
24866
24867 IX86_BUILTIN_PAVGB128,
24868 IX86_BUILTIN_PAVGW128,
24869
24870 IX86_BUILTIN_PCMPEQB128,
24871 IX86_BUILTIN_PCMPEQW128,
24872 IX86_BUILTIN_PCMPEQD128,
24873 IX86_BUILTIN_PCMPGTB128,
24874 IX86_BUILTIN_PCMPGTW128,
24875 IX86_BUILTIN_PCMPGTD128,
24876
24877 IX86_BUILTIN_PMADDWD128,
24878
24879 IX86_BUILTIN_PMAXSW128,
24880 IX86_BUILTIN_PMAXUB128,
24881 IX86_BUILTIN_PMINSW128,
24882 IX86_BUILTIN_PMINUB128,
24883
24884 IX86_BUILTIN_PMULUDQ,
24885 IX86_BUILTIN_PMULUDQ128,
24886 IX86_BUILTIN_PMULHUW128,
24887 IX86_BUILTIN_PMULHW128,
24888 IX86_BUILTIN_PMULLW128,
24889
24890 IX86_BUILTIN_PSADBW128,
24891 IX86_BUILTIN_PSHUFHW,
24892 IX86_BUILTIN_PSHUFLW,
24893 IX86_BUILTIN_PSHUFD,
24894
24895 IX86_BUILTIN_PSLLDQI128,
24896 IX86_BUILTIN_PSLLWI128,
24897 IX86_BUILTIN_PSLLDI128,
24898 IX86_BUILTIN_PSLLQI128,
24899 IX86_BUILTIN_PSRAWI128,
24900 IX86_BUILTIN_PSRADI128,
24901 IX86_BUILTIN_PSRLDQI128,
24902 IX86_BUILTIN_PSRLWI128,
24903 IX86_BUILTIN_PSRLDI128,
24904 IX86_BUILTIN_PSRLQI128,
24905
24906 IX86_BUILTIN_PSLLDQ128,
24907 IX86_BUILTIN_PSLLW128,
24908 IX86_BUILTIN_PSLLD128,
24909 IX86_BUILTIN_PSLLQ128,
24910 IX86_BUILTIN_PSRAW128,
24911 IX86_BUILTIN_PSRAD128,
24912 IX86_BUILTIN_PSRLW128,
24913 IX86_BUILTIN_PSRLD128,
24914 IX86_BUILTIN_PSRLQ128,
24915
24916 IX86_BUILTIN_PUNPCKHBW128,
24917 IX86_BUILTIN_PUNPCKHWD128,
24918 IX86_BUILTIN_PUNPCKHDQ128,
24919 IX86_BUILTIN_PUNPCKHQDQ128,
24920 IX86_BUILTIN_PUNPCKLBW128,
24921 IX86_BUILTIN_PUNPCKLWD128,
24922 IX86_BUILTIN_PUNPCKLDQ128,
24923 IX86_BUILTIN_PUNPCKLQDQ128,
24924
24925 IX86_BUILTIN_CLFLUSH,
24926 IX86_BUILTIN_MFENCE,
24927 IX86_BUILTIN_LFENCE,
24928 IX86_BUILTIN_PAUSE,
24929
24930 IX86_BUILTIN_BSRSI,
24931 IX86_BUILTIN_BSRDI,
24932 IX86_BUILTIN_RDPMC,
24933 IX86_BUILTIN_RDTSC,
24934 IX86_BUILTIN_RDTSCP,
24935 IX86_BUILTIN_ROLQI,
24936 IX86_BUILTIN_ROLHI,
24937 IX86_BUILTIN_RORQI,
24938 IX86_BUILTIN_RORHI,
24939
24940 /* SSE3. */
24941 IX86_BUILTIN_ADDSUBPS,
24942 IX86_BUILTIN_HADDPS,
24943 IX86_BUILTIN_HSUBPS,
24944 IX86_BUILTIN_MOVSHDUP,
24945 IX86_BUILTIN_MOVSLDUP,
24946 IX86_BUILTIN_ADDSUBPD,
24947 IX86_BUILTIN_HADDPD,
24948 IX86_BUILTIN_HSUBPD,
24949 IX86_BUILTIN_LDDQU,
24950
24951 IX86_BUILTIN_MONITOR,
24952 IX86_BUILTIN_MWAIT,
24953
24954 /* SSSE3. */
24955 IX86_BUILTIN_PHADDW,
24956 IX86_BUILTIN_PHADDD,
24957 IX86_BUILTIN_PHADDSW,
24958 IX86_BUILTIN_PHSUBW,
24959 IX86_BUILTIN_PHSUBD,
24960 IX86_BUILTIN_PHSUBSW,
24961 IX86_BUILTIN_PMADDUBSW,
24962 IX86_BUILTIN_PMULHRSW,
24963 IX86_BUILTIN_PSHUFB,
24964 IX86_BUILTIN_PSIGNB,
24965 IX86_BUILTIN_PSIGNW,
24966 IX86_BUILTIN_PSIGND,
24967 IX86_BUILTIN_PALIGNR,
24968 IX86_BUILTIN_PABSB,
24969 IX86_BUILTIN_PABSW,
24970 IX86_BUILTIN_PABSD,
24971
24972 IX86_BUILTIN_PHADDW128,
24973 IX86_BUILTIN_PHADDD128,
24974 IX86_BUILTIN_PHADDSW128,
24975 IX86_BUILTIN_PHSUBW128,
24976 IX86_BUILTIN_PHSUBD128,
24977 IX86_BUILTIN_PHSUBSW128,
24978 IX86_BUILTIN_PMADDUBSW128,
24979 IX86_BUILTIN_PMULHRSW128,
24980 IX86_BUILTIN_PSHUFB128,
24981 IX86_BUILTIN_PSIGNB128,
24982 IX86_BUILTIN_PSIGNW128,
24983 IX86_BUILTIN_PSIGND128,
24984 IX86_BUILTIN_PALIGNR128,
24985 IX86_BUILTIN_PABSB128,
24986 IX86_BUILTIN_PABSW128,
24987 IX86_BUILTIN_PABSD128,
24988
24989 /* AMDFAM10 - SSE4A New Instructions. */
24990 IX86_BUILTIN_MOVNTSD,
24991 IX86_BUILTIN_MOVNTSS,
24992 IX86_BUILTIN_EXTRQI,
24993 IX86_BUILTIN_EXTRQ,
24994 IX86_BUILTIN_INSERTQI,
24995 IX86_BUILTIN_INSERTQ,
24996
24997 /* SSE4.1. */
24998 IX86_BUILTIN_BLENDPD,
24999 IX86_BUILTIN_BLENDPS,
25000 IX86_BUILTIN_BLENDVPD,
25001 IX86_BUILTIN_BLENDVPS,
25002 IX86_BUILTIN_PBLENDVB128,
25003 IX86_BUILTIN_PBLENDW128,
25004
25005 IX86_BUILTIN_DPPD,
25006 IX86_BUILTIN_DPPS,
25007
25008 IX86_BUILTIN_INSERTPS128,
25009
25010 IX86_BUILTIN_MOVNTDQA,
25011 IX86_BUILTIN_MPSADBW128,
25012 IX86_BUILTIN_PACKUSDW128,
25013 IX86_BUILTIN_PCMPEQQ,
25014 IX86_BUILTIN_PHMINPOSUW128,
25015
25016 IX86_BUILTIN_PMAXSB128,
25017 IX86_BUILTIN_PMAXSD128,
25018 IX86_BUILTIN_PMAXUD128,
25019 IX86_BUILTIN_PMAXUW128,
25020
25021 IX86_BUILTIN_PMINSB128,
25022 IX86_BUILTIN_PMINSD128,
25023 IX86_BUILTIN_PMINUD128,
25024 IX86_BUILTIN_PMINUW128,
25025
25026 IX86_BUILTIN_PMOVSXBW128,
25027 IX86_BUILTIN_PMOVSXBD128,
25028 IX86_BUILTIN_PMOVSXBQ128,
25029 IX86_BUILTIN_PMOVSXWD128,
25030 IX86_BUILTIN_PMOVSXWQ128,
25031 IX86_BUILTIN_PMOVSXDQ128,
25032
25033 IX86_BUILTIN_PMOVZXBW128,
25034 IX86_BUILTIN_PMOVZXBD128,
25035 IX86_BUILTIN_PMOVZXBQ128,
25036 IX86_BUILTIN_PMOVZXWD128,
25037 IX86_BUILTIN_PMOVZXWQ128,
25038 IX86_BUILTIN_PMOVZXDQ128,
25039
25040 IX86_BUILTIN_PMULDQ128,
25041 IX86_BUILTIN_PMULLD128,
25042
25043 IX86_BUILTIN_ROUNDSD,
25044 IX86_BUILTIN_ROUNDSS,
25045
25046 IX86_BUILTIN_ROUNDPD,
25047 IX86_BUILTIN_ROUNDPS,
25048
25049 IX86_BUILTIN_FLOORPD,
25050 IX86_BUILTIN_CEILPD,
25051 IX86_BUILTIN_TRUNCPD,
25052 IX86_BUILTIN_RINTPD,
25053 IX86_BUILTIN_ROUNDPD_AZ,
25054
25055 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25056 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25057 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25058
25059 IX86_BUILTIN_FLOORPS,
25060 IX86_BUILTIN_CEILPS,
25061 IX86_BUILTIN_TRUNCPS,
25062 IX86_BUILTIN_RINTPS,
25063 IX86_BUILTIN_ROUNDPS_AZ,
25064
25065 IX86_BUILTIN_FLOORPS_SFIX,
25066 IX86_BUILTIN_CEILPS_SFIX,
25067 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25068
25069 IX86_BUILTIN_PTESTZ,
25070 IX86_BUILTIN_PTESTC,
25071 IX86_BUILTIN_PTESTNZC,
25072
25073 IX86_BUILTIN_VEC_INIT_V2SI,
25074 IX86_BUILTIN_VEC_INIT_V4HI,
25075 IX86_BUILTIN_VEC_INIT_V8QI,
25076 IX86_BUILTIN_VEC_EXT_V2DF,
25077 IX86_BUILTIN_VEC_EXT_V2DI,
25078 IX86_BUILTIN_VEC_EXT_V4SF,
25079 IX86_BUILTIN_VEC_EXT_V4SI,
25080 IX86_BUILTIN_VEC_EXT_V8HI,
25081 IX86_BUILTIN_VEC_EXT_V2SI,
25082 IX86_BUILTIN_VEC_EXT_V4HI,
25083 IX86_BUILTIN_VEC_EXT_V16QI,
25084 IX86_BUILTIN_VEC_SET_V2DI,
25085 IX86_BUILTIN_VEC_SET_V4SF,
25086 IX86_BUILTIN_VEC_SET_V4SI,
25087 IX86_BUILTIN_VEC_SET_V8HI,
25088 IX86_BUILTIN_VEC_SET_V4HI,
25089 IX86_BUILTIN_VEC_SET_V16QI,
25090
25091 IX86_BUILTIN_VEC_PACK_SFIX,
25092 IX86_BUILTIN_VEC_PACK_SFIX256,
25093
25094 /* SSE4.2. */
25095 IX86_BUILTIN_CRC32QI,
25096 IX86_BUILTIN_CRC32HI,
25097 IX86_BUILTIN_CRC32SI,
25098 IX86_BUILTIN_CRC32DI,
25099
25100 IX86_BUILTIN_PCMPESTRI128,
25101 IX86_BUILTIN_PCMPESTRM128,
25102 IX86_BUILTIN_PCMPESTRA128,
25103 IX86_BUILTIN_PCMPESTRC128,
25104 IX86_BUILTIN_PCMPESTRO128,
25105 IX86_BUILTIN_PCMPESTRS128,
25106 IX86_BUILTIN_PCMPESTRZ128,
25107 IX86_BUILTIN_PCMPISTRI128,
25108 IX86_BUILTIN_PCMPISTRM128,
25109 IX86_BUILTIN_PCMPISTRA128,
25110 IX86_BUILTIN_PCMPISTRC128,
25111 IX86_BUILTIN_PCMPISTRO128,
25112 IX86_BUILTIN_PCMPISTRS128,
25113 IX86_BUILTIN_PCMPISTRZ128,
25114
25115 IX86_BUILTIN_PCMPGTQ,
25116
25117 /* AES instructions */
25118 IX86_BUILTIN_AESENC128,
25119 IX86_BUILTIN_AESENCLAST128,
25120 IX86_BUILTIN_AESDEC128,
25121 IX86_BUILTIN_AESDECLAST128,
25122 IX86_BUILTIN_AESIMC128,
25123 IX86_BUILTIN_AESKEYGENASSIST128,
25124
25125 /* PCLMUL instruction */
25126 IX86_BUILTIN_PCLMULQDQ128,
25127
25128 /* AVX */
25129 IX86_BUILTIN_ADDPD256,
25130 IX86_BUILTIN_ADDPS256,
25131 IX86_BUILTIN_ADDSUBPD256,
25132 IX86_BUILTIN_ADDSUBPS256,
25133 IX86_BUILTIN_ANDPD256,
25134 IX86_BUILTIN_ANDPS256,
25135 IX86_BUILTIN_ANDNPD256,
25136 IX86_BUILTIN_ANDNPS256,
25137 IX86_BUILTIN_BLENDPD256,
25138 IX86_BUILTIN_BLENDPS256,
25139 IX86_BUILTIN_BLENDVPD256,
25140 IX86_BUILTIN_BLENDVPS256,
25141 IX86_BUILTIN_DIVPD256,
25142 IX86_BUILTIN_DIVPS256,
25143 IX86_BUILTIN_DPPS256,
25144 IX86_BUILTIN_HADDPD256,
25145 IX86_BUILTIN_HADDPS256,
25146 IX86_BUILTIN_HSUBPD256,
25147 IX86_BUILTIN_HSUBPS256,
25148 IX86_BUILTIN_MAXPD256,
25149 IX86_BUILTIN_MAXPS256,
25150 IX86_BUILTIN_MINPD256,
25151 IX86_BUILTIN_MINPS256,
25152 IX86_BUILTIN_MULPD256,
25153 IX86_BUILTIN_MULPS256,
25154 IX86_BUILTIN_ORPD256,
25155 IX86_BUILTIN_ORPS256,
25156 IX86_BUILTIN_SHUFPD256,
25157 IX86_BUILTIN_SHUFPS256,
25158 IX86_BUILTIN_SUBPD256,
25159 IX86_BUILTIN_SUBPS256,
25160 IX86_BUILTIN_XORPD256,
25161 IX86_BUILTIN_XORPS256,
25162 IX86_BUILTIN_CMPSD,
25163 IX86_BUILTIN_CMPSS,
25164 IX86_BUILTIN_CMPPD,
25165 IX86_BUILTIN_CMPPS,
25166 IX86_BUILTIN_CMPPD256,
25167 IX86_BUILTIN_CMPPS256,
25168 IX86_BUILTIN_CVTDQ2PD256,
25169 IX86_BUILTIN_CVTDQ2PS256,
25170 IX86_BUILTIN_CVTPD2PS256,
25171 IX86_BUILTIN_CVTPS2DQ256,
25172 IX86_BUILTIN_CVTPS2PD256,
25173 IX86_BUILTIN_CVTTPD2DQ256,
25174 IX86_BUILTIN_CVTPD2DQ256,
25175 IX86_BUILTIN_CVTTPS2DQ256,
25176 IX86_BUILTIN_EXTRACTF128PD256,
25177 IX86_BUILTIN_EXTRACTF128PS256,
25178 IX86_BUILTIN_EXTRACTF128SI256,
25179 IX86_BUILTIN_VZEROALL,
25180 IX86_BUILTIN_VZEROUPPER,
25181 IX86_BUILTIN_VPERMILVARPD,
25182 IX86_BUILTIN_VPERMILVARPS,
25183 IX86_BUILTIN_VPERMILVARPD256,
25184 IX86_BUILTIN_VPERMILVARPS256,
25185 IX86_BUILTIN_VPERMILPD,
25186 IX86_BUILTIN_VPERMILPS,
25187 IX86_BUILTIN_VPERMILPD256,
25188 IX86_BUILTIN_VPERMILPS256,
25189 IX86_BUILTIN_VPERMIL2PD,
25190 IX86_BUILTIN_VPERMIL2PS,
25191 IX86_BUILTIN_VPERMIL2PD256,
25192 IX86_BUILTIN_VPERMIL2PS256,
25193 IX86_BUILTIN_VPERM2F128PD256,
25194 IX86_BUILTIN_VPERM2F128PS256,
25195 IX86_BUILTIN_VPERM2F128SI256,
25196 IX86_BUILTIN_VBROADCASTSS,
25197 IX86_BUILTIN_VBROADCASTSD256,
25198 IX86_BUILTIN_VBROADCASTSS256,
25199 IX86_BUILTIN_VBROADCASTPD256,
25200 IX86_BUILTIN_VBROADCASTPS256,
25201 IX86_BUILTIN_VINSERTF128PD256,
25202 IX86_BUILTIN_VINSERTF128PS256,
25203 IX86_BUILTIN_VINSERTF128SI256,
25204 IX86_BUILTIN_LOADUPD256,
25205 IX86_BUILTIN_LOADUPS256,
25206 IX86_BUILTIN_STOREUPD256,
25207 IX86_BUILTIN_STOREUPS256,
25208 IX86_BUILTIN_LDDQU256,
25209 IX86_BUILTIN_MOVNTDQ256,
25210 IX86_BUILTIN_MOVNTPD256,
25211 IX86_BUILTIN_MOVNTPS256,
25212 IX86_BUILTIN_LOADDQU256,
25213 IX86_BUILTIN_STOREDQU256,
25214 IX86_BUILTIN_MASKLOADPD,
25215 IX86_BUILTIN_MASKLOADPS,
25216 IX86_BUILTIN_MASKSTOREPD,
25217 IX86_BUILTIN_MASKSTOREPS,
25218 IX86_BUILTIN_MASKLOADPD256,
25219 IX86_BUILTIN_MASKLOADPS256,
25220 IX86_BUILTIN_MASKSTOREPD256,
25221 IX86_BUILTIN_MASKSTOREPS256,
25222 IX86_BUILTIN_MOVSHDUP256,
25223 IX86_BUILTIN_MOVSLDUP256,
25224 IX86_BUILTIN_MOVDDUP256,
25225
25226 IX86_BUILTIN_SQRTPD256,
25227 IX86_BUILTIN_SQRTPS256,
25228 IX86_BUILTIN_SQRTPS_NR256,
25229 IX86_BUILTIN_RSQRTPS256,
25230 IX86_BUILTIN_RSQRTPS_NR256,
25231
25232 IX86_BUILTIN_RCPPS256,
25233
25234 IX86_BUILTIN_ROUNDPD256,
25235 IX86_BUILTIN_ROUNDPS256,
25236
25237 IX86_BUILTIN_FLOORPD256,
25238 IX86_BUILTIN_CEILPD256,
25239 IX86_BUILTIN_TRUNCPD256,
25240 IX86_BUILTIN_RINTPD256,
25241 IX86_BUILTIN_ROUNDPD_AZ256,
25242
25243 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25244 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25245 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25246
25247 IX86_BUILTIN_FLOORPS256,
25248 IX86_BUILTIN_CEILPS256,
25249 IX86_BUILTIN_TRUNCPS256,
25250 IX86_BUILTIN_RINTPS256,
25251 IX86_BUILTIN_ROUNDPS_AZ256,
25252
25253 IX86_BUILTIN_FLOORPS_SFIX256,
25254 IX86_BUILTIN_CEILPS_SFIX256,
25255 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25256
25257 IX86_BUILTIN_UNPCKHPD256,
25258 IX86_BUILTIN_UNPCKLPD256,
25259 IX86_BUILTIN_UNPCKHPS256,
25260 IX86_BUILTIN_UNPCKLPS256,
25261
25262 IX86_BUILTIN_SI256_SI,
25263 IX86_BUILTIN_PS256_PS,
25264 IX86_BUILTIN_PD256_PD,
25265 IX86_BUILTIN_SI_SI256,
25266 IX86_BUILTIN_PS_PS256,
25267 IX86_BUILTIN_PD_PD256,
25268
25269 IX86_BUILTIN_VTESTZPD,
25270 IX86_BUILTIN_VTESTCPD,
25271 IX86_BUILTIN_VTESTNZCPD,
25272 IX86_BUILTIN_VTESTZPS,
25273 IX86_BUILTIN_VTESTCPS,
25274 IX86_BUILTIN_VTESTNZCPS,
25275 IX86_BUILTIN_VTESTZPD256,
25276 IX86_BUILTIN_VTESTCPD256,
25277 IX86_BUILTIN_VTESTNZCPD256,
25278 IX86_BUILTIN_VTESTZPS256,
25279 IX86_BUILTIN_VTESTCPS256,
25280 IX86_BUILTIN_VTESTNZCPS256,
25281 IX86_BUILTIN_PTESTZ256,
25282 IX86_BUILTIN_PTESTC256,
25283 IX86_BUILTIN_PTESTNZC256,
25284
25285 IX86_BUILTIN_MOVMSKPD256,
25286 IX86_BUILTIN_MOVMSKPS256,
25287
25288 /* AVX2 */
25289 IX86_BUILTIN_MPSADBW256,
25290 IX86_BUILTIN_PABSB256,
25291 IX86_BUILTIN_PABSW256,
25292 IX86_BUILTIN_PABSD256,
25293 IX86_BUILTIN_PACKSSDW256,
25294 IX86_BUILTIN_PACKSSWB256,
25295 IX86_BUILTIN_PACKUSDW256,
25296 IX86_BUILTIN_PACKUSWB256,
25297 IX86_BUILTIN_PADDB256,
25298 IX86_BUILTIN_PADDW256,
25299 IX86_BUILTIN_PADDD256,
25300 IX86_BUILTIN_PADDQ256,
25301 IX86_BUILTIN_PADDSB256,
25302 IX86_BUILTIN_PADDSW256,
25303 IX86_BUILTIN_PADDUSB256,
25304 IX86_BUILTIN_PADDUSW256,
25305 IX86_BUILTIN_PALIGNR256,
25306 IX86_BUILTIN_AND256I,
25307 IX86_BUILTIN_ANDNOT256I,
25308 IX86_BUILTIN_PAVGB256,
25309 IX86_BUILTIN_PAVGW256,
25310 IX86_BUILTIN_PBLENDVB256,
25311 IX86_BUILTIN_PBLENDVW256,
25312 IX86_BUILTIN_PCMPEQB256,
25313 IX86_BUILTIN_PCMPEQW256,
25314 IX86_BUILTIN_PCMPEQD256,
25315 IX86_BUILTIN_PCMPEQQ256,
25316 IX86_BUILTIN_PCMPGTB256,
25317 IX86_BUILTIN_PCMPGTW256,
25318 IX86_BUILTIN_PCMPGTD256,
25319 IX86_BUILTIN_PCMPGTQ256,
25320 IX86_BUILTIN_PHADDW256,
25321 IX86_BUILTIN_PHADDD256,
25322 IX86_BUILTIN_PHADDSW256,
25323 IX86_BUILTIN_PHSUBW256,
25324 IX86_BUILTIN_PHSUBD256,
25325 IX86_BUILTIN_PHSUBSW256,
25326 IX86_BUILTIN_PMADDUBSW256,
25327 IX86_BUILTIN_PMADDWD256,
25328 IX86_BUILTIN_PMAXSB256,
25329 IX86_BUILTIN_PMAXSW256,
25330 IX86_BUILTIN_PMAXSD256,
25331 IX86_BUILTIN_PMAXUB256,
25332 IX86_BUILTIN_PMAXUW256,
25333 IX86_BUILTIN_PMAXUD256,
25334 IX86_BUILTIN_PMINSB256,
25335 IX86_BUILTIN_PMINSW256,
25336 IX86_BUILTIN_PMINSD256,
25337 IX86_BUILTIN_PMINUB256,
25338 IX86_BUILTIN_PMINUW256,
25339 IX86_BUILTIN_PMINUD256,
25340 IX86_BUILTIN_PMOVMSKB256,
25341 IX86_BUILTIN_PMOVSXBW256,
25342 IX86_BUILTIN_PMOVSXBD256,
25343 IX86_BUILTIN_PMOVSXBQ256,
25344 IX86_BUILTIN_PMOVSXWD256,
25345 IX86_BUILTIN_PMOVSXWQ256,
25346 IX86_BUILTIN_PMOVSXDQ256,
25347 IX86_BUILTIN_PMOVZXBW256,
25348 IX86_BUILTIN_PMOVZXBD256,
25349 IX86_BUILTIN_PMOVZXBQ256,
25350 IX86_BUILTIN_PMOVZXWD256,
25351 IX86_BUILTIN_PMOVZXWQ256,
25352 IX86_BUILTIN_PMOVZXDQ256,
25353 IX86_BUILTIN_PMULDQ256,
25354 IX86_BUILTIN_PMULHRSW256,
25355 IX86_BUILTIN_PMULHUW256,
25356 IX86_BUILTIN_PMULHW256,
25357 IX86_BUILTIN_PMULLW256,
25358 IX86_BUILTIN_PMULLD256,
25359 IX86_BUILTIN_PMULUDQ256,
25360 IX86_BUILTIN_POR256,
25361 IX86_BUILTIN_PSADBW256,
25362 IX86_BUILTIN_PSHUFB256,
25363 IX86_BUILTIN_PSHUFD256,
25364 IX86_BUILTIN_PSHUFHW256,
25365 IX86_BUILTIN_PSHUFLW256,
25366 IX86_BUILTIN_PSIGNB256,
25367 IX86_BUILTIN_PSIGNW256,
25368 IX86_BUILTIN_PSIGND256,
25369 IX86_BUILTIN_PSLLDQI256,
25370 IX86_BUILTIN_PSLLWI256,
25371 IX86_BUILTIN_PSLLW256,
25372 IX86_BUILTIN_PSLLDI256,
25373 IX86_BUILTIN_PSLLD256,
25374 IX86_BUILTIN_PSLLQI256,
25375 IX86_BUILTIN_PSLLQ256,
25376 IX86_BUILTIN_PSRAWI256,
25377 IX86_BUILTIN_PSRAW256,
25378 IX86_BUILTIN_PSRADI256,
25379 IX86_BUILTIN_PSRAD256,
25380 IX86_BUILTIN_PSRLDQI256,
25381 IX86_BUILTIN_PSRLWI256,
25382 IX86_BUILTIN_PSRLW256,
25383 IX86_BUILTIN_PSRLDI256,
25384 IX86_BUILTIN_PSRLD256,
25385 IX86_BUILTIN_PSRLQI256,
25386 IX86_BUILTIN_PSRLQ256,
25387 IX86_BUILTIN_PSUBB256,
25388 IX86_BUILTIN_PSUBW256,
25389 IX86_BUILTIN_PSUBD256,
25390 IX86_BUILTIN_PSUBQ256,
25391 IX86_BUILTIN_PSUBSB256,
25392 IX86_BUILTIN_PSUBSW256,
25393 IX86_BUILTIN_PSUBUSB256,
25394 IX86_BUILTIN_PSUBUSW256,
25395 IX86_BUILTIN_PUNPCKHBW256,
25396 IX86_BUILTIN_PUNPCKHWD256,
25397 IX86_BUILTIN_PUNPCKHDQ256,
25398 IX86_BUILTIN_PUNPCKHQDQ256,
25399 IX86_BUILTIN_PUNPCKLBW256,
25400 IX86_BUILTIN_PUNPCKLWD256,
25401 IX86_BUILTIN_PUNPCKLDQ256,
25402 IX86_BUILTIN_PUNPCKLQDQ256,
25403 IX86_BUILTIN_PXOR256,
25404 IX86_BUILTIN_MOVNTDQA256,
25405 IX86_BUILTIN_VBROADCASTSS_PS,
25406 IX86_BUILTIN_VBROADCASTSS_PS256,
25407 IX86_BUILTIN_VBROADCASTSD_PD256,
25408 IX86_BUILTIN_VBROADCASTSI256,
25409 IX86_BUILTIN_PBLENDD256,
25410 IX86_BUILTIN_PBLENDD128,
25411 IX86_BUILTIN_PBROADCASTB256,
25412 IX86_BUILTIN_PBROADCASTW256,
25413 IX86_BUILTIN_PBROADCASTD256,
25414 IX86_BUILTIN_PBROADCASTQ256,
25415 IX86_BUILTIN_PBROADCASTB128,
25416 IX86_BUILTIN_PBROADCASTW128,
25417 IX86_BUILTIN_PBROADCASTD128,
25418 IX86_BUILTIN_PBROADCASTQ128,
25419 IX86_BUILTIN_VPERMVARSI256,
25420 IX86_BUILTIN_VPERMDF256,
25421 IX86_BUILTIN_VPERMVARSF256,
25422 IX86_BUILTIN_VPERMDI256,
25423 IX86_BUILTIN_VPERMTI256,
25424 IX86_BUILTIN_VEXTRACT128I256,
25425 IX86_BUILTIN_VINSERT128I256,
25426 IX86_BUILTIN_MASKLOADD,
25427 IX86_BUILTIN_MASKLOADQ,
25428 IX86_BUILTIN_MASKLOADD256,
25429 IX86_BUILTIN_MASKLOADQ256,
25430 IX86_BUILTIN_MASKSTORED,
25431 IX86_BUILTIN_MASKSTOREQ,
25432 IX86_BUILTIN_MASKSTORED256,
25433 IX86_BUILTIN_MASKSTOREQ256,
25434 IX86_BUILTIN_PSLLVV4DI,
25435 IX86_BUILTIN_PSLLVV2DI,
25436 IX86_BUILTIN_PSLLVV8SI,
25437 IX86_BUILTIN_PSLLVV4SI,
25438 IX86_BUILTIN_PSRAVV8SI,
25439 IX86_BUILTIN_PSRAVV4SI,
25440 IX86_BUILTIN_PSRLVV4DI,
25441 IX86_BUILTIN_PSRLVV2DI,
25442 IX86_BUILTIN_PSRLVV8SI,
25443 IX86_BUILTIN_PSRLVV4SI,
25444
25445 IX86_BUILTIN_GATHERSIV2DF,
25446 IX86_BUILTIN_GATHERSIV4DF,
25447 IX86_BUILTIN_GATHERDIV2DF,
25448 IX86_BUILTIN_GATHERDIV4DF,
25449 IX86_BUILTIN_GATHERSIV4SF,
25450 IX86_BUILTIN_GATHERSIV8SF,
25451 IX86_BUILTIN_GATHERDIV4SF,
25452 IX86_BUILTIN_GATHERDIV8SF,
25453 IX86_BUILTIN_GATHERSIV2DI,
25454 IX86_BUILTIN_GATHERSIV4DI,
25455 IX86_BUILTIN_GATHERDIV2DI,
25456 IX86_BUILTIN_GATHERDIV4DI,
25457 IX86_BUILTIN_GATHERSIV4SI,
25458 IX86_BUILTIN_GATHERSIV8SI,
25459 IX86_BUILTIN_GATHERDIV4SI,
25460 IX86_BUILTIN_GATHERDIV8SI,
25461
25462 /* Alternate 4 element gather for the vectorizer where
25463 all operands are 32-byte wide. */
25464 IX86_BUILTIN_GATHERALTSIV4DF,
25465 IX86_BUILTIN_GATHERALTDIV8SF,
25466 IX86_BUILTIN_GATHERALTSIV4DI,
25467 IX86_BUILTIN_GATHERALTDIV8SI,
25468
25469 /* TFmode support builtins. */
25470 IX86_BUILTIN_INFQ,
25471 IX86_BUILTIN_HUGE_VALQ,
25472 IX86_BUILTIN_FABSQ,
25473 IX86_BUILTIN_COPYSIGNQ,
25474
25475 /* Vectorizer support builtins. */
25476 IX86_BUILTIN_CPYSGNPS,
25477 IX86_BUILTIN_CPYSGNPD,
25478 IX86_BUILTIN_CPYSGNPS256,
25479 IX86_BUILTIN_CPYSGNPD256,
25480
25481 /* FMA4 instructions. */
25482 IX86_BUILTIN_VFMADDSS,
25483 IX86_BUILTIN_VFMADDSD,
25484 IX86_BUILTIN_VFMADDPS,
25485 IX86_BUILTIN_VFMADDPD,
25486 IX86_BUILTIN_VFMADDPS256,
25487 IX86_BUILTIN_VFMADDPD256,
25488 IX86_BUILTIN_VFMADDSUBPS,
25489 IX86_BUILTIN_VFMADDSUBPD,
25490 IX86_BUILTIN_VFMADDSUBPS256,
25491 IX86_BUILTIN_VFMADDSUBPD256,
25492
25493 /* FMA3 instructions. */
25494 IX86_BUILTIN_VFMADDSS3,
25495 IX86_BUILTIN_VFMADDSD3,
25496
25497 /* XOP instructions. */
25498 IX86_BUILTIN_VPCMOV,
25499 IX86_BUILTIN_VPCMOV_V2DI,
25500 IX86_BUILTIN_VPCMOV_V4SI,
25501 IX86_BUILTIN_VPCMOV_V8HI,
25502 IX86_BUILTIN_VPCMOV_V16QI,
25503 IX86_BUILTIN_VPCMOV_V4SF,
25504 IX86_BUILTIN_VPCMOV_V2DF,
25505 IX86_BUILTIN_VPCMOV256,
25506 IX86_BUILTIN_VPCMOV_V4DI256,
25507 IX86_BUILTIN_VPCMOV_V8SI256,
25508 IX86_BUILTIN_VPCMOV_V16HI256,
25509 IX86_BUILTIN_VPCMOV_V32QI256,
25510 IX86_BUILTIN_VPCMOV_V8SF256,
25511 IX86_BUILTIN_VPCMOV_V4DF256,
25512
25513 IX86_BUILTIN_VPPERM,
25514
25515 IX86_BUILTIN_VPMACSSWW,
25516 IX86_BUILTIN_VPMACSWW,
25517 IX86_BUILTIN_VPMACSSWD,
25518 IX86_BUILTIN_VPMACSWD,
25519 IX86_BUILTIN_VPMACSSDD,
25520 IX86_BUILTIN_VPMACSDD,
25521 IX86_BUILTIN_VPMACSSDQL,
25522 IX86_BUILTIN_VPMACSSDQH,
25523 IX86_BUILTIN_VPMACSDQL,
25524 IX86_BUILTIN_VPMACSDQH,
25525 IX86_BUILTIN_VPMADCSSWD,
25526 IX86_BUILTIN_VPMADCSWD,
25527
25528 IX86_BUILTIN_VPHADDBW,
25529 IX86_BUILTIN_VPHADDBD,
25530 IX86_BUILTIN_VPHADDBQ,
25531 IX86_BUILTIN_VPHADDWD,
25532 IX86_BUILTIN_VPHADDWQ,
25533 IX86_BUILTIN_VPHADDDQ,
25534 IX86_BUILTIN_VPHADDUBW,
25535 IX86_BUILTIN_VPHADDUBD,
25536 IX86_BUILTIN_VPHADDUBQ,
25537 IX86_BUILTIN_VPHADDUWD,
25538 IX86_BUILTIN_VPHADDUWQ,
25539 IX86_BUILTIN_VPHADDUDQ,
25540 IX86_BUILTIN_VPHSUBBW,
25541 IX86_BUILTIN_VPHSUBWD,
25542 IX86_BUILTIN_VPHSUBDQ,
25543
25544 IX86_BUILTIN_VPROTB,
25545 IX86_BUILTIN_VPROTW,
25546 IX86_BUILTIN_VPROTD,
25547 IX86_BUILTIN_VPROTQ,
25548 IX86_BUILTIN_VPROTB_IMM,
25549 IX86_BUILTIN_VPROTW_IMM,
25550 IX86_BUILTIN_VPROTD_IMM,
25551 IX86_BUILTIN_VPROTQ_IMM,
25552
25553 IX86_BUILTIN_VPSHLB,
25554 IX86_BUILTIN_VPSHLW,
25555 IX86_BUILTIN_VPSHLD,
25556 IX86_BUILTIN_VPSHLQ,
25557 IX86_BUILTIN_VPSHAB,
25558 IX86_BUILTIN_VPSHAW,
25559 IX86_BUILTIN_VPSHAD,
25560 IX86_BUILTIN_VPSHAQ,
25561
25562 IX86_BUILTIN_VFRCZSS,
25563 IX86_BUILTIN_VFRCZSD,
25564 IX86_BUILTIN_VFRCZPS,
25565 IX86_BUILTIN_VFRCZPD,
25566 IX86_BUILTIN_VFRCZPS256,
25567 IX86_BUILTIN_VFRCZPD256,
25568
25569 IX86_BUILTIN_VPCOMEQUB,
25570 IX86_BUILTIN_VPCOMNEUB,
25571 IX86_BUILTIN_VPCOMLTUB,
25572 IX86_BUILTIN_VPCOMLEUB,
25573 IX86_BUILTIN_VPCOMGTUB,
25574 IX86_BUILTIN_VPCOMGEUB,
25575 IX86_BUILTIN_VPCOMFALSEUB,
25576 IX86_BUILTIN_VPCOMTRUEUB,
25577
25578 IX86_BUILTIN_VPCOMEQUW,
25579 IX86_BUILTIN_VPCOMNEUW,
25580 IX86_BUILTIN_VPCOMLTUW,
25581 IX86_BUILTIN_VPCOMLEUW,
25582 IX86_BUILTIN_VPCOMGTUW,
25583 IX86_BUILTIN_VPCOMGEUW,
25584 IX86_BUILTIN_VPCOMFALSEUW,
25585 IX86_BUILTIN_VPCOMTRUEUW,
25586
25587 IX86_BUILTIN_VPCOMEQUD,
25588 IX86_BUILTIN_VPCOMNEUD,
25589 IX86_BUILTIN_VPCOMLTUD,
25590 IX86_BUILTIN_VPCOMLEUD,
25591 IX86_BUILTIN_VPCOMGTUD,
25592 IX86_BUILTIN_VPCOMGEUD,
25593 IX86_BUILTIN_VPCOMFALSEUD,
25594 IX86_BUILTIN_VPCOMTRUEUD,
25595
25596 IX86_BUILTIN_VPCOMEQUQ,
25597 IX86_BUILTIN_VPCOMNEUQ,
25598 IX86_BUILTIN_VPCOMLTUQ,
25599 IX86_BUILTIN_VPCOMLEUQ,
25600 IX86_BUILTIN_VPCOMGTUQ,
25601 IX86_BUILTIN_VPCOMGEUQ,
25602 IX86_BUILTIN_VPCOMFALSEUQ,
25603 IX86_BUILTIN_VPCOMTRUEUQ,
25604
25605 IX86_BUILTIN_VPCOMEQB,
25606 IX86_BUILTIN_VPCOMNEB,
25607 IX86_BUILTIN_VPCOMLTB,
25608 IX86_BUILTIN_VPCOMLEB,
25609 IX86_BUILTIN_VPCOMGTB,
25610 IX86_BUILTIN_VPCOMGEB,
25611 IX86_BUILTIN_VPCOMFALSEB,
25612 IX86_BUILTIN_VPCOMTRUEB,
25613
25614 IX86_BUILTIN_VPCOMEQW,
25615 IX86_BUILTIN_VPCOMNEW,
25616 IX86_BUILTIN_VPCOMLTW,
25617 IX86_BUILTIN_VPCOMLEW,
25618 IX86_BUILTIN_VPCOMGTW,
25619 IX86_BUILTIN_VPCOMGEW,
25620 IX86_BUILTIN_VPCOMFALSEW,
25621 IX86_BUILTIN_VPCOMTRUEW,
25622
25623 IX86_BUILTIN_VPCOMEQD,
25624 IX86_BUILTIN_VPCOMNED,
25625 IX86_BUILTIN_VPCOMLTD,
25626 IX86_BUILTIN_VPCOMLED,
25627 IX86_BUILTIN_VPCOMGTD,
25628 IX86_BUILTIN_VPCOMGED,
25629 IX86_BUILTIN_VPCOMFALSED,
25630 IX86_BUILTIN_VPCOMTRUED,
25631
25632 IX86_BUILTIN_VPCOMEQQ,
25633 IX86_BUILTIN_VPCOMNEQ,
25634 IX86_BUILTIN_VPCOMLTQ,
25635 IX86_BUILTIN_VPCOMLEQ,
25636 IX86_BUILTIN_VPCOMGTQ,
25637 IX86_BUILTIN_VPCOMGEQ,
25638 IX86_BUILTIN_VPCOMFALSEQ,
25639 IX86_BUILTIN_VPCOMTRUEQ,
25640
25641 /* LWP instructions. */
25642 IX86_BUILTIN_LLWPCB,
25643 IX86_BUILTIN_SLWPCB,
25644 IX86_BUILTIN_LWPVAL32,
25645 IX86_BUILTIN_LWPVAL64,
25646 IX86_BUILTIN_LWPINS32,
25647 IX86_BUILTIN_LWPINS64,
25648
25649 IX86_BUILTIN_CLZS,
25650
25651 /* BMI instructions. */
25652 IX86_BUILTIN_BEXTR32,
25653 IX86_BUILTIN_BEXTR64,
25654 IX86_BUILTIN_CTZS,
25655
25656 /* TBM instructions. */
25657 IX86_BUILTIN_BEXTRI32,
25658 IX86_BUILTIN_BEXTRI64,
25659
25660 /* BMI2 instructions. */
25661 IX86_BUILTIN_BZHI32,
25662 IX86_BUILTIN_BZHI64,
25663 IX86_BUILTIN_PDEP32,
25664 IX86_BUILTIN_PDEP64,
25665 IX86_BUILTIN_PEXT32,
25666 IX86_BUILTIN_PEXT64,
25667
25668 /* FSGSBASE instructions. */
25669 IX86_BUILTIN_RDFSBASE32,
25670 IX86_BUILTIN_RDFSBASE64,
25671 IX86_BUILTIN_RDGSBASE32,
25672 IX86_BUILTIN_RDGSBASE64,
25673 IX86_BUILTIN_WRFSBASE32,
25674 IX86_BUILTIN_WRFSBASE64,
25675 IX86_BUILTIN_WRGSBASE32,
25676 IX86_BUILTIN_WRGSBASE64,
25677
25678 /* RDRND instructions. */
25679 IX86_BUILTIN_RDRAND16_STEP,
25680 IX86_BUILTIN_RDRAND32_STEP,
25681 IX86_BUILTIN_RDRAND64_STEP,
25682
25683 /* F16C instructions. */
25684 IX86_BUILTIN_CVTPH2PS,
25685 IX86_BUILTIN_CVTPH2PS256,
25686 IX86_BUILTIN_CVTPS2PH,
25687 IX86_BUILTIN_CVTPS2PH256,
25688
25689 /* CFString built-in for Darwin.  */
25690 IX86_BUILTIN_CFSTRING,
25691
25692 IX86_BUILTIN_MAX
25693 };
25694
25695 /* Table for the ix86 builtin decls. */
25696 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25697
25698 /* Table of all the builtin functions that are possible with different ISAs
25699 but are waiting to be built until a function is declared to use that
25700 ISA.  */
25701 struct builtin_isa {
25702 const char *name; /* function name */
25703 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25704 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25705 bool const_p; /* true if the declaration is constant */
25706 bool set_and_not_built_p; /* true if the builtin was deferred and its decl not yet built */
25707 };
25708
25709 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
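/* A minimal sketch, assuming an AVX2 builtin that gets deferred because AVX2
   is not in the active ISA: such an entry is parked in this table roughly as

     { "__builtin_ia32_movntdqa256", V4DI_FTYPE_PV4DI,
       OPTION_MASK_ISA_AVX2, false, true }

   i.e. name, tcode, isa, const_p, set_and_not_built_p.  def_builtin below
   fills the entry in; ix86_add_new_builtins later builds the real decl and
   clears set_and_not_built_p.  */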
25710
25711
25712 /* Add an ix86 target builtin function with CODE, NAME and TCODE.  Save the MASK
25713 of isa_flags the builtin requires in the ix86_builtins_isa array.  Stores the
25714 function decl in the ix86_builtins array.  Returns the function decl, or
25715 NULL_TREE if the builtin was not added.
25716
25717 If the front end has a special hook for builtin functions, delay adding
25718 builtin functions that aren't in the current ISA until the ISA is changed
25719 with function specific optimization.  Doing so can save about 300K for the
25720 default compiler.  When the builtin is expanded, check at that time whether
25721 it is valid.
25722
25723 If the front end doesn't have a special hook, record all builtins, even
25724 those whose instruction set isn't in the current ISA, in case the user uses
25725 function specific options for a different ISA; this way we don't get scope
25726 errors if a builtin is added in the middle of a function scope.  */
25727
25728 static inline tree
25729 def_builtin (HOST_WIDE_INT mask, const char *name,
25730 enum ix86_builtin_func_type tcode,
25731 enum ix86_builtins code)
25732 {
25733 tree decl = NULL_TREE;
25734
25735 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25736 {
25737 ix86_builtins_isa[(int) code].isa = mask;
25738
25739 mask &= ~OPTION_MASK_ISA_64BIT;
25740 if (mask == 0
25741 || (mask & ix86_isa_flags) != 0
25742 || (lang_hooks.builtin_function
25743 == lang_hooks.builtin_function_ext_scope))
25744
25745 {
25746 tree type = ix86_get_builtin_func_type (tcode);
25747 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25748 NULL, NULL_TREE);
25749 ix86_builtins[(int) code] = decl;
25750 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25751 }
25752 else
25753 {
25754 ix86_builtins[(int) code] = NULL_TREE;
25755 ix86_builtins_isa[(int) code].tcode = tcode;
25756 ix86_builtins_isa[(int) code].name = name;
25757 ix86_builtins_isa[(int) code].const_p = false;
25758 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25759 }
25760 }
25761
25762 return decl;
25763 }
25764
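/* A minimal usage sketch, assuming a direct call (registration in this file
   is normally driven from the bdesc_* tables below): the SSE store builtin
   from bdesc_special_args would be registered roughly as

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_storeups",
                  VOID_FTYPE_PFLOAT_V4SF, IX86_BUILTIN_STOREUPS);

   With -msse in effect this creates the decl immediately; otherwise the
   request is parked in ix86_builtins_isa until the ISA is enabled.  */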
25765 /* Like def_builtin, but also marks the function decl "const". */
25766
25767 static inline tree
25768 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25769 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25770 {
25771 tree decl = def_builtin (mask, name, tcode, code);
25772 if (decl)
25773 TREE_READONLY (decl) = 1;
25774 else
25775 ix86_builtins_isa[(int) code].const_p = true;
25776
25777 return decl;
25778 }
25779
25780 /* Add any new builtin functions for a given ISA that may not have been
25781 declared.  This saves a bit of space compared to adding all of the
25782 declarations to the tree up front, whether or not they are used.  */
25783
25784 static void
25785 ix86_add_new_builtins (HOST_WIDE_INT isa)
25786 {
25787 int i;
25788
25789 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25790 {
25791 if ((ix86_builtins_isa[i].isa & isa) != 0
25792 && ix86_builtins_isa[i].set_and_not_built_p)
25793 {
25794 tree decl, type;
25795
25796 /* Don't define the builtin again. */
25797 ix86_builtins_isa[i].set_and_not_built_p = false;
25798
25799 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25800 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25801 type, i, BUILT_IN_MD, NULL,
25802 NULL_TREE);
25803
25804 ix86_builtins[i] = decl;
25805 if (ix86_builtins_isa[i].const_p)
25806 TREE_READONLY (decl) = 1;
25807 }
25808 }
25809 }
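/* A minimal sketch of the effect, assuming the usual function-specific
   target path ends up calling ix86_add_new_builtins: code compiled without
   -mavx can still reach a deferred AVX builtin once the target attribute
   switches the ISA, e.g.

     __attribute__((target("avx")))
     void flush_upper_halves (void)
     {
       __builtin_ia32_vzeroupper ();
     }

   flush_upper_halves is a hypothetical example function, not part of GCC.  */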
25810
25811 /* Bits for builtin_description.flag. */
25812
25813 /* Set when we don't support the comparison natively, and should
25814 swap the comparison operands in order to support it.  */
25815 #define BUILTIN_DESC_SWAP_OPERANDS 1
25816
25817 struct builtin_description
25818 {
25819 const HOST_WIDE_INT mask;
25820 const enum insn_code icode;
25821 const char *const name;
25822 const enum ix86_builtins code;
25823 const enum rtx_code comparison;
25824 const int flag;
25825 };
25826
25827 static const struct builtin_description bdesc_comi[] =
25828 {
25829 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25830 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25831 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25832 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25833 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25834 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25835 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25836 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25837 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25838 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25839 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25840 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25841 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25843 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25844 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25845 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25847 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25848 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25849 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25851 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25852 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25853 };
25854
25855 static const struct builtin_description bdesc_pcmpestr[] =
25856 {
25857 /* SSE4.2 */
25858 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25859 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25860 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25861 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25862 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25863 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25864 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25865 };
25866
25867 static const struct builtin_description bdesc_pcmpistr[] =
25868 {
25869 /* SSE4.2 */
25870 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25871 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25872 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25873 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25874 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25875 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25876 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25877 };
25878
25879 /* Special builtins with variable number of arguments. */
25880 static const struct builtin_description bdesc_special_args[] =
25881 {
25882 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25883 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25884 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25885
25886 /* MMX */
25887 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25888
25889 /* 3DNow! */
25890 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25891
25892 /* SSE */
25893 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25896
25897 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25898 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25899 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25900 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25901
25902 /* SSE or 3DNow!A */
25903 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25904 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25905
25906 /* SSE2 */
25907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25914 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25917
25918 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25919 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25920
25921 /* SSE3 */
25922 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25923
25924 /* SSE4.1 */
25925 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25926
25927 /* SSE4A */
25928 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25929 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25930
25931 /* AVX */
25932 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25934
25935 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25936 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25937 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25940
25941 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25942 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25943 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25944 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25945 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25946 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25947 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25948
25949 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25950 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25951 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25952
25953 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25954 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25955 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25956 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25957 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25958 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25959 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25960 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25961
25962 /* AVX2 */
25963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25972
25973 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25974 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25975 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25976 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25977 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25978 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25979
25980 /* FSGSBASE */
25981 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25982 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25983 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25984 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25985 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25986 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25987 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25988 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25989 };
25990
25991 /* Builtins with variable number of arguments. */
25992 static const struct builtin_description bdesc_args[] =
25993 {
25994 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25995 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25996 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25997 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25998 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25999 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26000 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26001
26002 /* MMX */
26003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26009
26010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26018
26019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26021
26022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26023 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26026
26027 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26033
26034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26037 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26040
26041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26043 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26044
26045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26046
26047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26048 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26049 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26050 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26051 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26052 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26053
26054 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26055 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26056 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26057 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26058 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26059 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26060
26061 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26062 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26063 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26064 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26065
26066 /* 3DNow! */
26067 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26068 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26070 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26071
26072 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26073 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26074 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26075 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26076 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26077 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26078 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26079 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26080 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26081 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26082 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26083 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26084 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26085 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26086 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26087
26088 /* 3DNow!A */
26089 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26090 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26091 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26092 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26093 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26094 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26095
26096 /* SSE */
26097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26099 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26101 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26105 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26108 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26109
26110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26111
26112 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26113 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26114 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26120
26121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26131 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26132 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26136 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26137 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26138 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26139 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26140 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26141 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26142 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26143
26144 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26145 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26146 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26147 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26148
26149 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26150 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26151 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26152 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26153
26154 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26155
26156 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26157 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26158 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26159 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26160 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26161
26162 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26163 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26164 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26165
26166 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26167
26168 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26169 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26170 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26171
26172 /* SSE MMX or 3DNow!A */
26173 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26174 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26175 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26176
26177 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26178 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26179 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26180 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26181
26182 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26183 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26184
26185 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26186
26187 /* SSE2 */
26188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26189
26190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26194 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26195
26196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26201
26202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26203
26204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26206 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26207 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26208
26209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26211 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26212
26213 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26214 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26215 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26216 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26221
26222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26234 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26242
26243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26244 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26247
26248 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26250 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26251 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26252
26253 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26254
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26258
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26260
26261 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26262 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26264 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26265 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26267 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26269
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26278
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26280 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26281
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26285 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26286
26287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26289
26290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26291 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26296
26297 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26298 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26299 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26301
26302 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26304 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26305 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26307 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26308 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26309 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26310
26311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26314
26315 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26317
26318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26320
26321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26322
26323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26324 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26327
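/* Shift builtins: *_SI_COUNT types take the shift count as a scalar
   integer, *_V8HI_COUNT/*_V4SI_COUNT/*_V2DI_COUNT take it in an XMM
   register, and *_INT_CONVERT marks entries whose builtin operand type
   (V2DI here) differs from the insn pattern's mode (V1TI for the
   whole-register byte shifts) and must be converted.  */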
26328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26329 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26330 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26331 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26332 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26333 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26335
26336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26338 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26341 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26343
26344 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26346 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26347 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26348
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26352
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26354
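/* A null name means the builtin is expanded via this table but its
   user-visible name (e.g. __builtin_fabsq for the TFmode entries
   below) is registered elsewhere in this file.  */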
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26357
26358 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26359
26360 /* SSE2 MMX */
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26362 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26363
26364 /* SSE3 */
26365 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26366 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26367
26368 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26369 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26370 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26371 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26372 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26373 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26374
26375 /* SSSE3 */
26376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26377 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26379 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26382
26383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26391 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26394 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26395 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26396 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26397 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26398 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26399 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26400 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26401 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26402 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26403 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26404 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26405 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26406 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26407
26408 /* SSSE3 palignr */
26409 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26410 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26411
26412 /* SSE4.1 */
26413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26420 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26423
26424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26434 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26435 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26436 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26437
26438 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26439 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26440 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26441 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26442 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26443 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26444 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26445 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26446 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26447 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26448 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26449 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26450
26451 /* SSE4.1 round and ptest */
26452 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26453 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26454 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26455 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26456
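/* For the floor/ceil/trunc/rint entries the comparison field is reused
   to carry the rounding-mode immediate (ROUND_FLOOR, ROUND_CEIL,
   ROUND_TRUNC, ROUND_MXCSR), hence the casts to enum rtx_code; the
   *_ROUND function types flag this to the expander.  */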
26457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26458 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26459 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26460 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26461
26462 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26463 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26464
26465 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26466 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26467
26468 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26469 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26470 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26471 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26472
26473 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26474 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26475
26476 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26477 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26478
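/* The ptest entries use the comparison field to select which flag the
   builtin reads: EQ for ZF (ptestz), LTU for CF (ptestc), GTU for
   neither set (ptestnzc).  */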
26479 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26480 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26481 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26482
26483 /* SSE4.2 */
26484 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26485 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26486 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26487 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26488 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26489
26490 /* SSE4A */
26491 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26492 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26493 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26494 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26495
26496 /* AES */
26497 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26498 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26499
26500 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26501 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26502 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26503 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26504
26505 /* PCLMUL */
26506 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26507
26508 /* AVX */
26509 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26510 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26513 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26514 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26517 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26523 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26524 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26525 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26526 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26527 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26528 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26529 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26530 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26531 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26532 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26533 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26534 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26535
26536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26540
26541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26557 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26558 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26562 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26564 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26575
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26579
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26582 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26584 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26585
26586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26587
26588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26590
26591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26595
26596 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26597 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26598
26599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26601
26602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26606
26607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26609
26610 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26611 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26612
26613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26617
26618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26621 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26622 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26623 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26624
26625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26640
26641 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26642 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26643
26644 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26645 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26646
26647 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26648
26649 /* AVX2 */
26650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26651 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26652 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26653 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26658 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26659 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26660 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26661 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26667 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26689 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26690 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26691 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26692 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26693 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26694 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26695 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26696 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26697 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26698 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26699 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26700 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26701 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26731 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26732 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26733 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26734 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26735 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26736 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26737 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26738 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26739 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26740 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26742 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26743 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26744 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26745 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26746 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26747 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26748 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26749 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26750 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26751 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26764 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26796
26797 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26798
26799 /* BMI */
26800 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26801 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26802 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26803
26804 /* TBM */
26805 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26806 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26807
26808 /* F16C */
26809 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26810 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26811 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26812 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26813
26814 /* BMI2 */
26815 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26816 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26817 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26818 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26819 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26820 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26821 };
26822
26823 /* FMA4 and XOP. */
26824 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26825 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26826 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26827 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26828 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26829 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26830 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26831 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26832 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26833 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26834 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26835 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26836 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26837 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26838 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26839 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26840 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26841 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26842 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26843 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26844 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26845 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26846 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26847 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26848 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26849 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26850 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26851 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26852 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26853 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26854 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26855 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26856 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26857 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26858 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26859 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26860 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26861 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26862 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26863 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26864 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26865 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26866 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26867 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26868 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26869 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26870 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26871 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26872 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26873 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26874 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26875 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26876
26877 static const struct builtin_description bdesc_multi_arg[] =
26878 {
26879 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26880 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26881 UNKNOWN, (int)MULTI_ARG_3_SF },
26882 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26883 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26884 UNKNOWN, (int)MULTI_ARG_3_DF },
26885
26886 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26887 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26888 UNKNOWN, (int)MULTI_ARG_3_SF },
26889 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26890 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26891 UNKNOWN, (int)MULTI_ARG_3_DF },
26892
26893 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26894 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26895 UNKNOWN, (int)MULTI_ARG_3_SF },
26896 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26897 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26898 UNKNOWN, (int)MULTI_ARG_3_DF },
26899 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26900 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26901 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26902 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26903 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26904 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26905
26906 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26907 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26908 UNKNOWN, (int)MULTI_ARG_3_SF },
26909 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26910 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26911 UNKNOWN, (int)MULTI_ARG_3_DF },
26912 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26913 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26914 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26915 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26916 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26917 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26918
26919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
26924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26926
26927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26934
26935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26936
26937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26949
26950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26966
26967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26973
26974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26989
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26997
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27005
27006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27013
27014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27021
27022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27029
27030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27037
27038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27045
27046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27053
27054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27062
27063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27071
27072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27076
27077 };
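
/* A note on the table above: the fifth field is the rtx comparison code
   associated with each entry (EQ, NE, LT, ... for the vpcom* comparisons,
   and the PCOM_FALSE / PCOM_TRUE pseudo-codes for the always-false and
   always-true forms); UNKNOWN marks entries that carry no comparison.  */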
27078 \f
27079 /* TM vector builtins. */
27080
27081 /* Reuse the existing x86-specific `struct builtin_description' because
27082    we're lazy.  Add casts to make them fit.  */
27083 static const struct builtin_description bdesc_tm[] =
27084 {
27085 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27086 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27087 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27088 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27089 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27090 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27091 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27092
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27100
27101 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27102 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27103 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27104 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27105 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27106 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27107 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27108
27109 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27111 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27112 };
27113
27114 /* TM callbacks. */
27115
27116 /* Return the builtin decl needed to load a vector of TYPE. */
27117
27118 static tree
27119 ix86_builtin_tm_load (tree type)
27120 {
27121 if (TREE_CODE (type) == VECTOR_TYPE)
27122 {
27123 switch (tree_low_cst (TYPE_SIZE (type), 1))
27124 {
27125 case 64:
27126 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27127 case 128:
27128 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27129 case 256:
27130 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27131 }
27132 }
27133 return NULL_TREE;
27134 }
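
/* A note on the switch above: TYPE_SIZE is measured in bits, so 64-, 128-
   and 256-bit vector types map to the BUILT_IN_TM_LOAD_M64/M128/M256 decls
   registered via bdesc_tm; any other size, or a non-vector type, yields
   NULL_TREE.  */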
27135
27136 /* Return the builtin decl needed to store a vector of TYPE. */
27137
27138 static tree
27139 ix86_builtin_tm_store (tree type)
27140 {
27141 if (TREE_CODE (type) == VECTOR_TYPE)
27142 {
27143 switch (tree_low_cst (TYPE_SIZE (type), 1))
27144 {
27145 case 64:
27146 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27147 case 128:
27148 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27149 case 256:
27150 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27151 }
27152 }
27153 return NULL_TREE;
27154 }
27155 \f
27156 /* Initialize the transactional memory vector load/store builtins. */
27157
27158 static void
27159 ix86_init_tm_builtins (void)
27160 {
27161 enum ix86_builtin_func_type ftype;
27162 const struct builtin_description *d;
27163 size_t i;
27164 tree decl;
27165 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27166 tree attrs_log, attrs_type_log;
27167
27168 if (!flag_tm)
27169 return;
27170
27171 /* If there are no builtins defined, we must be compiling in a
27172 language without trans-mem support. */
27173 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27174 return;
27175
27176 /* Use whatever attributes a normal TM load has. */
27177 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27178 attrs_load = DECL_ATTRIBUTES (decl);
27179 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27180 /* Use whatever attributes a normal TM store has. */
27181 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27182 attrs_store = DECL_ATTRIBUTES (decl);
27183 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27184 /* Use whatever attributes a normal TM log has. */
27185 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27186 attrs_log = DECL_ATTRIBUTES (decl);
27187 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27188
27189 for (i = 0, d = bdesc_tm;
27190 i < ARRAY_SIZE (bdesc_tm);
27191 i++, d++)
27192 {
27193 if ((d->mask & ix86_isa_flags) != 0
27194 || (lang_hooks.builtin_function
27195 == lang_hooks.builtin_function_ext_scope))
27196 {
27197 tree type, attrs, attrs_type;
27198 enum built_in_function code = (enum built_in_function) d->code;
27199
27200 ftype = (enum ix86_builtin_func_type) d->flag;
27201 type = ix86_get_builtin_func_type (ftype);
27202
27203 if (BUILTIN_TM_LOAD_P (code))
27204 {
27205 attrs = attrs_load;
27206 attrs_type = attrs_type_load;
27207 }
27208 else if (BUILTIN_TM_STORE_P (code))
27209 {
27210 attrs = attrs_store;
27211 attrs_type = attrs_type_store;
27212 }
27213 else
27214 {
27215 attrs = attrs_log;
27216 attrs_type = attrs_type_log;
27217 }
27218 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27219 /* The builtin without the prefix for
27220 calling it directly. */
27221 d->name + strlen ("__builtin_"),
27222 attrs);
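/* For example, "__builtin__ITM_WM64" is thereby also registered under
   the direct name "_ITM_WM64".  */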
27223 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27224 set the TYPE_ATTRIBUTES. */
27225 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27226
27227 set_builtin_decl (code, decl, false);
27228 }
27229 }
27230 }
27231
27232 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27233    not in the current target ISA, so that the user can compile particular
27234    modules with target-specific options that differ from the command-line
27235    options.  */
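/* For example, this lets a translation unit compiled with only -msse2
   still contain a function marked __attribute__((target("avx2"))) that
   uses the AVX2 builtins registered below.  */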
27236 static void
27237 ix86_init_mmx_sse_builtins (void)
27238 {
27239 const struct builtin_description * d;
27240 enum ix86_builtin_func_type ftype;
27241 size_t i;
27242
27243 /* Add all special builtins with a variable number of operands.  */
27244 for (i = 0, d = bdesc_special_args;
27245 i < ARRAY_SIZE (bdesc_special_args);
27246 i++, d++)
27247 {
27248 if (d->name == 0)
27249 continue;
27250
27251 ftype = (enum ix86_builtin_func_type) d->flag;
27252 def_builtin (d->mask, d->name, ftype, d->code);
27253 }
27254
27255 /* Add all builtins with a variable number of operands.  */
27256 for (i = 0, d = bdesc_args;
27257 i < ARRAY_SIZE (bdesc_args);
27258 i++, d++)
27259 {
27260 if (d->name == 0)
27261 continue;
27262
27263 ftype = (enum ix86_builtin_func_type) d->flag;
27264 def_builtin_const (d->mask, d->name, ftype, d->code);
27265 }
27266
27267 /* pcmpestr[im] insns. */
27268 for (i = 0, d = bdesc_pcmpestr;
27269 i < ARRAY_SIZE (bdesc_pcmpestr);
27270 i++, d++)
27271 {
27272 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27273 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27274 else
27275 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27276 def_builtin_const (d->mask, d->name, ftype, d->code);
27277 }
27278
27279 /* pcmpistr[im] insns. */
27280 for (i = 0, d = bdesc_pcmpistr;
27281 i < ARRAY_SIZE (bdesc_pcmpistr);
27282 i++, d++)
27283 {
27284 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27285 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27286 else
27287 ftype = INT_FTYPE_V16QI_V16QI_INT;
27288 def_builtin_const (d->mask, d->name, ftype, d->code);
27289 }
27290
27291 /* comi/ucomi insns. */
27292 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27293 {
27294 if (d->mask == OPTION_MASK_ISA_SSE2)
27295 ftype = INT_FTYPE_V2DF_V2DF;
27296 else
27297 ftype = INT_FTYPE_V4SF_V4SF;
27298 def_builtin_const (d->mask, d->name, ftype, d->code);
27299 }
27300
27301 /* SSE */
27302 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27303 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27304 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27305 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27306
27307 /* SSE or 3DNow!A */
27308 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27309 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27310 IX86_BUILTIN_MASKMOVQ);
27311
27312 /* SSE2 */
27313 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27314 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27315
27316 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27317 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27318 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27319 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27320
27321 /* SSE3. */
27322 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27323 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27324 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27325 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27326
27327 /* AES */
27328 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27329 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27330 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27331 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27332 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27333 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27334 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27335 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27336 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27337 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27338 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27339 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27340
27341 /* PCLMUL */
27342 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27343 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27344
27345 /* RDRND */
27346 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27347 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27348 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27349 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27350 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27351 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27352 IX86_BUILTIN_RDRAND64_STEP);
27353
27354 /* AVX2 */
27355 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27356 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27357 IX86_BUILTIN_GATHERSIV2DF);
27358
27359 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27360 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27361 IX86_BUILTIN_GATHERSIV4DF);
27362
27363 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27364 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27365 IX86_BUILTIN_GATHERDIV2DF);
27366
27367 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27368 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27369 IX86_BUILTIN_GATHERDIV4DF);
27370
27371 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27372 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27373 IX86_BUILTIN_GATHERSIV4SF);
27374
27375 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27376 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27377 IX86_BUILTIN_GATHERSIV8SF);
27378
27379 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27380 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27381 IX86_BUILTIN_GATHERDIV4SF);
27382
27383 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27384 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27385 IX86_BUILTIN_GATHERDIV8SF);
27386
27387 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27388 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27389 IX86_BUILTIN_GATHERSIV2DI);
27390
27391 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27392 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27393 IX86_BUILTIN_GATHERSIV4DI);
27394
27395 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27396 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27397 IX86_BUILTIN_GATHERDIV2DI);
27398
27399 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27400 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27401 IX86_BUILTIN_GATHERDIV4DI);
27402
27403 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27404 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27405 IX86_BUILTIN_GATHERSIV4SI);
27406
27407 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27408 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27409 IX86_BUILTIN_GATHERSIV8SI);
27410
27411 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27412 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27413 IX86_BUILTIN_GATHERDIV4SI);
27414
27415 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27416 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27417 IX86_BUILTIN_GATHERDIV8SI);
27418
27419 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27420 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27421 IX86_BUILTIN_GATHERALTSIV4DF);
27422
27423 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27424 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27425 IX86_BUILTIN_GATHERALTDIV8SF);
27426
27427 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27428 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27429 IX86_BUILTIN_GATHERALTSIV4DI);
27430
27431 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27432 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27433 IX86_BUILTIN_GATHERALTDIV8SI);
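/* Hedged summary of the gather builtins above: the argument order mirrors
   the vgather patterns -- pass-through source vector, base pointer, index
   vector, mask vector and a scale immediate of 1, 2, 4 or 8.  Illustrative
   call (types shown loosely; the real callers in the AVX2 intrinsic header
   add the required vector casts):

     r = __builtin_ia32_gathersiv2df (src, base, idx, mask, 8);  */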
27434
27435 /* MMX access to the vec_init patterns. */
27436 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27437 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27438
27439 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27440 V4HI_FTYPE_HI_HI_HI_HI,
27441 IX86_BUILTIN_VEC_INIT_V4HI);
27442
27443 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27444 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27445 IX86_BUILTIN_VEC_INIT_V8QI);
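/* Hedged note: these are the builtins the _mm_set_* wrappers in mmintrin.h
   are expected to call, e.g. roughly
   "(__m64) __builtin_ia32_vec_init_v2si (i0, i1)", keeping MMX register use
   behind an explicit intrinsic (see the comment above
   ix86_expand_vec_init_builtin below).  */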
27446
27447 /* Access to the vec_extract patterns. */
27448 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27449 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27450 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27451 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27452 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27453 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27454 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27455 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27456 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27457 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27458
27459 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27460 "__builtin_ia32_vec_ext_v4hi",
27461 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27462
27463 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27464 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27465
27466 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27467 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27468
27469 /* Access to the vec_set patterns. */
27470 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27471 "__builtin_ia32_vec_set_v2di",
27472 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27473
27474 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27475 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27476
27477 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27478 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27479
27480 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27481 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27482
27483 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27484 "__builtin_ia32_vec_set_v4hi",
27485 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27486
27487 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27488 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27489
27490 /* Add FMA4 multi-arg instructions.  */
27491 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27492 {
27493 if (d->name == 0)
27494 continue;
27495
27496 ftype = (enum ix86_builtin_func_type) d->flag;
27497 def_builtin_const (d->mask, d->name, ftype, d->code);
27498 }
27499 }
27500
27501 /* Internal method for ix86_init_builtins. */
27502
27503 static void
27504 ix86_init_builtins_va_builtins_abi (void)
27505 {
27506 tree ms_va_ref, sysv_va_ref;
27507 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27508 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27509 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27510 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27511
27512 if (!TARGET_64BIT)
27513 return;
27514 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27515 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27516 ms_va_ref = build_reference_type (ms_va_list_type_node);
27517 sysv_va_ref =
27518 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27519
27520 fnvoid_va_end_ms =
27521 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27522 fnvoid_va_start_ms =
27523 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27524 fnvoid_va_end_sysv =
27525 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27526 fnvoid_va_start_sysv =
27527 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27528 NULL_TREE);
27529 fnvoid_va_copy_ms =
27530 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27531 NULL_TREE);
27532 fnvoid_va_copy_sysv =
27533 build_function_type_list (void_type_node, sysv_va_ref,
27534 sysv_va_ref, NULL_TREE);
27535
27536 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27537 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27538 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27539 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27540 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27541 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27542 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27543 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27544 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27545 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27546 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27547 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27548 }
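/* Hedged usage sketch for the builtins registered above: a 64-bit function
   can handle the other ABI's variable arguments with, roughly,

     void __attribute__ ((ms_abi)) f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ... consume arguments with va_arg ...
       __builtin_ms_va_end (ap);
     }  */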
27549
27550 static void
27551 ix86_init_builtin_types (void)
27552 {
27553 tree float128_type_node, float80_type_node;
27554
27555 /* The __float80 type. */
27556 float80_type_node = long_double_type_node;
27557 if (TYPE_MODE (float80_type_node) != XFmode)
27558 {
27559 /* Build a distinct 80-bit REAL_TYPE when long double is not XFmode. */
27560 float80_type_node = make_node (REAL_TYPE);
27561
27562 TYPE_PRECISION (float80_type_node) = 80;
27563 layout_type (float80_type_node);
27564 }
27565 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27566
27567 /* The __float128 type. */
27568 float128_type_node = make_node (REAL_TYPE);
27569 TYPE_PRECISION (float128_type_node) = 128;
27570 layout_type (float128_type_node);
27571 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27572
27573 /* This macro is built by i386-builtin-types.awk. */
27574 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27575 }
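/* Illustrative note: with the types registered above, user code can write
   e.g. "__float80 x = 1.0w;" (XFmode) and "__float128 y = 1.0q;" (TFmode);
   the 'w' and 'q' constant suffixes are the GCC extensions for these
   types.  */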
27576
27577 static void
27578 ix86_init_builtins (void)
27579 {
27580 tree t;
27581
27582 ix86_init_builtin_types ();
27583
27584 /* TFmode support builtins. */
27585 def_builtin_const (0, "__builtin_infq",
27586 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27587 def_builtin_const (0, "__builtin_huge_valq",
27588 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27589
27590 /* We will expand them to a normal call if SSE2 isn't available, since
27591 they are used by libgcc. */
27592 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27593 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27594 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27595 TREE_READONLY (t) = 1;
27596 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27597
27598 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27599 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27600 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27601 TREE_READONLY (t) = 1;
27602 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27603
27604 ix86_init_tm_builtins ();
27605 ix86_init_mmx_sse_builtins ();
27606
27607 if (TARGET_LP64)
27608 ix86_init_builtins_va_builtins_abi ();
27609
27610 #ifdef SUBTARGET_INIT_BUILTINS
27611 SUBTARGET_INIT_BUILTINS;
27612 #endif
27613 }
27614
27615 /* Return the ix86 builtin for CODE. */
27616
27617 static tree
27618 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27619 {
27620 if (code >= IX86_BUILTIN_MAX)
27621 return error_mark_node;
27622
27623 return ix86_builtins[code];
27624 }
27625
27626 /* Errors in the source file can cause expand_expr to return const0_rtx
27627 where we expect a vector. To avoid crashing, use one of the vector
27628 clear instructions. */
27629 static rtx
27630 safe_vector_operand (rtx x, enum machine_mode mode)
27631 {
27632 if (x == const0_rtx)
27633 x = CONST0_RTX (mode);
27634 return x;
27635 }
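/* For example, if error recovery left a V4SF argument as const0_rtx, this
   returns CONST0_RTX (V4SFmode) -- a zero vector of the expected mode -- so
   the expanders below can keep going.  */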
27636
27637 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27638
27639 static rtx
27640 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27641 {
27642 rtx pat;
27643 tree arg0 = CALL_EXPR_ARG (exp, 0);
27644 tree arg1 = CALL_EXPR_ARG (exp, 1);
27645 rtx op0 = expand_normal (arg0);
27646 rtx op1 = expand_normal (arg1);
27647 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27648 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27649 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27650
27651 if (VECTOR_MODE_P (mode0))
27652 op0 = safe_vector_operand (op0, mode0);
27653 if (VECTOR_MODE_P (mode1))
27654 op1 = safe_vector_operand (op1, mode1);
27655
27656 if (optimize || !target
27657 || GET_MODE (target) != tmode
27658 || !insn_data[icode].operand[0].predicate (target, tmode))
27659 target = gen_reg_rtx (tmode);
27660
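/* Descriptive note for the block below: some insn patterns expect a TImode
   operand where the builtin only supplies a 32-bit integer; load the SImode
   value into a vector register and use its TImode lowpart instead.  */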
27661 if (GET_MODE (op1) == SImode && mode1 == TImode)
27662 {
27663 rtx x = gen_reg_rtx (V4SImode);
27664 emit_insn (gen_sse2_loadd (x, op1));
27665 op1 = gen_lowpart (TImode, x);
27666 }
27667
27668 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27669 op0 = copy_to_mode_reg (mode0, op0);
27670 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27671 op1 = copy_to_mode_reg (mode1, op1);
27672
27673 pat = GEN_FCN (icode) (target, op0, op1);
27674 if (! pat)
27675 return 0;
27676
27677 emit_insn (pat);
27678
27679 return target;
27680 }
27681
27682 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27683
27684 static rtx
27685 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27686 enum ix86_builtin_func_type m_type,
27687 enum rtx_code sub_code)
27688 {
27689 rtx pat;
27690 int i;
27691 int nargs;
27692 bool comparison_p = false;
27693 bool tf_p = false;
27694 bool last_arg_constant = false;
27695 int num_memory = 0;
27696 struct {
27697 rtx op;
27698 enum machine_mode mode;
27699 } args[4];
27700
27701 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27702
27703 switch (m_type)
27704 {
27705 case MULTI_ARG_4_DF2_DI_I:
27706 case MULTI_ARG_4_DF2_DI_I1:
27707 case MULTI_ARG_4_SF2_SI_I:
27708 case MULTI_ARG_4_SF2_SI_I1:
27709 nargs = 4;
27710 last_arg_constant = true;
27711 break;
27712
27713 case MULTI_ARG_3_SF:
27714 case MULTI_ARG_3_DF:
27715 case MULTI_ARG_3_SF2:
27716 case MULTI_ARG_3_DF2:
27717 case MULTI_ARG_3_DI:
27718 case MULTI_ARG_3_SI:
27719 case MULTI_ARG_3_SI_DI:
27720 case MULTI_ARG_3_HI:
27721 case MULTI_ARG_3_HI_SI:
27722 case MULTI_ARG_3_QI:
27723 case MULTI_ARG_3_DI2:
27724 case MULTI_ARG_3_SI2:
27725 case MULTI_ARG_3_HI2:
27726 case MULTI_ARG_3_QI2:
27727 nargs = 3;
27728 break;
27729
27730 case MULTI_ARG_2_SF:
27731 case MULTI_ARG_2_DF:
27732 case MULTI_ARG_2_DI:
27733 case MULTI_ARG_2_SI:
27734 case MULTI_ARG_2_HI:
27735 case MULTI_ARG_2_QI:
27736 nargs = 2;
27737 break;
27738
27739 case MULTI_ARG_2_DI_IMM:
27740 case MULTI_ARG_2_SI_IMM:
27741 case MULTI_ARG_2_HI_IMM:
27742 case MULTI_ARG_2_QI_IMM:
27743 nargs = 2;
27744 last_arg_constant = true;
27745 break;
27746
27747 case MULTI_ARG_1_SF:
27748 case MULTI_ARG_1_DF:
27749 case MULTI_ARG_1_SF2:
27750 case MULTI_ARG_1_DF2:
27751 case MULTI_ARG_1_DI:
27752 case MULTI_ARG_1_SI:
27753 case MULTI_ARG_1_HI:
27754 case MULTI_ARG_1_QI:
27755 case MULTI_ARG_1_SI_DI:
27756 case MULTI_ARG_1_HI_DI:
27757 case MULTI_ARG_1_HI_SI:
27758 case MULTI_ARG_1_QI_DI:
27759 case MULTI_ARG_1_QI_SI:
27760 case MULTI_ARG_1_QI_HI:
27761 nargs = 1;
27762 break;
27763
27764 case MULTI_ARG_2_DI_CMP:
27765 case MULTI_ARG_2_SI_CMP:
27766 case MULTI_ARG_2_HI_CMP:
27767 case MULTI_ARG_2_QI_CMP:
27768 nargs = 2;
27769 comparison_p = true;
27770 break;
27771
27772 case MULTI_ARG_2_SF_TF:
27773 case MULTI_ARG_2_DF_TF:
27774 case MULTI_ARG_2_DI_TF:
27775 case MULTI_ARG_2_SI_TF:
27776 case MULTI_ARG_2_HI_TF:
27777 case MULTI_ARG_2_QI_TF:
27778 nargs = 2;
27779 tf_p = true;
27780 break;
27781
27782 default:
27783 gcc_unreachable ();
27784 }
27785
27786 if (optimize || !target
27787 || GET_MODE (target) != tmode
27788 || !insn_data[icode].operand[0].predicate (target, tmode))
27789 target = gen_reg_rtx (tmode);
27790
27791 gcc_assert (nargs <= 4);
27792
27793 for (i = 0; i < nargs; i++)
27794 {
27795 tree arg = CALL_EXPR_ARG (exp, i);
27796 rtx op = expand_normal (arg);
27797 int adjust = (comparison_p) ? 1 : 0;
27798 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27799
27800 if (last_arg_constant && i == nargs - 1)
27801 {
27802 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27803 {
27804 enum insn_code new_icode = icode;
27805 switch (icode)
27806 {
27807 case CODE_FOR_xop_vpermil2v2df3:
27808 case CODE_FOR_xop_vpermil2v4sf3:
27809 case CODE_FOR_xop_vpermil2v4df3:
27810 case CODE_FOR_xop_vpermil2v8sf3:
27811 error ("the last argument must be a 2-bit immediate");
27812 return gen_reg_rtx (tmode);
27813 case CODE_FOR_xop_rotlv2di3:
27814 new_icode = CODE_FOR_rotlv2di3;
27815 goto xop_rotl;
27816 case CODE_FOR_xop_rotlv4si3:
27817 new_icode = CODE_FOR_rotlv4si3;
27818 goto xop_rotl;
27819 case CODE_FOR_xop_rotlv8hi3:
27820 new_icode = CODE_FOR_rotlv8hi3;
27821 goto xop_rotl;
27822 case CODE_FOR_xop_rotlv16qi3:
27823 new_icode = CODE_FOR_rotlv16qi3;
27824 xop_rotl:
27825 if (CONST_INT_P (op))
27826 {
27827 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27828 op = GEN_INT (INTVAL (op) & mask);
27829 gcc_checking_assert
27830 (insn_data[icode].operand[i + 1].predicate (op, mode));
27831 }
27832 else
27833 {
27834 gcc_checking_assert
27835 (nargs == 2
27836 && insn_data[new_icode].operand[0].mode == tmode
27837 && insn_data[new_icode].operand[1].mode == tmode
27838 && insn_data[new_icode].operand[2].mode == mode
27839 && insn_data[new_icode].operand[0].predicate
27840 == insn_data[icode].operand[0].predicate
27841 && insn_data[new_icode].operand[1].predicate
27842 == insn_data[icode].operand[1].predicate);
27843 icode = new_icode;
27844 goto non_constant;
27845 }
27846 break;
27847 default:
27848 gcc_unreachable ();
27849 }
27850 }
27851 }
27852 else
27853 {
27854 non_constant:
27855 if (VECTOR_MODE_P (mode))
27856 op = safe_vector_operand (op, mode);
27857
27858 /* If we aren't optimizing, only allow one memory operand to be
27859 generated. */
27860 if (memory_operand (op, mode))
27861 num_memory++;
27862
27863 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27864
27865 if (optimize
27866 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27867 || num_memory > 1)
27868 op = force_reg (mode, op);
27869 }
27870
27871 args[i].op = op;
27872 args[i].mode = mode;
27873 }
27874
27875 switch (nargs)
27876 {
27877 case 1:
27878 pat = GEN_FCN (icode) (target, args[0].op);
27879 break;
27880
27881 case 2:
27882 if (tf_p)
27883 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27884 GEN_INT ((int)sub_code));
27885 else if (! comparison_p)
27886 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27887 else
27888 {
27889 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27890 args[0].op,
27891 args[1].op);
27892
27893 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27894 }
27895 break;
27896
27897 case 3:
27898 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27899 break;
27900
27901 case 4:
27902 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27903 break;
27904
27905 default:
27906 gcc_unreachable ();
27907 }
27908
27909 if (! pat)
27910 return 0;
27911
27912 emit_insn (pat);
27913 return target;
27914 }
27915
27916 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27917 insns with vec_merge. */
27918
27919 static rtx
27920 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27921 rtx target)
27922 {
27923 rtx pat;
27924 tree arg0 = CALL_EXPR_ARG (exp, 0);
27925 rtx op1, op0 = expand_normal (arg0);
27926 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27927 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27928
27929 if (optimize || !target
27930 || GET_MODE (target) != tmode
27931 || !insn_data[icode].operand[0].predicate (target, tmode))
27932 target = gen_reg_rtx (tmode);
27933
27934 if (VECTOR_MODE_P (mode0))
27935 op0 = safe_vector_operand (op0, mode0);
27936
27937 if ((optimize && !register_operand (op0, mode0))
27938 || !insn_data[icode].operand[1].predicate (op0, mode0))
27939 op0 = copy_to_mode_reg (mode0, op0);
27940
27941 op1 = op0;
27942 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27943 op1 = copy_to_mode_reg (mode0, op1);
27944
27945 pat = GEN_FCN (icode) (target, op0, op1);
27946 if (! pat)
27947 return 0;
27948 emit_insn (pat);
27949 return target;
27950 }
27951
27952 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27953
27954 static rtx
27955 ix86_expand_sse_compare (const struct builtin_description *d,
27956 tree exp, rtx target, bool swap)
27957 {
27958 rtx pat;
27959 tree arg0 = CALL_EXPR_ARG (exp, 0);
27960 tree arg1 = CALL_EXPR_ARG (exp, 1);
27961 rtx op0 = expand_normal (arg0);
27962 rtx op1 = expand_normal (arg1);
27963 rtx op2;
27964 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27965 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27966 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27967 enum rtx_code comparison = d->comparison;
27968
27969 if (VECTOR_MODE_P (mode0))
27970 op0 = safe_vector_operand (op0, mode0);
27971 if (VECTOR_MODE_P (mode1))
27972 op1 = safe_vector_operand (op1, mode1);
27973
27974 /* Swap operands if we have a comparison that isn't available in
27975 hardware. */
27976 if (swap)
27977 {
27978 rtx tmp = gen_reg_rtx (mode1);
27979 emit_move_insn (tmp, op1);
27980 op1 = op0;
27981 op0 = tmp;
27982 }
27983
27984 if (optimize || !target
27985 || GET_MODE (target) != tmode
27986 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27987 target = gen_reg_rtx (tmode);
27988
27989 if ((optimize && !register_operand (op0, mode0))
27990 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27991 op0 = copy_to_mode_reg (mode0, op0);
27992 if ((optimize && !register_operand (op1, mode1))
27993 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27994 op1 = copy_to_mode_reg (mode1, op1);
27995
27996 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27997 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27998 if (! pat)
27999 return 0;
28000 emit_insn (pat);
28001 return target;
28002 }
28003
28004 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28005
28006 static rtx
28007 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28008 rtx target)
28009 {
28010 rtx pat;
28011 tree arg0 = CALL_EXPR_ARG (exp, 0);
28012 tree arg1 = CALL_EXPR_ARG (exp, 1);
28013 rtx op0 = expand_normal (arg0);
28014 rtx op1 = expand_normal (arg1);
28015 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28016 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28017 enum rtx_code comparison = d->comparison;
28018
28019 if (VECTOR_MODE_P (mode0))
28020 op0 = safe_vector_operand (op0, mode0);
28021 if (VECTOR_MODE_P (mode1))
28022 op1 = safe_vector_operand (op1, mode1);
28023
28024 /* Swap operands if we have a comparison that isn't available in
28025 hardware. */
28026 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28027 {
28028 rtx tmp = op1;
28029 op1 = op0;
28030 op0 = tmp;
28031 }
28032
28033 target = gen_reg_rtx (SImode);
28034 emit_move_insn (target, const0_rtx);
28035 target = gen_rtx_SUBREG (QImode, target, 0);
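/* Descriptive note: TARGET is a zeroed SImode pseudo viewed through a QImode
   subreg; the STRICT_LOW_PART store emitted below fills only the low byte
   from the flags comparison, so SUBREG_REG (target) is already a
   zero-extended SImode result.  */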
28036
28037 if ((optimize && !register_operand (op0, mode0))
28038 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28039 op0 = copy_to_mode_reg (mode0, op0);
28040 if ((optimize && !register_operand (op1, mode1))
28041 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28042 op1 = copy_to_mode_reg (mode1, op1);
28043
28044 pat = GEN_FCN (d->icode) (op0, op1);
28045 if (! pat)
28046 return 0;
28047 emit_insn (pat);
28048 emit_insn (gen_rtx_SET (VOIDmode,
28049 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28050 gen_rtx_fmt_ee (comparison, QImode,
28051 SET_DEST (pat),
28052 const0_rtx)));
28053
28054 return SUBREG_REG (target);
28055 }
28056
28057 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28058
28059 static rtx
28060 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28061 rtx target)
28062 {
28063 rtx pat;
28064 tree arg0 = CALL_EXPR_ARG (exp, 0);
28065 rtx op1, op0 = expand_normal (arg0);
28066 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28067 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28068
28069 if (optimize || target == 0
28070 || GET_MODE (target) != tmode
28071 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28072 target = gen_reg_rtx (tmode);
28073
28074 if (VECTOR_MODE_P (mode0))
28075 op0 = safe_vector_operand (op0, mode0);
28076
28077 if ((optimize && !register_operand (op0, mode0))
28078 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28079 op0 = copy_to_mode_reg (mode0, op0);
28080
28081 op1 = GEN_INT (d->comparison);
28082
28083 pat = GEN_FCN (d->icode) (target, op0, op1);
28084 if (! pat)
28085 return 0;
28086 emit_insn (pat);
28087 return target;
28088 }
28089
28090 static rtx
28091 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28092 tree exp, rtx target)
28093 {
28094 rtx pat;
28095 tree arg0 = CALL_EXPR_ARG (exp, 0);
28096 tree arg1 = CALL_EXPR_ARG (exp, 1);
28097 rtx op0 = expand_normal (arg0);
28098 rtx op1 = expand_normal (arg1);
28099 rtx op2;
28100 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28101 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28102 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28103
28104 if (optimize || target == 0
28105 || GET_MODE (target) != tmode
28106 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28107 target = gen_reg_rtx (tmode);
28108
28109 op0 = safe_vector_operand (op0, mode0);
28110 op1 = safe_vector_operand (op1, mode1);
28111
28112 if ((optimize && !register_operand (op0, mode0))
28113 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28114 op0 = copy_to_mode_reg (mode0, op0);
28115 if ((optimize && !register_operand (op1, mode1))
28116 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28117 op1 = copy_to_mode_reg (mode1, op1);
28118
28119 op2 = GEN_INT (d->comparison);
28120
28121 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28122 if (! pat)
28123 return 0;
28124 emit_insn (pat);
28125 return target;
28126 }
28127
28128 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28129
28130 static rtx
28131 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28132 rtx target)
28133 {
28134 rtx pat;
28135 tree arg0 = CALL_EXPR_ARG (exp, 0);
28136 tree arg1 = CALL_EXPR_ARG (exp, 1);
28137 rtx op0 = expand_normal (arg0);
28138 rtx op1 = expand_normal (arg1);
28139 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28140 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28141 enum rtx_code comparison = d->comparison;
28142
28143 if (VECTOR_MODE_P (mode0))
28144 op0 = safe_vector_operand (op0, mode0);
28145 if (VECTOR_MODE_P (mode1))
28146 op1 = safe_vector_operand (op1, mode1);
28147
28148 target = gen_reg_rtx (SImode);
28149 emit_move_insn (target, const0_rtx);
28150 target = gen_rtx_SUBREG (QImode, target, 0);
28151
28152 if ((optimize && !register_operand (op0, mode0))
28153 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28154 op0 = copy_to_mode_reg (mode0, op0);
28155 if ((optimize && !register_operand (op1, mode1))
28156 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28157 op1 = copy_to_mode_reg (mode1, op1);
28158
28159 pat = GEN_FCN (d->icode) (op0, op1);
28160 if (! pat)
28161 return 0;
28162 emit_insn (pat);
28163 emit_insn (gen_rtx_SET (VOIDmode,
28164 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28165 gen_rtx_fmt_ee (comparison, QImode,
28166 SET_DEST (pat),
28167 const0_rtx)));
28168
28169 return SUBREG_REG (target);
28170 }
28171
28172 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28173
28174 static rtx
28175 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28176 tree exp, rtx target)
28177 {
28178 rtx pat;
28179 tree arg0 = CALL_EXPR_ARG (exp, 0);
28180 tree arg1 = CALL_EXPR_ARG (exp, 1);
28181 tree arg2 = CALL_EXPR_ARG (exp, 2);
28182 tree arg3 = CALL_EXPR_ARG (exp, 3);
28183 tree arg4 = CALL_EXPR_ARG (exp, 4);
28184 rtx scratch0, scratch1;
28185 rtx op0 = expand_normal (arg0);
28186 rtx op1 = expand_normal (arg1);
28187 rtx op2 = expand_normal (arg2);
28188 rtx op3 = expand_normal (arg3);
28189 rtx op4 = expand_normal (arg4);
28190 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28191
28192 tmode0 = insn_data[d->icode].operand[0].mode;
28193 tmode1 = insn_data[d->icode].operand[1].mode;
28194 modev2 = insn_data[d->icode].operand[2].mode;
28195 modei3 = insn_data[d->icode].operand[3].mode;
28196 modev4 = insn_data[d->icode].operand[4].mode;
28197 modei5 = insn_data[d->icode].operand[5].mode;
28198 modeimm = insn_data[d->icode].operand[6].mode;
28199
28200 if (VECTOR_MODE_P (modev2))
28201 op0 = safe_vector_operand (op0, modev2);
28202 if (VECTOR_MODE_P (modev4))
28203 op2 = safe_vector_operand (op2, modev4);
28204
28205 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28206 op0 = copy_to_mode_reg (modev2, op0);
28207 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28208 op1 = copy_to_mode_reg (modei3, op1);
28209 if ((optimize && !register_operand (op2, modev4))
28210 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28211 op2 = copy_to_mode_reg (modev4, op2);
28212 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28213 op3 = copy_to_mode_reg (modei5, op3);
28214
28215 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28216 {
28217 error ("the fifth argument must be an 8-bit immediate");
28218 return const0_rtx;
28219 }
28220
28221 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28222 {
28223 if (optimize || !target
28224 || GET_MODE (target) != tmode0
28225 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28226 target = gen_reg_rtx (tmode0);
28227
28228 scratch1 = gen_reg_rtx (tmode1);
28229
28230 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28231 }
28232 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28233 {
28234 if (optimize || !target
28235 || GET_MODE (target) != tmode1
28236 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28237 target = gen_reg_rtx (tmode1);
28238
28239 scratch0 = gen_reg_rtx (tmode0);
28240
28241 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28242 }
28243 else
28244 {
28245 gcc_assert (d->flag);
28246
28247 scratch0 = gen_reg_rtx (tmode0);
28248 scratch1 = gen_reg_rtx (tmode1);
28249
28250 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28251 }
28252
28253 if (! pat)
28254 return 0;
28255
28256 emit_insn (pat);
28257
28258 if (d->flag)
28259 {
28260 target = gen_reg_rtx (SImode);
28261 emit_move_insn (target, const0_rtx);
28262 target = gen_rtx_SUBREG (QImode, target, 0);
28263
28264 emit_insn
28265 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28266 gen_rtx_fmt_ee (EQ, QImode,
28267 gen_rtx_REG ((enum machine_mode) d->flag,
28268 FLAGS_REG),
28269 const0_rtx)));
28270 return SUBREG_REG (target);
28271 }
28272 else
28273 return target;
28274 }
28275
28276
28277 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28278
28279 static rtx
28280 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28281 tree exp, rtx target)
28282 {
28283 rtx pat;
28284 tree arg0 = CALL_EXPR_ARG (exp, 0);
28285 tree arg1 = CALL_EXPR_ARG (exp, 1);
28286 tree arg2 = CALL_EXPR_ARG (exp, 2);
28287 rtx scratch0, scratch1;
28288 rtx op0 = expand_normal (arg0);
28289 rtx op1 = expand_normal (arg1);
28290 rtx op2 = expand_normal (arg2);
28291 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28292
28293 tmode0 = insn_data[d->icode].operand[0].mode;
28294 tmode1 = insn_data[d->icode].operand[1].mode;
28295 modev2 = insn_data[d->icode].operand[2].mode;
28296 modev3 = insn_data[d->icode].operand[3].mode;
28297 modeimm = insn_data[d->icode].operand[4].mode;
28298
28299 if (VECTOR_MODE_P (modev2))
28300 op0 = safe_vector_operand (op0, modev2);
28301 if (VECTOR_MODE_P (modev3))
28302 op1 = safe_vector_operand (op1, modev3);
28303
28304 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28305 op0 = copy_to_mode_reg (modev2, op0);
28306 if ((optimize && !register_operand (op1, modev3))
28307 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28308 op1 = copy_to_mode_reg (modev3, op1);
28309
28310 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28311 {
28312 error ("the third argument must be an 8-bit immediate");
28313 return const0_rtx;
28314 }
28315
28316 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28317 {
28318 if (optimize || !target
28319 || GET_MODE (target) != tmode0
28320 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28321 target = gen_reg_rtx (tmode0);
28322
28323 scratch1 = gen_reg_rtx (tmode1);
28324
28325 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28326 }
28327 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28328 {
28329 if (optimize || !target
28330 || GET_MODE (target) != tmode1
28331 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28332 target = gen_reg_rtx (tmode1);
28333
28334 scratch0 = gen_reg_rtx (tmode0);
28335
28336 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28337 }
28338 else
28339 {
28340 gcc_assert (d->flag);
28341
28342 scratch0 = gen_reg_rtx (tmode0);
28343 scratch1 = gen_reg_rtx (tmode1);
28344
28345 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28346 }
28347
28348 if (! pat)
28349 return 0;
28350
28351 emit_insn (pat);
28352
28353 if (d->flag)
28354 {
28355 target = gen_reg_rtx (SImode);
28356 emit_move_insn (target, const0_rtx);
28357 target = gen_rtx_SUBREG (QImode, target, 0);
28358
28359 emit_insn
28360 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28361 gen_rtx_fmt_ee (EQ, QImode,
28362 gen_rtx_REG ((enum machine_mode) d->flag,
28363 FLAGS_REG),
28364 const0_rtx)));
28365 return SUBREG_REG (target);
28366 }
28367 else
28368 return target;
28369 }
28370
28371 /* Subroutine of ix86_expand_builtin to take care of insns with
28372 a variable number of operands. */
28373
28374 static rtx
28375 ix86_expand_args_builtin (const struct builtin_description *d,
28376 tree exp, rtx target)
28377 {
28378 rtx pat, real_target;
28379 unsigned int i, nargs;
28380 unsigned int nargs_constant = 0;
28381 int num_memory = 0;
28382 struct
28383 {
28384 rtx op;
28385 enum machine_mode mode;
28386 } args[4];
28387 bool last_arg_count = false;
28388 enum insn_code icode = d->icode;
28389 const struct insn_data_d *insn_p = &insn_data[icode];
28390 enum machine_mode tmode = insn_p->operand[0].mode;
28391 enum machine_mode rmode = VOIDmode;
28392 bool swap = false;
28393 enum rtx_code comparison = d->comparison;
28394
28395 switch ((enum ix86_builtin_func_type) d->flag)
28396 {
28397 case V2DF_FTYPE_V2DF_ROUND:
28398 case V4DF_FTYPE_V4DF_ROUND:
28399 case V4SF_FTYPE_V4SF_ROUND:
28400 case V8SF_FTYPE_V8SF_ROUND:
28401 case V4SI_FTYPE_V4SF_ROUND:
28402 case V8SI_FTYPE_V8SF_ROUND:
28403 return ix86_expand_sse_round (d, exp, target);
28404 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28405 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28406 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28407 case INT_FTYPE_V8SF_V8SF_PTEST:
28408 case INT_FTYPE_V4DI_V4DI_PTEST:
28409 case INT_FTYPE_V4DF_V4DF_PTEST:
28410 case INT_FTYPE_V4SF_V4SF_PTEST:
28411 case INT_FTYPE_V2DI_V2DI_PTEST:
28412 case INT_FTYPE_V2DF_V2DF_PTEST:
28413 return ix86_expand_sse_ptest (d, exp, target);
28414 case FLOAT128_FTYPE_FLOAT128:
28415 case FLOAT_FTYPE_FLOAT:
28416 case INT_FTYPE_INT:
28417 case UINT64_FTYPE_INT:
28418 case UINT16_FTYPE_UINT16:
28419 case INT64_FTYPE_INT64:
28420 case INT64_FTYPE_V4SF:
28421 case INT64_FTYPE_V2DF:
28422 case INT_FTYPE_V16QI:
28423 case INT_FTYPE_V8QI:
28424 case INT_FTYPE_V8SF:
28425 case INT_FTYPE_V4DF:
28426 case INT_FTYPE_V4SF:
28427 case INT_FTYPE_V2DF:
28428 case INT_FTYPE_V32QI:
28429 case V16QI_FTYPE_V16QI:
28430 case V8SI_FTYPE_V8SF:
28431 case V8SI_FTYPE_V4SI:
28432 case V8HI_FTYPE_V8HI:
28433 case V8HI_FTYPE_V16QI:
28434 case V8QI_FTYPE_V8QI:
28435 case V8SF_FTYPE_V8SF:
28436 case V8SF_FTYPE_V8SI:
28437 case V8SF_FTYPE_V4SF:
28438 case V8SF_FTYPE_V8HI:
28439 case V4SI_FTYPE_V4SI:
28440 case V4SI_FTYPE_V16QI:
28441 case V4SI_FTYPE_V4SF:
28442 case V4SI_FTYPE_V8SI:
28443 case V4SI_FTYPE_V8HI:
28444 case V4SI_FTYPE_V4DF:
28445 case V4SI_FTYPE_V2DF:
28446 case V4HI_FTYPE_V4HI:
28447 case V4DF_FTYPE_V4DF:
28448 case V4DF_FTYPE_V4SI:
28449 case V4DF_FTYPE_V4SF:
28450 case V4DF_FTYPE_V2DF:
28451 case V4SF_FTYPE_V4SF:
28452 case V4SF_FTYPE_V4SI:
28453 case V4SF_FTYPE_V8SF:
28454 case V4SF_FTYPE_V4DF:
28455 case V4SF_FTYPE_V8HI:
28456 case V4SF_FTYPE_V2DF:
28457 case V2DI_FTYPE_V2DI:
28458 case V2DI_FTYPE_V16QI:
28459 case V2DI_FTYPE_V8HI:
28460 case V2DI_FTYPE_V4SI:
28461 case V2DF_FTYPE_V2DF:
28462 case V2DF_FTYPE_V4SI:
28463 case V2DF_FTYPE_V4DF:
28464 case V2DF_FTYPE_V4SF:
28465 case V2DF_FTYPE_V2SI:
28466 case V2SI_FTYPE_V2SI:
28467 case V2SI_FTYPE_V4SF:
28468 case V2SI_FTYPE_V2SF:
28469 case V2SI_FTYPE_V2DF:
28470 case V2SF_FTYPE_V2SF:
28471 case V2SF_FTYPE_V2SI:
28472 case V32QI_FTYPE_V32QI:
28473 case V32QI_FTYPE_V16QI:
28474 case V16HI_FTYPE_V16HI:
28475 case V16HI_FTYPE_V8HI:
28476 case V8SI_FTYPE_V8SI:
28477 case V16HI_FTYPE_V16QI:
28478 case V8SI_FTYPE_V16QI:
28479 case V4DI_FTYPE_V16QI:
28480 case V8SI_FTYPE_V8HI:
28481 case V4DI_FTYPE_V8HI:
28482 case V4DI_FTYPE_V4SI:
28483 case V4DI_FTYPE_V2DI:
28484 nargs = 1;
28485 break;
28486 case V4SF_FTYPE_V4SF_VEC_MERGE:
28487 case V2DF_FTYPE_V2DF_VEC_MERGE:
28488 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28489 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28490 case V16QI_FTYPE_V16QI_V16QI:
28491 case V16QI_FTYPE_V8HI_V8HI:
28492 case V8QI_FTYPE_V8QI_V8QI:
28493 case V8QI_FTYPE_V4HI_V4HI:
28494 case V8HI_FTYPE_V8HI_V8HI:
28495 case V8HI_FTYPE_V16QI_V16QI:
28496 case V8HI_FTYPE_V4SI_V4SI:
28497 case V8SF_FTYPE_V8SF_V8SF:
28498 case V8SF_FTYPE_V8SF_V8SI:
28499 case V4SI_FTYPE_V4SI_V4SI:
28500 case V4SI_FTYPE_V8HI_V8HI:
28501 case V4SI_FTYPE_V4SF_V4SF:
28502 case V4SI_FTYPE_V2DF_V2DF:
28503 case V4HI_FTYPE_V4HI_V4HI:
28504 case V4HI_FTYPE_V8QI_V8QI:
28505 case V4HI_FTYPE_V2SI_V2SI:
28506 case V4DF_FTYPE_V4DF_V4DF:
28507 case V4DF_FTYPE_V4DF_V4DI:
28508 case V4SF_FTYPE_V4SF_V4SF:
28509 case V4SF_FTYPE_V4SF_V4SI:
28510 case V4SF_FTYPE_V4SF_V2SI:
28511 case V4SF_FTYPE_V4SF_V2DF:
28512 case V4SF_FTYPE_V4SF_DI:
28513 case V4SF_FTYPE_V4SF_SI:
28514 case V2DI_FTYPE_V2DI_V2DI:
28515 case V2DI_FTYPE_V16QI_V16QI:
28516 case V2DI_FTYPE_V4SI_V4SI:
28517 case V2DI_FTYPE_V2DI_V16QI:
28518 case V2DI_FTYPE_V2DF_V2DF:
28519 case V2SI_FTYPE_V2SI_V2SI:
28520 case V2SI_FTYPE_V4HI_V4HI:
28521 case V2SI_FTYPE_V2SF_V2SF:
28522 case V2DF_FTYPE_V2DF_V2DF:
28523 case V2DF_FTYPE_V2DF_V4SF:
28524 case V2DF_FTYPE_V2DF_V2DI:
28525 case V2DF_FTYPE_V2DF_DI:
28526 case V2DF_FTYPE_V2DF_SI:
28527 case V2SF_FTYPE_V2SF_V2SF:
28528 case V1DI_FTYPE_V1DI_V1DI:
28529 case V1DI_FTYPE_V8QI_V8QI:
28530 case V1DI_FTYPE_V2SI_V2SI:
28531 case V32QI_FTYPE_V16HI_V16HI:
28532 case V16HI_FTYPE_V8SI_V8SI:
28533 case V32QI_FTYPE_V32QI_V32QI:
28534 case V16HI_FTYPE_V32QI_V32QI:
28535 case V16HI_FTYPE_V16HI_V16HI:
28536 case V8SI_FTYPE_V4DF_V4DF:
28537 case V8SI_FTYPE_V8SI_V8SI:
28538 case V8SI_FTYPE_V16HI_V16HI:
28539 case V4DI_FTYPE_V4DI_V4DI:
28540 case V4DI_FTYPE_V8SI_V8SI:
28541 if (comparison == UNKNOWN)
28542 return ix86_expand_binop_builtin (icode, exp, target);
28543 nargs = 2;
28544 break;
28545 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28546 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28547 gcc_assert (comparison != UNKNOWN);
28548 nargs = 2;
28549 swap = true;
28550 break;
28551 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28552 case V16HI_FTYPE_V16HI_SI_COUNT:
28553 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28554 case V8SI_FTYPE_V8SI_SI_COUNT:
28555 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28556 case V4DI_FTYPE_V4DI_INT_COUNT:
28557 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28558 case V8HI_FTYPE_V8HI_SI_COUNT:
28559 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28560 case V4SI_FTYPE_V4SI_SI_COUNT:
28561 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28562 case V4HI_FTYPE_V4HI_SI_COUNT:
28563 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28564 case V2DI_FTYPE_V2DI_SI_COUNT:
28565 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28566 case V2SI_FTYPE_V2SI_SI_COUNT:
28567 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28568 case V1DI_FTYPE_V1DI_SI_COUNT:
28569 nargs = 2;
28570 last_arg_count = true;
28571 break;
28572 case UINT64_FTYPE_UINT64_UINT64:
28573 case UINT_FTYPE_UINT_UINT:
28574 case UINT_FTYPE_UINT_USHORT:
28575 case UINT_FTYPE_UINT_UCHAR:
28576 case UINT16_FTYPE_UINT16_INT:
28577 case UINT8_FTYPE_UINT8_INT:
28578 nargs = 2;
28579 break;
28580 case V2DI_FTYPE_V2DI_INT_CONVERT:
28581 nargs = 2;
28582 rmode = V1TImode;
28583 nargs_constant = 1;
28584 break;
28585 case V4DI_FTYPE_V4DI_INT_CONVERT:
28586 nargs = 2;
28587 rmode = V2TImode;
28588 nargs_constant = 1;
28589 break;
28590 case V8HI_FTYPE_V8HI_INT:
28591 case V8HI_FTYPE_V8SF_INT:
28592 case V8HI_FTYPE_V4SF_INT:
28593 case V8SF_FTYPE_V8SF_INT:
28594 case V4SI_FTYPE_V4SI_INT:
28595 case V4SI_FTYPE_V8SI_INT:
28596 case V4HI_FTYPE_V4HI_INT:
28597 case V4DF_FTYPE_V4DF_INT:
28598 case V4SF_FTYPE_V4SF_INT:
28599 case V4SF_FTYPE_V8SF_INT:
28600 case V2DI_FTYPE_V2DI_INT:
28601 case V2DF_FTYPE_V2DF_INT:
28602 case V2DF_FTYPE_V4DF_INT:
28603 case V16HI_FTYPE_V16HI_INT:
28604 case V8SI_FTYPE_V8SI_INT:
28605 case V4DI_FTYPE_V4DI_INT:
28606 case V2DI_FTYPE_V4DI_INT:
28607 nargs = 2;
28608 nargs_constant = 1;
28609 break;
28610 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28611 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28612 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28613 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28614 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28615 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28616 nargs = 3;
28617 break;
28618 case V32QI_FTYPE_V32QI_V32QI_INT:
28619 case V16HI_FTYPE_V16HI_V16HI_INT:
28620 case V16QI_FTYPE_V16QI_V16QI_INT:
28621 case V4DI_FTYPE_V4DI_V4DI_INT:
28622 case V8HI_FTYPE_V8HI_V8HI_INT:
28623 case V8SI_FTYPE_V8SI_V8SI_INT:
28624 case V8SI_FTYPE_V8SI_V4SI_INT:
28625 case V8SF_FTYPE_V8SF_V8SF_INT:
28626 case V8SF_FTYPE_V8SF_V4SF_INT:
28627 case V4SI_FTYPE_V4SI_V4SI_INT:
28628 case V4DF_FTYPE_V4DF_V4DF_INT:
28629 case V4DF_FTYPE_V4DF_V2DF_INT:
28630 case V4SF_FTYPE_V4SF_V4SF_INT:
28631 case V2DI_FTYPE_V2DI_V2DI_INT:
28632 case V4DI_FTYPE_V4DI_V2DI_INT:
28633 case V2DF_FTYPE_V2DF_V2DF_INT:
28634 nargs = 3;
28635 nargs_constant = 1;
28636 break;
28637 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28638 nargs = 3;
28639 rmode = V4DImode;
28640 nargs_constant = 1;
28641 break;
28642 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28643 nargs = 3;
28644 rmode = V2DImode;
28645 nargs_constant = 1;
28646 break;
28647 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28648 nargs = 3;
28649 rmode = DImode;
28650 nargs_constant = 1;
28651 break;
28652 case V2DI_FTYPE_V2DI_UINT_UINT:
28653 nargs = 3;
28654 nargs_constant = 2;
28655 break;
28656 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28657 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28658 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28659 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28660 nargs = 4;
28661 nargs_constant = 1;
28662 break;
28663 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28664 nargs = 4;
28665 nargs_constant = 2;
28666 break;
28667 default:
28668 gcc_unreachable ();
28669 }
28670
28671 gcc_assert (nargs <= ARRAY_SIZE (args));
28672
28673 if (comparison != UNKNOWN)
28674 {
28675 gcc_assert (nargs == 2);
28676 return ix86_expand_sse_compare (d, exp, target, swap);
28677 }
28678
28679 if (rmode == VOIDmode || rmode == tmode)
28680 {
28681 if (optimize
28682 || target == 0
28683 || GET_MODE (target) != tmode
28684 || !insn_p->operand[0].predicate (target, tmode))
28685 target = gen_reg_rtx (tmode);
28686 real_target = target;
28687 }
28688 else
28689 {
28690 target = gen_reg_rtx (rmode);
28691 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28692 }
28693
28694 for (i = 0; i < nargs; i++)
28695 {
28696 tree arg = CALL_EXPR_ARG (exp, i);
28697 rtx op = expand_normal (arg);
28698 enum machine_mode mode = insn_p->operand[i + 1].mode;
28699 bool match = insn_p->operand[i + 1].predicate (op, mode);
28700
28701 if (last_arg_count && (i + 1) == nargs)
28702 {
28703 /* SIMD shift insns take either an 8-bit immediate or a
28704 register as the count, but the builtin functions take an int.
28705 If the count doesn't match, we put it in a register. */
28706 if (!match)
28707 {
28708 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28709 if (!insn_p->operand[i + 1].predicate (op, mode))
28710 op = copy_to_reg (op);
28711 }
28712 }
28713 else if ((nargs - i) <= nargs_constant)
28714 {
28715 if (!match)
28716 switch (icode)
28717 {
28718 case CODE_FOR_avx2_inserti128:
28719 case CODE_FOR_avx2_extracti128:
28720 error ("the last argument must be a 1-bit immediate");
28721 return const0_rtx;
28722
28723 case CODE_FOR_sse4_1_roundsd:
28724 case CODE_FOR_sse4_1_roundss:
28725
28726 case CODE_FOR_sse4_1_roundpd:
28727 case CODE_FOR_sse4_1_roundps:
28728 case CODE_FOR_avx_roundpd256:
28729 case CODE_FOR_avx_roundps256:
28730
28731 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28732 case CODE_FOR_sse4_1_roundps_sfix:
28733 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28734 case CODE_FOR_avx_roundps_sfix256:
28735
28736 case CODE_FOR_sse4_1_blendps:
28737 case CODE_FOR_avx_blendpd256:
28738 case CODE_FOR_avx_vpermilv4df:
28739 error ("the last argument must be a 4-bit immediate");
28740 return const0_rtx;
28741
28742 case CODE_FOR_sse4_1_blendpd:
28743 case CODE_FOR_avx_vpermilv2df:
28744 case CODE_FOR_xop_vpermil2v2df3:
28745 case CODE_FOR_xop_vpermil2v4sf3:
28746 case CODE_FOR_xop_vpermil2v4df3:
28747 case CODE_FOR_xop_vpermil2v8sf3:
28748 error ("the last argument must be a 2-bit immediate");
28749 return const0_rtx;
28750
28751 case CODE_FOR_avx_vextractf128v4df:
28752 case CODE_FOR_avx_vextractf128v8sf:
28753 case CODE_FOR_avx_vextractf128v8si:
28754 case CODE_FOR_avx_vinsertf128v4df:
28755 case CODE_FOR_avx_vinsertf128v8sf:
28756 case CODE_FOR_avx_vinsertf128v8si:
28757 error ("the last argument must be a 1-bit immediate");
28758 return const0_rtx;
28759
28760 case CODE_FOR_avx_vmcmpv2df3:
28761 case CODE_FOR_avx_vmcmpv4sf3:
28762 case CODE_FOR_avx_cmpv2df3:
28763 case CODE_FOR_avx_cmpv4sf3:
28764 case CODE_FOR_avx_cmpv4df3:
28765 case CODE_FOR_avx_cmpv8sf3:
28766 error ("the last argument must be a 5-bit immediate");
28767 return const0_rtx;
28768
28769 default:
28770 switch (nargs_constant)
28771 {
28772 case 2:
28773 if ((nargs - i) == nargs_constant)
28774 {
28775 error ("the next to last argument must be an 8-bit immediate");
28776 break;
28777 }
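/* FALLTHRU */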
28778 case 1:
28779 error ("the last argument must be an 8-bit immediate");
28780 break;
28781 default:
28782 gcc_unreachable ();
28783 }
28784 return const0_rtx;
28785 }
28786 }
28787 else
28788 {
28789 if (VECTOR_MODE_P (mode))
28790 op = safe_vector_operand (op, mode);
28791
28792 /* If we aren't optimizing, only allow one memory operand to
28793 be generated. */
28794 if (memory_operand (op, mode))
28795 num_memory++;
28796
28797 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28798 {
28799 if (optimize || !match || num_memory > 1)
28800 op = copy_to_mode_reg (mode, op);
28801 }
28802 else
28803 {
28804 op = copy_to_reg (op);
28805 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28806 }
28807 }
28808
28809 args[i].op = op;
28810 args[i].mode = mode;
28811 }
28812
28813 switch (nargs)
28814 {
28815 case 1:
28816 pat = GEN_FCN (icode) (real_target, args[0].op);
28817 break;
28818 case 2:
28819 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28820 break;
28821 case 3:
28822 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28823 args[2].op);
28824 break;
28825 case 4:
28826 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28827 args[2].op, args[3].op);
28828 break;
28829 default:
28830 gcc_unreachable ();
28831 }
28832
28833 if (! pat)
28834 return 0;
28835
28836 emit_insn (pat);
28837 return target;
28838 }
28839
28840 /* Subroutine of ix86_expand_builtin to take care of special insns
28841 with a variable number of operands. */
28842
28843 static rtx
28844 ix86_expand_special_args_builtin (const struct builtin_description *d,
28845 tree exp, rtx target)
28846 {
28847 tree arg;
28848 rtx pat, op;
28849 unsigned int i, nargs, arg_adjust, memory;
28850 struct
28851 {
28852 rtx op;
28853 enum machine_mode mode;
28854 } args[3];
28855 enum insn_code icode = d->icode;
28856 bool last_arg_constant = false;
28857 const struct insn_data_d *insn_p = &insn_data[icode];
28858 enum machine_mode tmode = insn_p->operand[0].mode;
28859 enum { load, store } klass;
28860
28861 switch ((enum ix86_builtin_func_type) d->flag)
28862 {
28863 case VOID_FTYPE_VOID:
28864 if (icode == CODE_FOR_avx_vzeroupper)
28865 target = GEN_INT (vzeroupper_intrinsic);
28866 emit_insn (GEN_FCN (icode) (target));
28867 return 0;
28868 case VOID_FTYPE_UINT64:
28869 case VOID_FTYPE_UNSIGNED:
28870 nargs = 0;
28871 klass = store;
28872 memory = 0;
28873 break;
28874 case UINT64_FTYPE_VOID:
28875 case UNSIGNED_FTYPE_VOID:
28876 nargs = 0;
28877 klass = load;
28878 memory = 0;
28879 break;
28880 case UINT64_FTYPE_PUNSIGNED:
28881 case V2DI_FTYPE_PV2DI:
28882 case V4DI_FTYPE_PV4DI:
28883 case V32QI_FTYPE_PCCHAR:
28884 case V16QI_FTYPE_PCCHAR:
28885 case V8SF_FTYPE_PCV4SF:
28886 case V8SF_FTYPE_PCFLOAT:
28887 case V4SF_FTYPE_PCFLOAT:
28888 case V4DF_FTYPE_PCV2DF:
28889 case V4DF_FTYPE_PCDOUBLE:
28890 case V2DF_FTYPE_PCDOUBLE:
28891 case VOID_FTYPE_PVOID:
28892 nargs = 1;
28893 klass = load;
28894 memory = 0;
28895 break;
28896 case VOID_FTYPE_PV2SF_V4SF:
28897 case VOID_FTYPE_PV4DI_V4DI:
28898 case VOID_FTYPE_PV2DI_V2DI:
28899 case VOID_FTYPE_PCHAR_V32QI:
28900 case VOID_FTYPE_PCHAR_V16QI:
28901 case VOID_FTYPE_PFLOAT_V8SF:
28902 case VOID_FTYPE_PFLOAT_V4SF:
28903 case VOID_FTYPE_PDOUBLE_V4DF:
28904 case VOID_FTYPE_PDOUBLE_V2DF:
28905 case VOID_FTYPE_PLONGLONG_LONGLONG:
28906 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28907 case VOID_FTYPE_PINT_INT:
28908 nargs = 1;
28909 klass = store;
28910 /* Reserve memory operand for target. */
28911 memory = ARRAY_SIZE (args);
28912 break;
28913 case V4SF_FTYPE_V4SF_PCV2SF:
28914 case V2DF_FTYPE_V2DF_PCDOUBLE:
28915 nargs = 2;
28916 klass = load;
28917 memory = 1;
28918 break;
28919 case V8SF_FTYPE_PCV8SF_V8SI:
28920 case V4DF_FTYPE_PCV4DF_V4DI:
28921 case V4SF_FTYPE_PCV4SF_V4SI:
28922 case V2DF_FTYPE_PCV2DF_V2DI:
28923 case V8SI_FTYPE_PCV8SI_V8SI:
28924 case V4DI_FTYPE_PCV4DI_V4DI:
28925 case V4SI_FTYPE_PCV4SI_V4SI:
28926 case V2DI_FTYPE_PCV2DI_V2DI:
28927 nargs = 2;
28928 klass = load;
28929 memory = 0;
28930 break;
28931 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28932 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28933 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28934 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28935 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28936 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28937 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28938 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28939 nargs = 2;
28940 klass = store;
28941 /* Reserve memory operand for target. */
28942 memory = ARRAY_SIZE (args);
28943 break;
28944 case VOID_FTYPE_UINT_UINT_UINT:
28945 case VOID_FTYPE_UINT64_UINT_UINT:
28946 case UCHAR_FTYPE_UINT_UINT_UINT:
28947 case UCHAR_FTYPE_UINT64_UINT_UINT:
28948 nargs = 3;
28949 klass = load;
28950 memory = ARRAY_SIZE (args);
28951 last_arg_constant = true;
28952 break;
28953 default:
28954 gcc_unreachable ();
28955 }
28956
28957 gcc_assert (nargs <= ARRAY_SIZE (args));
28958
28959 if (klass == store)
28960 {
28961 arg = CALL_EXPR_ARG (exp, 0);
28962 op = expand_normal (arg);
28963 gcc_assert (target == 0);
28964 if (memory)
28965 {
28966 if (GET_MODE (op) != Pmode)
28967 op = convert_to_mode (Pmode, op, 1);
28968 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28969 }
28970 else
28971 target = force_reg (tmode, op);
28972 arg_adjust = 1;
28973 }
28974 else
28975 {
28976 arg_adjust = 0;
28977 if (optimize
28978 || target == 0
28979 || GET_MODE (target) != tmode
28980 || !insn_p->operand[0].predicate (target, tmode))
28981 target = gen_reg_rtx (tmode);
28982 }
28983
28984 for (i = 0; i < nargs; i++)
28985 {
28986 enum machine_mode mode = insn_p->operand[i + 1].mode;
28987 bool match;
28988
28989 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28990 op = expand_normal (arg);
28991 match = insn_p->operand[i + 1].predicate (op, mode);
28992
28993 if (last_arg_constant && (i + 1) == nargs)
28994 {
28995 if (!match)
28996 {
28997 if (icode == CODE_FOR_lwp_lwpvalsi3
28998 || icode == CODE_FOR_lwp_lwpinssi3
28999 || icode == CODE_FOR_lwp_lwpvaldi3
29000 || icode == CODE_FOR_lwp_lwpinsdi3)
29001 error ("the last argument must be a 32-bit immediate");
29002 else
29003 error ("the last argument must be an 8-bit immediate");
29004 return const0_rtx;
29005 }
29006 }
29007 else
29008 {
29009 if (i == memory)
29010 {
29011 /* This must be the memory operand. */
29012 if (GET_MODE (op) != Pmode)
29013 op = convert_to_mode (Pmode, op, 1);
29014 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29015 gcc_assert (GET_MODE (op) == mode
29016 || GET_MODE (op) == VOIDmode);
29017 }
29018 else
29019 {
29020 /* This must be a register. */
29021 if (VECTOR_MODE_P (mode))
29022 op = safe_vector_operand (op, mode);
29023
29024 gcc_assert (GET_MODE (op) == mode
29025 || GET_MODE (op) == VOIDmode);
29026 op = copy_to_mode_reg (mode, op);
29027 }
29028 }
29029
29030 args[i].op = op;
29031 args[i].mode = mode;
29032 }
29033
29034 switch (nargs)
29035 {
29036 case 0:
29037 pat = GEN_FCN (icode) (target);
29038 break;
29039 case 1:
29040 pat = GEN_FCN (icode) (target, args[0].op);
29041 break;
29042 case 2:
29043 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29044 break;
29045 case 3:
29046 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29047 break;
29048 default:
29049 gcc_unreachable ();
29050 }
29051
29052 if (! pat)
29053 return 0;
29054 emit_insn (pat);
29055 return klass == store ? 0 : target;
29056 }
29057
29058 /* Return the integer constant in ARG. Constrain it to be in the range
29059 of the subparts of VEC_TYPE; issue an error if not. */
29060
29061 static int
29062 get_element_number (tree vec_type, tree arg)
29063 {
29064 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29065
29066 if (!host_integerp (arg, 1)
29067 || (elt = tree_low_cst (arg, 1), elt > max))
29068 {
29069 error ("selector must be an integer constant in the range 0..%wi", max);
29070 return 0;
29071 }
29072
29073 return elt;
29074 }
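/* For example, with a V4SF vector type a selector of 5 is diagnosed here as
   "selector must be an integer constant in the range 0..3" and 0 is returned
   so expansion can continue (illustrative).  */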
29075
29076 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29077 ix86_expand_vector_init. We DO have language-level syntax for this, in
29078 the form of (type){ init-list }. Except that since we can't place emms
29079 instructions from inside the compiler, we can't allow the use of MMX
29080 registers unless the user explicitly asks for it. So we do *not* define
29081 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29082 we have builtins invoked by mmintrin.h that give us license to emit
29083 these sorts of instructions. */
29084
29085 static rtx
29086 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29087 {
29088 enum machine_mode tmode = TYPE_MODE (type);
29089 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29090 int i, n_elt = GET_MODE_NUNITS (tmode);
29091 rtvec v = rtvec_alloc (n_elt);
29092
29093 gcc_assert (VECTOR_MODE_P (tmode));
29094 gcc_assert (call_expr_nargs (exp) == n_elt);
29095
29096 for (i = 0; i < n_elt; ++i)
29097 {
29098 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29099 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29100 }
29101
29102 if (!target || !register_operand (target, tmode))
29103 target = gen_reg_rtx (tmode);
29104
29105 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29106 return target;
29107 }
29108
29109 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29110 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29111 had a language-level syntax for referencing vector elements. */
29112
29113 static rtx
29114 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29115 {
29116 enum machine_mode tmode, mode0;
29117 tree arg0, arg1;
29118 int elt;
29119 rtx op0;
29120
29121 arg0 = CALL_EXPR_ARG (exp, 0);
29122 arg1 = CALL_EXPR_ARG (exp, 1);
29123
29124 op0 = expand_normal (arg0);
29125 elt = get_element_number (TREE_TYPE (arg0), arg1);
29126
29127 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29128 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29129 gcc_assert (VECTOR_MODE_P (mode0));
29130
29131 op0 = force_reg (mode0, op0);
29132
29133 if (optimize || !target || !register_operand (target, tmode))
29134 target = gen_reg_rtx (tmode);
29135
29136 ix86_expand_vector_extract (true, target, op0, elt);
29137
29138 return target;
29139 }
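/* Illustrative call: __builtin_ia32_vec_ext_v4sf (v, 2) reaches this
   expander and becomes an ix86_expand_vector_extract of element 2; the
   selector must be a compile-time constant, as checked by
   get_element_number above.  */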
29140
29141 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29142 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29143 a language-level syntax for referencing vector elements. */
29144
29145 static rtx
29146 ix86_expand_vec_set_builtin (tree exp)
29147 {
29148 enum machine_mode tmode, mode1;
29149 tree arg0, arg1, arg2;
29150 int elt;
29151 rtx op0, op1, target;
29152
29153 arg0 = CALL_EXPR_ARG (exp, 0);
29154 arg1 = CALL_EXPR_ARG (exp, 1);
29155 arg2 = CALL_EXPR_ARG (exp, 2);
29156
29157 tmode = TYPE_MODE (TREE_TYPE (arg0));
29158 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29159 gcc_assert (VECTOR_MODE_P (tmode));
29160
29161 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29162 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29163 elt = get_element_number (TREE_TYPE (arg0), arg2);
29164
29165 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29166 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29167
29168 op0 = force_reg (tmode, op0);
29169 op1 = force_reg (mode1, op1);
29170
29171 /* OP0 is the source of these builtin functions and shouldn't be
29172 modified. Create a copy, use it and return it as target. */
29173 target = gen_reg_rtx (tmode);
29174 emit_move_insn (target, op0);
29175 ix86_expand_vector_set (true, target, op1, elt);
29176
29177 return target;
29178 }
29179
29180 /* Expand an expression EXP that calls a built-in function,
29181 with result going to TARGET if that's convenient
29182 (and in mode MODE if that's convenient).
29183 SUBTARGET may be used as the target for computing one of EXP's operands.
29184 IGNORE is nonzero if the value is to be ignored. */
29185
29186 static rtx
29187 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29188 enum machine_mode mode ATTRIBUTE_UNUSED,
29189 int ignore ATTRIBUTE_UNUSED)
29190 {
29191 const struct builtin_description *d;
29192 size_t i;
29193 enum insn_code icode;
29194 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29195 tree arg0, arg1, arg2, arg3, arg4;
29196 rtx op0, op1, op2, op3, op4, pat;
29197 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29198 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29199
29200 /* Determine whether the builtin function is available under the current ISA.
29201 Originally the builtin was not created if it wasn't applicable to the
29202 current ISA based on the command line switches. With function specific
29203 options, we need to check in the context of the function making the call
29204 whether it is supported. */
29205 if (ix86_builtins_isa[fcode].isa
29206 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29207 {
29208 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29209 NULL, (enum fpmath_unit) 0, false);
29210
29211 if (!opts)
29212 error ("%qE needs unknown isa option", fndecl);
29213 else
29214 {
29215 gcc_assert (opts != NULL);
29216 error ("%qE needs isa option %s", fndecl, opts);
29217 free (opts);
29218 }
29219 return const0_rtx;
29220 }
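/* For instance, with function-specific options a function compiled with
   __attribute__ ((target ("sse2"))) that calls an AVX builtin is rejected
   here at expansion time; the option text in the message comes from
   ix86_target_string (hedged example).  */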
29221
29222 switch (fcode)
29223 {
29224 case IX86_BUILTIN_MASKMOVQ:
29225 case IX86_BUILTIN_MASKMOVDQU:
29226 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29227 ? CODE_FOR_mmx_maskmovq
29228 : CODE_FOR_sse2_maskmovdqu);
29229 /* Note the arg order is different from the operand order. */
29230 arg1 = CALL_EXPR_ARG (exp, 0);
29231 arg2 = CALL_EXPR_ARG (exp, 1);
29232 arg0 = CALL_EXPR_ARG (exp, 2);
29233 op0 = expand_normal (arg0);
29234 op1 = expand_normal (arg1);
29235 op2 = expand_normal (arg2);
29236 mode0 = insn_data[icode].operand[0].mode;
29237 mode1 = insn_data[icode].operand[1].mode;
29238 mode2 = insn_data[icode].operand[2].mode;
29239
29240 if (GET_MODE (op0) != Pmode)
29241 op0 = convert_to_mode (Pmode, op0, 1);
29242 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29243
29244 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29245 op0 = copy_to_mode_reg (mode0, op0);
29246 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29247 op1 = copy_to_mode_reg (mode1, op1);
29248 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29249 op2 = copy_to_mode_reg (mode2, op2);
29250 pat = GEN_FCN (icode) (op0, op1, op2);
29251 if (! pat)
29252 return 0;
29253 emit_insn (pat);
29254 return 0;
29255
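/* ldmxcsr and stmxcsr only take a memory operand, so go through a
 scratch stack slot. */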
29256 case IX86_BUILTIN_LDMXCSR:
29257 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29258 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29259 emit_move_insn (target, op0);
29260 emit_insn (gen_sse_ldmxcsr (target));
29261 return 0;
29262
29263 case IX86_BUILTIN_STMXCSR:
29264 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29265 emit_insn (gen_sse_stmxcsr (target));
29266 return copy_to_mode_reg (SImode, target);
29267
29268 case IX86_BUILTIN_CLFLUSH:
29269 arg0 = CALL_EXPR_ARG (exp, 0);
29270 op0 = expand_normal (arg0);
29271 icode = CODE_FOR_sse2_clflush;
29272 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29273 {
29274 if (GET_MODE (op0) != Pmode)
29275 op0 = convert_to_mode (Pmode, op0, 1);
29276 op0 = force_reg (Pmode, op0);
29277 }
29278
29279 emit_insn (gen_sse2_clflush (op0));
29280 return 0;
29281
29282 case IX86_BUILTIN_MONITOR:
29283 arg0 = CALL_EXPR_ARG (exp, 0);
29284 arg1 = CALL_EXPR_ARG (exp, 1);
29285 arg2 = CALL_EXPR_ARG (exp, 2);
29286 op0 = expand_normal (arg0);
29287 op1 = expand_normal (arg1);
29288 op2 = expand_normal (arg2);
29289 if (!REG_P (op0))
29290 {
29291 if (GET_MODE (op0) != Pmode)
29292 op0 = convert_to_mode (Pmode, op0, 1);
29293 op0 = force_reg (Pmode, op0);
29294 }
29295 if (!REG_P (op1))
29296 op1 = copy_to_mode_reg (SImode, op1);
29297 if (!REG_P (op2))
29298 op2 = copy_to_mode_reg (SImode, op2);
29299 emit_insn (ix86_gen_monitor (op0, op1, op2));
29300 return 0;
29301
29302 case IX86_BUILTIN_MWAIT:
29303 arg0 = CALL_EXPR_ARG (exp, 0);
29304 arg1 = CALL_EXPR_ARG (exp, 1);
29305 op0 = expand_normal (arg0);
29306 op1 = expand_normal (arg1);
29307 if (!REG_P (op0))
29308 op0 = copy_to_mode_reg (SImode, op0);
29309 if (!REG_P (op1))
29310 op1 = copy_to_mode_reg (SImode, op1);
29311 emit_insn (gen_sse3_mwait (op0, op1));
29312 return 0;
29313
29314 case IX86_BUILTIN_VEC_INIT_V2SI:
29315 case IX86_BUILTIN_VEC_INIT_V4HI:
29316 case IX86_BUILTIN_VEC_INIT_V8QI:
29317 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29318
29319 case IX86_BUILTIN_VEC_EXT_V2DF:
29320 case IX86_BUILTIN_VEC_EXT_V2DI:
29321 case IX86_BUILTIN_VEC_EXT_V4SF:
29322 case IX86_BUILTIN_VEC_EXT_V4SI:
29323 case IX86_BUILTIN_VEC_EXT_V8HI:
29324 case IX86_BUILTIN_VEC_EXT_V2SI:
29325 case IX86_BUILTIN_VEC_EXT_V4HI:
29326 case IX86_BUILTIN_VEC_EXT_V16QI:
29327 return ix86_expand_vec_ext_builtin (exp, target);
29328
29329 case IX86_BUILTIN_VEC_SET_V2DI:
29330 case IX86_BUILTIN_VEC_SET_V4SF:
29331 case IX86_BUILTIN_VEC_SET_V4SI:
29332 case IX86_BUILTIN_VEC_SET_V8HI:
29333 case IX86_BUILTIN_VEC_SET_V4HI:
29334 case IX86_BUILTIN_VEC_SET_V16QI:
29335 return ix86_expand_vec_set_builtin (exp);
29336
29337 case IX86_BUILTIN_INFQ:
29338 case IX86_BUILTIN_HUGE_VALQ:
29339 {
29340 REAL_VALUE_TYPE inf;
29341 rtx tmp;
29342
29343 real_inf (&inf);
29344 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29345
29346 tmp = validize_mem (force_const_mem (mode, tmp));
29347
29348 if (target == 0)
29349 target = gen_reg_rtx (mode);
29350
29351 emit_move_insn (target, tmp);
29352 return target;
29353 }
29354
29355 case IX86_BUILTIN_LLWPCB:
29356 arg0 = CALL_EXPR_ARG (exp, 0);
29357 op0 = expand_normal (arg0);
29358 icode = CODE_FOR_lwp_llwpcb;
29359 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29360 {
29361 if (GET_MODE (op0) != Pmode)
29362 op0 = convert_to_mode (Pmode, op0, 1);
29363 op0 = force_reg (Pmode, op0);
29364 }
29365 emit_insn (gen_lwp_llwpcb (op0));
29366 return 0;
29367
29368 case IX86_BUILTIN_SLWPCB:
29369 icode = CODE_FOR_lwp_slwpcb;
29370 if (!target
29371 || !insn_data[icode].operand[0].predicate (target, Pmode))
29372 target = gen_reg_rtx (Pmode);
29373 emit_insn (gen_lwp_slwpcb (target));
29374 return target;
29375
29376 case IX86_BUILTIN_BEXTRI32:
29377 case IX86_BUILTIN_BEXTRI64:
29378 arg0 = CALL_EXPR_ARG (exp, 0);
29379 arg1 = CALL_EXPR_ARG (exp, 1);
29380 op0 = expand_normal (arg0);
29381 op1 = expand_normal (arg1);
29382 icode = (fcode == IX86_BUILTIN_BEXTRI32
29383 ? CODE_FOR_tbm_bextri_si
29384 : CODE_FOR_tbm_bextri_di);
29385 if (!CONST_INT_P (op1))
29386 {
29387 error ("last argument must be an immediate");
29388 return const0_rtx;
29389 }
29390 else
29391 {
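/* The immediate packs the bit-field start position in bits 7:0 and its
 length in bits 15:8; split it into the two operands the tbm_bextri_*
 patterns expect. */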
29392 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29393 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29394 op1 = GEN_INT (length);
29395 op2 = GEN_INT (lsb_index);
29396 pat = GEN_FCN (icode) (target, op0, op1, op2);
29397 if (pat)
29398 emit_insn (pat);
29399 return target;
29400 }
29401
29402 case IX86_BUILTIN_RDRAND16_STEP:
29403 icode = CODE_FOR_rdrandhi_1;
29404 mode0 = HImode;
29405 goto rdrand_step;
29406
29407 case IX86_BUILTIN_RDRAND32_STEP:
29408 icode = CODE_FOR_rdrandsi_1;
29409 mode0 = SImode;
29410 goto rdrand_step;
29411
29412 case IX86_BUILTIN_RDRAND64_STEP:
29413 icode = CODE_FOR_rdranddi_1;
29414 mode0 = DImode;
29415
29416 rdrand_step:
29417 op0 = gen_reg_rtx (mode0);
29418 emit_insn (GEN_FCN (icode) (op0));
29419
29420 arg0 = CALL_EXPR_ARG (exp, 0);
29421 op1 = expand_normal (arg0);
29422 if (!address_operand (op1, VOIDmode))
29423 {
29424 op1 = convert_memory_address (Pmode, op1);
29425 op1 = copy_addr_to_reg (op1);
29426 }
29427 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29428
29429 op1 = gen_reg_rtx (SImode);
29430 emit_move_insn (op1, CONST1_RTX (SImode));
29431
29432 /* Emit SImode conditional move. */
29433 if (mode0 == HImode)
29434 {
29435 op2 = gen_reg_rtx (SImode);
29436 emit_insn (gen_zero_extendhisi2 (op2, op0));
29437 }
29438 else if (mode0 == SImode)
29439 op2 = op0;
29440 else
29441 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29442
29443 if (target == 0)
29444 target = gen_reg_rtx (SImode);
29445
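/* rdrand clears both CF and the destination register when no random
 value is available, so selecting the (then zero) result when CF is
 clear and the constant 1 when CF is set yields the 0/1 status the
 *_step builtins return. */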
29446 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29447 const0_rtx);
29448 emit_insn (gen_rtx_SET (VOIDmode, target,
29449 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29450 return target;
29451
29452 case IX86_BUILTIN_GATHERSIV2DF:
29453 icode = CODE_FOR_avx2_gathersiv2df;
29454 goto gather_gen;
29455 case IX86_BUILTIN_GATHERSIV4DF:
29456 icode = CODE_FOR_avx2_gathersiv4df;
29457 goto gather_gen;
29458 case IX86_BUILTIN_GATHERDIV2DF:
29459 icode = CODE_FOR_avx2_gatherdiv2df;
29460 goto gather_gen;
29461 case IX86_BUILTIN_GATHERDIV4DF:
29462 icode = CODE_FOR_avx2_gatherdiv4df;
29463 goto gather_gen;
29464 case IX86_BUILTIN_GATHERSIV4SF:
29465 icode = CODE_FOR_avx2_gathersiv4sf;
29466 goto gather_gen;
29467 case IX86_BUILTIN_GATHERSIV8SF:
29468 icode = CODE_FOR_avx2_gathersiv8sf;
29469 goto gather_gen;
29470 case IX86_BUILTIN_GATHERDIV4SF:
29471 icode = CODE_FOR_avx2_gatherdiv4sf;
29472 goto gather_gen;
29473 case IX86_BUILTIN_GATHERDIV8SF:
29474 icode = CODE_FOR_avx2_gatherdiv8sf;
29475 goto gather_gen;
29476 case IX86_BUILTIN_GATHERSIV2DI:
29477 icode = CODE_FOR_avx2_gathersiv2di;
29478 goto gather_gen;
29479 case IX86_BUILTIN_GATHERSIV4DI:
29480 icode = CODE_FOR_avx2_gathersiv4di;
29481 goto gather_gen;
29482 case IX86_BUILTIN_GATHERDIV2DI:
29483 icode = CODE_FOR_avx2_gatherdiv2di;
29484 goto gather_gen;
29485 case IX86_BUILTIN_GATHERDIV4DI:
29486 icode = CODE_FOR_avx2_gatherdiv4di;
29487 goto gather_gen;
29488 case IX86_BUILTIN_GATHERSIV4SI:
29489 icode = CODE_FOR_avx2_gathersiv4si;
29490 goto gather_gen;
29491 case IX86_BUILTIN_GATHERSIV8SI:
29492 icode = CODE_FOR_avx2_gathersiv8si;
29493 goto gather_gen;
29494 case IX86_BUILTIN_GATHERDIV4SI:
29495 icode = CODE_FOR_avx2_gatherdiv4si;
29496 goto gather_gen;
29497 case IX86_BUILTIN_GATHERDIV8SI:
29498 icode = CODE_FOR_avx2_gatherdiv8si;
29499 goto gather_gen;
29500 case IX86_BUILTIN_GATHERALTSIV4DF:
29501 icode = CODE_FOR_avx2_gathersiv4df;
29502 goto gather_gen;
29503 case IX86_BUILTIN_GATHERALTDIV8SF:
29504 icode = CODE_FOR_avx2_gatherdiv8sf;
29505 goto gather_gen;
29506 case IX86_BUILTIN_GATHERALTSIV4DI:
29507 icode = CODE_FOR_avx2_gathersiv4di;
29508 goto gather_gen;
29509 case IX86_BUILTIN_GATHERALTDIV8SI:
29510 icode = CODE_FOR_avx2_gatherdiv8si;
29511 goto gather_gen;
29512
29513 gather_gen:
29514 arg0 = CALL_EXPR_ARG (exp, 0);
29515 arg1 = CALL_EXPR_ARG (exp, 1);
29516 arg2 = CALL_EXPR_ARG (exp, 2);
29517 arg3 = CALL_EXPR_ARG (exp, 3);
29518 arg4 = CALL_EXPR_ARG (exp, 4);
29519 op0 = expand_normal (arg0);
29520 op1 = expand_normal (arg1);
29521 op2 = expand_normal (arg2);
29522 op3 = expand_normal (arg3);
29523 op4 = expand_normal (arg4);
29524 /* Note the arg order is different from the operand order. */
29525 mode0 = insn_data[icode].operand[1].mode;
29526 mode2 = insn_data[icode].operand[3].mode;
29527 mode3 = insn_data[icode].operand[4].mode;
29528 mode4 = insn_data[icode].operand[5].mode;
29529
29530 if (target == NULL_RTX
29531 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29532 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29533 else
29534 subtarget = target;
29535
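/* The GATHERALT* variants are registered with one vector argument twice
 as wide as the underlying gather insn uses; extract the low half of
 the over-wide operand(s) so the operands match the insn's modes. */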
29536 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29537 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29538 {
29539 rtx half = gen_reg_rtx (V4SImode);
29540 if (!nonimmediate_operand (op2, V8SImode))
29541 op2 = copy_to_mode_reg (V8SImode, op2);
29542 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29543 op2 = half;
29544 }
29545 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29546 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29547 {
29548 rtx (*gen) (rtx, rtx);
29549 rtx half = gen_reg_rtx (mode0);
29550 if (mode0 == V4SFmode)
29551 gen = gen_vec_extract_lo_v8sf;
29552 else
29553 gen = gen_vec_extract_lo_v8si;
29554 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29555 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29556 emit_insn (gen (half, op0));
29557 op0 = half;
29558 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29559 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29560 emit_insn (gen (half, op3));
29561 op3 = half;
29562 }
29563
29564 /* Force the memory operand to use only a base register here. We
29565 don't want to do this for the memory operands of other builtin
29566 functions. */
29567 if (GET_MODE (op1) != Pmode)
29568 op1 = convert_to_mode (Pmode, op1, 1);
29569 op1 = force_reg (Pmode, op1);
29570
29571 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29572 op0 = copy_to_mode_reg (mode0, op0);
29573 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29574 op1 = copy_to_mode_reg (Pmode, op1);
29575 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29576 op2 = copy_to_mode_reg (mode2, op2);
29577 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29578 op3 = copy_to_mode_reg (mode3, op3);
29579 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29580 {
29581 error ("last argument must be scale 1, 2, 4, 8");
29582 return const0_rtx;
29583 }
29584
29585 /* Optimize. If mask is known to have all high bits set,
29586 replace op0 with pc_rtx to signal that the instruction
29587 overwrites the whole destination and doesn't use its
29588 previous contents. */
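/* Each mask element enables its lane via its sign bit, so a mask known
 to be all-negative means every lane is written and the previous
 destination contents are dead. */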
29589 if (optimize)
29590 {
29591 if (TREE_CODE (arg3) == VECTOR_CST)
29592 {
29593 tree elt;
29594 unsigned int negative = 0;
29595 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29596 elt; elt = TREE_CHAIN (elt))
29597 {
29598 tree cst = TREE_VALUE (elt);
29599 if (TREE_CODE (cst) == INTEGER_CST
29600 && tree_int_cst_sign_bit (cst))
29601 negative++;
29602 else if (TREE_CODE (cst) == REAL_CST
29603 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29604 negative++;
29605 }
29606 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29607 op0 = pc_rtx;
29608 }
29609 else if (TREE_CODE (arg3) == SSA_NAME)
29610 {
29611 /* Recognize also when mask is like:
29612 __v2df src = _mm_setzero_pd ();
29613 __v2df mask = _mm_cmpeq_pd (src, src);
29614 or
29615 __v8sf src = _mm256_setzero_ps ();
29616 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29617 as that is a cheaper way to load all ones into
29618 a register than having to load a constant from
29619 memory. */
29620 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29621 if (is_gimple_call (def_stmt))
29622 {
29623 tree fndecl = gimple_call_fndecl (def_stmt);
29624 if (fndecl
29625 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29626 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29627 {
29628 case IX86_BUILTIN_CMPPD:
29629 case IX86_BUILTIN_CMPPS:
29630 case IX86_BUILTIN_CMPPD256:
29631 case IX86_BUILTIN_CMPPS256:
29632 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29633 break;
29634 /* FALLTHRU */
29635 case IX86_BUILTIN_CMPEQPD:
29636 case IX86_BUILTIN_CMPEQPS:
29637 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29638 && initializer_zerop (gimple_call_arg (def_stmt,
29639 1)))
29640 op0 = pc_rtx;
29641 break;
29642 default:
29643 break;
29644 }
29645 }
29646 }
29647 }
29648
29649 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29650 if (! pat)
29651 return const0_rtx;
29652 emit_insn (pat);
29653
29654 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29655 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29656 {
29657 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29658 ? V4SFmode : V4SImode;
29659 if (target == NULL_RTX)
29660 target = gen_reg_rtx (tmode);
29661 if (tmode == V4SFmode)
29662 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29663 else
29664 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29665 }
29666 else
29667 target = subtarget;
29668
29669 return target;
29670
29671 default:
29672 break;
29673 }
29674
29675 for (i = 0, d = bdesc_special_args;
29676 i < ARRAY_SIZE (bdesc_special_args);
29677 i++, d++)
29678 if (d->code == fcode)
29679 return ix86_expand_special_args_builtin (d, exp, target);
29680
29681 for (i = 0, d = bdesc_args;
29682 i < ARRAY_SIZE (bdesc_args);
29683 i++, d++)
29684 if (d->code == fcode)
29685 switch (fcode)
29686 {
29687 case IX86_BUILTIN_FABSQ:
29688 case IX86_BUILTIN_COPYSIGNQ:
29689 if (!TARGET_SSE2)
29690 /* Emit a normal call if SSE2 isn't available. */
29691 return expand_call (exp, target, ignore);
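/* FALLTHRU */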
29692 default:
29693 return ix86_expand_args_builtin (d, exp, target);
29694 }
29695
29696 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29697 if (d->code == fcode)
29698 return ix86_expand_sse_comi (d, exp, target);
29699
29700 for (i = 0, d = bdesc_pcmpestr;
29701 i < ARRAY_SIZE (bdesc_pcmpestr);
29702 i++, d++)
29703 if (d->code == fcode)
29704 return ix86_expand_sse_pcmpestr (d, exp, target);
29705
29706 for (i = 0, d = bdesc_pcmpistr;
29707 i < ARRAY_SIZE (bdesc_pcmpistr);
29708 i++, d++)
29709 if (d->code == fcode)
29710 return ix86_expand_sse_pcmpistr (d, exp, target);
29711
29712 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29713 if (d->code == fcode)
29714 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29715 (enum ix86_builtin_func_type)
29716 d->flag, d->comparison);
29717
29718 gcc_unreachable ();
29719 }
29720
29721 /* Returns a function decl for a vectorized version of the builtin function
29722 with builtin function code FN, result vector type TYPE_OUT and argument
29723 vector type TYPE_IN, or NULL_TREE if it is not available. */
29724
29725 static tree
29726 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29727 tree type_in)
29728 {
29729 enum machine_mode in_mode, out_mode;
29730 int in_n, out_n;
29731 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29732
29733 if (TREE_CODE (type_out) != VECTOR_TYPE
29734 || TREE_CODE (type_in) != VECTOR_TYPE
29735 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29736 return NULL_TREE;
29737
29738 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29739 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29740 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29741 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29742
29743 switch (fn)
29744 {
29745 case BUILT_IN_SQRT:
29746 if (out_mode == DFmode && in_mode == DFmode)
29747 {
29748 if (out_n == 2 && in_n == 2)
29749 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29750 else if (out_n == 4 && in_n == 4)
29751 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29752 }
29753 break;
29754
29755 case BUILT_IN_SQRTF:
29756 if (out_mode == SFmode && in_mode == SFmode)
29757 {
29758 if (out_n == 4 && in_n == 4)
29759 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29760 else if (out_n == 8 && in_n == 8)
29761 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29762 }
29763 break;
29764
29765 case BUILT_IN_IFLOOR:
29766 case BUILT_IN_LFLOOR:
29767 case BUILT_IN_LLFLOOR:
29768 /* The round insn does not trap on denormals. */
29769 if (flag_trapping_math || !TARGET_ROUND)
29770 break;
29771
29772 if (out_mode == SImode && in_mode == DFmode)
29773 {
29774 if (out_n == 4 && in_n == 2)
29775 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29776 else if (out_n == 8 && in_n == 4)
29777 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29778 }
29779 break;
29780
29781 case BUILT_IN_IFLOORF:
29782 case BUILT_IN_LFLOORF:
29783 case BUILT_IN_LLFLOORF:
29784 /* The round insn does not trap on denormals. */
29785 if (flag_trapping_math || !TARGET_ROUND)
29786 break;
29787
29788 if (out_mode == SImode && in_mode == SFmode)
29789 {
29790 if (out_n == 4 && in_n == 4)
29791 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29792 else if (out_n == 8 && in_n == 8)
29793 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29794 }
29795 break;
29796
29797 case BUILT_IN_ICEIL:
29798 case BUILT_IN_LCEIL:
29799 case BUILT_IN_LLCEIL:
29800 /* The round insn does not trap on denormals. */
29801 if (flag_trapping_math || !TARGET_ROUND)
29802 break;
29803
29804 if (out_mode == SImode && in_mode == DFmode)
29805 {
29806 if (out_n == 4 && in_n == 2)
29807 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29808 else if (out_n == 8 && in_n == 4)
29809 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29810 }
29811 break;
29812
29813 case BUILT_IN_ICEILF:
29814 case BUILT_IN_LCEILF:
29815 case BUILT_IN_LLCEILF:
29816 /* The round insn does not trap on denormals. */
29817 if (flag_trapping_math || !TARGET_ROUND)
29818 break;
29819
29820 if (out_mode == SImode && in_mode == SFmode)
29821 {
29822 if (out_n == 4 && in_n == 4)
29823 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29824 else if (out_n == 8 && in_n == 8)
29825 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29826 }
29827 break;
29828
29829 case BUILT_IN_IRINT:
29830 case BUILT_IN_LRINT:
29831 case BUILT_IN_LLRINT:
29832 if (out_mode == SImode && in_mode == DFmode)
29833 {
29834 if (out_n == 4 && in_n == 2)
29835 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29836 else if (out_n == 8 && in_n == 4)
29837 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29838 }
29839 break;
29840
29841 case BUILT_IN_IRINTF:
29842 case BUILT_IN_LRINTF:
29843 case BUILT_IN_LLRINTF:
29844 if (out_mode == SImode && in_mode == SFmode)
29845 {
29846 if (out_n == 4 && in_n == 4)
29847 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29848 else if (out_n == 8 && in_n == 8)
29849 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29850 }
29851 break;
29852
29853 case BUILT_IN_IROUND:
29854 case BUILT_IN_LROUND:
29855 case BUILT_IN_LLROUND:
29856 /* The round insn does not trap on denormals. */
29857 if (flag_trapping_math || !TARGET_ROUND)
29858 break;
29859
29860 if (out_mode == SImode && in_mode == DFmode)
29861 {
29862 if (out_n == 4 && in_n == 2)
29863 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29864 else if (out_n == 8 && in_n == 4)
29865 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29866 }
29867 break;
29868
29869 case BUILT_IN_IROUNDF:
29870 case BUILT_IN_LROUNDF:
29871 case BUILT_IN_LLROUNDF:
29872 /* The round insn does not trap on denormals. */
29873 if (flag_trapping_math || !TARGET_ROUND)
29874 break;
29875
29876 if (out_mode == SImode && in_mode == SFmode)
29877 {
29878 if (out_n == 4 && in_n == 4)
29879 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29880 else if (out_n == 8 && in_n == 8)
29881 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29882 }
29883 break;
29884
29885 case BUILT_IN_COPYSIGN:
29886 if (out_mode == DFmode && in_mode == DFmode)
29887 {
29888 if (out_n == 2 && in_n == 2)
29889 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29890 else if (out_n == 4 && in_n == 4)
29891 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29892 }
29893 break;
29894
29895 case BUILT_IN_COPYSIGNF:
29896 if (out_mode == SFmode && in_mode == SFmode)
29897 {
29898 if (out_n == 4 && in_n == 4)
29899 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29900 else if (out_n == 8 && in_n == 8)
29901 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29902 }
29903 break;
29904
29905 case BUILT_IN_FLOOR:
29906 /* The round insn does not trap on denormals. */
29907 if (flag_trapping_math || !TARGET_ROUND)
29908 break;
29909
29910 if (out_mode == DFmode && in_mode == DFmode)
29911 {
29912 if (out_n == 2 && in_n == 2)
29913 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29914 else if (out_n == 4 && in_n == 4)
29915 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29916 }
29917 break;
29918
29919 case BUILT_IN_FLOORF:
29920 /* The round insn does not trap on denormals. */
29921 if (flag_trapping_math || !TARGET_ROUND)
29922 break;
29923
29924 if (out_mode == SFmode && in_mode == SFmode)
29925 {
29926 if (out_n == 4 && in_n == 4)
29927 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29928 else if (out_n == 8 && in_n == 8)
29929 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29930 }
29931 break;
29932
29933 case BUILT_IN_CEIL:
29934 /* The round insn does not trap on denormals. */
29935 if (flag_trapping_math || !TARGET_ROUND)
29936 break;
29937
29938 if (out_mode == DFmode && in_mode == DFmode)
29939 {
29940 if (out_n == 2 && in_n == 2)
29941 return ix86_builtins[IX86_BUILTIN_CEILPD];
29942 else if (out_n == 4 && in_n == 4)
29943 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29944 }
29945 break;
29946
29947 case BUILT_IN_CEILF:
29948 /* The round insn does not trap on denormals. */
29949 if (flag_trapping_math || !TARGET_ROUND)
29950 break;
29951
29952 if (out_mode == SFmode && in_mode == SFmode)
29953 {
29954 if (out_n == 4 && in_n == 4)
29955 return ix86_builtins[IX86_BUILTIN_CEILPS];
29956 else if (out_n == 8 && in_n == 8)
29957 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29958 }
29959 break;
29960
29961 case BUILT_IN_TRUNC:
29962 /* The round insn does not trap on denormals. */
29963 if (flag_trapping_math || !TARGET_ROUND)
29964 break;
29965
29966 if (out_mode == DFmode && in_mode == DFmode)
29967 {
29968 if (out_n == 2 && in_n == 2)
29969 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29970 else if (out_n == 4 && in_n == 4)
29971 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29972 }
29973 break;
29974
29975 case BUILT_IN_TRUNCF:
29976 /* The round insn does not trap on denormals. */
29977 if (flag_trapping_math || !TARGET_ROUND)
29978 break;
29979
29980 if (out_mode == SFmode && in_mode == SFmode)
29981 {
29982 if (out_n == 4 && in_n == 4)
29983 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29984 else if (out_n == 8 && in_n == 8)
29985 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29986 }
29987 break;
29988
29989 case BUILT_IN_RINT:
29990 /* The round insn does not trap on denormals. */
29991 if (flag_trapping_math || !TARGET_ROUND)
29992 break;
29993
29994 if (out_mode == DFmode && in_mode == DFmode)
29995 {
29996 if (out_n == 2 && in_n == 2)
29997 return ix86_builtins[IX86_BUILTIN_RINTPD];
29998 else if (out_n == 4 && in_n == 4)
29999 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30000 }
30001 break;
30002
30003 case BUILT_IN_RINTF:
30004 /* The round insn does not trap on denormals. */
30005 if (flag_trapping_math || !TARGET_ROUND)
30006 break;
30007
30008 if (out_mode == SFmode && in_mode == SFmode)
30009 {
30010 if (out_n == 4 && in_n == 4)
30011 return ix86_builtins[IX86_BUILTIN_RINTPS];
30012 else if (out_n == 8 && in_n == 8)
30013 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30014 }
30015 break;
30016
30017 case BUILT_IN_ROUND:
30018 /* The round insn does not trap on denormals. */
30019 if (flag_trapping_math || !TARGET_ROUND)
30020 break;
30021
30022 if (out_mode == DFmode && in_mode == DFmode)
30023 {
30024 if (out_n == 2 && in_n == 2)
30025 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30026 else if (out_n == 4 && in_n == 4)
30027 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30028 }
30029 break;
30030
30031 case BUILT_IN_ROUNDF:
30032 /* The round insn does not trap on denormals. */
30033 if (flag_trapping_math || !TARGET_ROUND)
30034 break;
30035
30036 if (out_mode == SFmode && in_mode == SFmode)
30037 {
30038 if (out_n == 4 && in_n == 4)
30039 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30040 else if (out_n == 8 && in_n == 8)
30041 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30042 }
30043 break;
30044
30045 case BUILT_IN_FMA:
30046 if (out_mode == DFmode && in_mode == DFmode)
30047 {
30048 if (out_n == 2 && in_n == 2)
30049 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30050 if (out_n == 4 && in_n == 4)
30051 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30052 }
30053 break;
30054
30055 case BUILT_IN_FMAF:
30056 if (out_mode == SFmode && in_mode == SFmode)
30057 {
30058 if (out_n == 4 && in_n == 4)
30059 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30060 if (out_n == 8 && in_n == 8)
30061 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30062 }
30063 break;
30064
30065 default:
30066 break;
30067 }
30068
30069 /* Dispatch to a handler for a vectorization library. */
30070 if (ix86_veclib_handler)
30071 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30072 type_in);
30073
30074 return NULL_TREE;
30075 }
30076
30077 /* Handler for an SVML-style interface to
30078 a library with vectorized intrinsics. */
30079
30080 static tree
30081 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30082 {
30083 char name[20];
30084 tree fntype, new_fndecl, args;
30085 unsigned arity;
30086 const char *bname;
30087 enum machine_mode el_mode, in_mode;
30088 int n, in_n;
30089
30090 /* The SVML is suitable for unsafe math only. */
30091 if (!flag_unsafe_math_optimizations)
30092 return NULL_TREE;
30093
30094 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30095 n = TYPE_VECTOR_SUBPARTS (type_out);
30096 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30097 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30098 if (el_mode != in_mode
30099 || n != in_n)
30100 return NULL_TREE;
30101
30102 switch (fn)
30103 {
30104 case BUILT_IN_EXP:
30105 case BUILT_IN_LOG:
30106 case BUILT_IN_LOG10:
30107 case BUILT_IN_POW:
30108 case BUILT_IN_TANH:
30109 case BUILT_IN_TAN:
30110 case BUILT_IN_ATAN:
30111 case BUILT_IN_ATAN2:
30112 case BUILT_IN_ATANH:
30113 case BUILT_IN_CBRT:
30114 case BUILT_IN_SINH:
30115 case BUILT_IN_SIN:
30116 case BUILT_IN_ASINH:
30117 case BUILT_IN_ASIN:
30118 case BUILT_IN_COSH:
30119 case BUILT_IN_COS:
30120 case BUILT_IN_ACOSH:
30121 case BUILT_IN_ACOS:
30122 if (el_mode != DFmode || n != 2)
30123 return NULL_TREE;
30124 break;
30125
30126 case BUILT_IN_EXPF:
30127 case BUILT_IN_LOGF:
30128 case BUILT_IN_LOG10F:
30129 case BUILT_IN_POWF:
30130 case BUILT_IN_TANHF:
30131 case BUILT_IN_TANF:
30132 case BUILT_IN_ATANF:
30133 case BUILT_IN_ATAN2F:
30134 case BUILT_IN_ATANHF:
30135 case BUILT_IN_CBRTF:
30136 case BUILT_IN_SINHF:
30137 case BUILT_IN_SINF:
30138 case BUILT_IN_ASINHF:
30139 case BUILT_IN_ASINF:
30140 case BUILT_IN_COSHF:
30141 case BUILT_IN_COSF:
30142 case BUILT_IN_ACOSHF:
30143 case BUILT_IN_ACOSF:
30144 if (el_mode != SFmode || n != 4)
30145 return NULL_TREE;
30146 break;
30147
30148 default:
30149 return NULL_TREE;
30150 }
30151
30152 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30153
30154 if (fn == BUILT_IN_LOGF)
30155 strcpy (name, "vmlsLn4");
30156 else if (fn == BUILT_IN_LOG)
30157 strcpy (name, "vmldLn2");
30158 else if (n == 4)
30159 {
30160 sprintf (name, "vmls%s", bname+10);
30161 name[strlen (name)-1] = '4';
30162 }
30163 else
30164 sprintf (name, "vmld%s2", bname+10);
30165
30166 /* Uppercase the first letter of the function name part. */
30167 name[4] &= ~0x20;
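/* E.g. BUILT_IN_SINF becomes "vmlsSin4" and BUILT_IN_POW becomes
 "vmldPow2"; log and logf are special-cased to the "Ln" names above. */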
30168
30169 arity = 0;
30170 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30171 args;
30172 args = TREE_CHAIN (args))
30173 arity++;
30174
30175 if (arity == 1)
30176 fntype = build_function_type_list (type_out, type_in, NULL);
30177 else
30178 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30179
30180 /* Build a function declaration for the vectorized function. */
30181 new_fndecl = build_decl (BUILTINS_LOCATION,
30182 FUNCTION_DECL, get_identifier (name), fntype);
30183 TREE_PUBLIC (new_fndecl) = 1;
30184 DECL_EXTERNAL (new_fndecl) = 1;
30185 DECL_IS_NOVOPS (new_fndecl) = 1;
30186 TREE_READONLY (new_fndecl) = 1;
30187
30188 return new_fndecl;
30189 }
30190
30191 /* Handler for an ACML-style interface to
30192 a library with vectorized intrinsics. */
30193
30194 static tree
30195 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30196 {
30197 char name[20] = "__vr.._";
30198 tree fntype, new_fndecl, args;
30199 unsigned arity;
30200 const char *bname;
30201 enum machine_mode el_mode, in_mode;
30202 int n, in_n;
30203
30204 /* The ACML is 64bit only and suitable for unsafe math only, as
30205 it does not correctly support parts of IEEE semantics, such as
30206 denormals, with the required precision. */
30207 if (!TARGET_64BIT
30208 || !flag_unsafe_math_optimizations)
30209 return NULL_TREE;
30210
30211 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30212 n = TYPE_VECTOR_SUBPARTS (type_out);
30213 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30214 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30215 if (el_mode != in_mode
30216 || n != in_n)
30217 return NULL_TREE;
30218
30219 switch (fn)
30220 {
30221 case BUILT_IN_SIN:
30222 case BUILT_IN_COS:
30223 case BUILT_IN_EXP:
30224 case BUILT_IN_LOG:
30225 case BUILT_IN_LOG2:
30226 case BUILT_IN_LOG10:
30227 name[4] = 'd';
30228 name[5] = '2';
30229 if (el_mode != DFmode
30230 || n != 2)
30231 return NULL_TREE;
30232 break;
30233
30234 case BUILT_IN_SINF:
30235 case BUILT_IN_COSF:
30236 case BUILT_IN_EXPF:
30237 case BUILT_IN_POWF:
30238 case BUILT_IN_LOGF:
30239 case BUILT_IN_LOG2F:
30240 case BUILT_IN_LOG10F:
30241 name[4] = 's';
30242 name[5] = '4';
30243 if (el_mode != SFmode
30244 || n != 4)
30245 return NULL_TREE;
30246 break;
30247
30248 default:
30249 return NULL_TREE;
30250 }
30251
30252 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30253 sprintf (name + 7, "%s", bname+10);
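/* E.g. BUILT_IN_COS becomes "__vrd2_cos" and BUILT_IN_SINF becomes
 "__vrs4_sinf". */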
30254
30255 arity = 0;
30256 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30257 args;
30258 args = TREE_CHAIN (args))
30259 arity++;
30260
30261 if (arity == 1)
30262 fntype = build_function_type_list (type_out, type_in, NULL);
30263 else
30264 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30265
30266 /* Build a function declaration for the vectorized function. */
30267 new_fndecl = build_decl (BUILTINS_LOCATION,
30268 FUNCTION_DECL, get_identifier (name), fntype);
30269 TREE_PUBLIC (new_fndecl) = 1;
30270 DECL_EXTERNAL (new_fndecl) = 1;
30271 DECL_IS_NOVOPS (new_fndecl) = 1;
30272 TREE_READONLY (new_fndecl) = 1;
30273
30274 return new_fndecl;
30275 }
30276
30277 /* Returns a decl of a function that implements gather load with
30278 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30279 Return NULL_TREE if it is not available. */
30280
30281 static tree
30282 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30283 const_tree index_type, int scale)
30284 {
30285 bool si;
30286 enum ix86_builtins code;
30287
30288 if (! TARGET_AVX2)
30289 return NULL_TREE;
30290
30291 if ((TREE_CODE (index_type) != INTEGER_TYPE
30292 && !POINTER_TYPE_P (index_type))
30293 || (TYPE_MODE (index_type) != SImode
30294 && TYPE_MODE (index_type) != DImode))
30295 return NULL_TREE;
30296
30297 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30298 return NULL_TREE;
30299
30300 /* v*gather* insn sign extends index to pointer mode. */
30301 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30302 && TYPE_UNSIGNED (index_type))
30303 return NULL_TREE;
30304
30305 if (scale <= 0
30306 || scale > 8
30307 || (scale & (scale - 1)) != 0)
30308 return NULL_TREE;
30309
30310 si = TYPE_MODE (index_type) == SImode;
30311 switch (TYPE_MODE (mem_vectype))
30312 {
30313 case V2DFmode:
30314 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30315 break;
30316 case V4DFmode:
30317 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30318 break;
30319 case V2DImode:
30320 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30321 break;
30322 case V4DImode:
30323 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30324 break;
30325 case V4SFmode:
30326 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30327 break;
30328 case V8SFmode:
30329 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30330 break;
30331 case V4SImode:
30332 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30333 break;
30334 case V8SImode:
30335 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30336 break;
30337 default:
30338 return NULL_TREE;
30339 }
30340
30341 return ix86_builtins[code];
30342 }
30343
30344 /* Returns the decl of a target-specific builtin that implements the
30345 reciprocal of the function, or NULL_TREE if not available. */
30346
30347 static tree
30348 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30349 bool sqrt ATTRIBUTE_UNUSED)
30350 {
30351 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30352 && flag_finite_math_only && !flag_trapping_math
30353 && flag_unsafe_math_optimizations))
30354 return NULL_TREE;
30355
30356 if (md_fn)
30357 /* Machine dependent builtins. */
30358 switch (fn)
30359 {
30360 /* Vectorized version of sqrt to rsqrt conversion. */
30361 case IX86_BUILTIN_SQRTPS_NR:
30362 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30363
30364 case IX86_BUILTIN_SQRTPS_NR256:
30365 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30366
30367 default:
30368 return NULL_TREE;
30369 }
30370 else
30371 /* Normal builtins. */
30372 switch (fn)
30373 {
30374 /* Sqrt to rsqrt conversion. */
30375 case BUILT_IN_SQRTF:
30376 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30377
30378 default:
30379 return NULL_TREE;
30380 }
30381 }
30382 \f
30383 /* Helper for avx_vpermilps256_operand et al. This is also used by
30384 the expansion functions to turn the parallel back into a mask.
30385 The return value is 0 for no match and the imm8+1 for a match. */
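/* For example, the V4DF selector (1 0 3 2) - swap the two doubles inside
 each 128-bit lane - encodes as imm8 0b0101, so this returns 6. */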
30386
30387 int
30388 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30389 {
30390 unsigned i, nelt = GET_MODE_NUNITS (mode);
30391 unsigned mask = 0;
30392 unsigned char ipar[8];
30393
30394 if (XVECLEN (par, 0) != (int) nelt)
30395 return 0;
30396
30397 /* Validate that all of the elements are constants, and not totally
30398 out of range. Copy the data into an integral array to make the
30399 subsequent checks easier. */
30400 for (i = 0; i < nelt; ++i)
30401 {
30402 rtx er = XVECEXP (par, 0, i);
30403 unsigned HOST_WIDE_INT ei;
30404
30405 if (!CONST_INT_P (er))
30406 return 0;
30407 ei = INTVAL (er);
30408 if (ei >= nelt)
30409 return 0;
30410 ipar[i] = ei;
30411 }
30412
30413 switch (mode)
30414 {
30415 case V4DFmode:
30416 /* In the 256-bit DFmode case, we can only move elements within
30417 a 128-bit lane. */
30418 for (i = 0; i < 2; ++i)
30419 {
30420 if (ipar[i] >= 2)
30421 return 0;
30422 mask |= ipar[i] << i;
30423 }
30424 for (i = 2; i < 4; ++i)
30425 {
30426 if (ipar[i] < 2)
30427 return 0;
30428 mask |= (ipar[i] - 2) << i;
30429 }
30430 break;
30431
30432 case V8SFmode:
30433 /* In the 256-bit SFmode case, we have full freedom of movement
30434 within the low 128-bit lane, but the high 128-bit lane must
30435 mirror the exact same pattern. */
30436 for (i = 0; i < 4; ++i)
30437 if (ipar[i] + 4 != ipar[i + 4])
30438 return 0;
30439 nelt = 4;
30440 /* FALLTHRU */
30441
30442 case V2DFmode:
30443 case V4SFmode:
30444 /* In the 128-bit case, we've full freedom in the placement of
30445 the elements from the source operand. */
30446 for (i = 0; i < nelt; ++i)
30447 mask |= ipar[i] << (i * (nelt / 2));
30448 break;
30449
30450 default:
30451 gcc_unreachable ();
30452 }
30453
30454 /* Make sure success has a non-zero value by adding one. */
30455 return mask + 1;
30456 }
30457
30458 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30459 the expansion functions to turn the parallel back into a mask.
30460 The return value is 0 for no match and the imm8+1 for a match. */
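/* For example, the V8SF selector (4 5 6 7 0 1 2 3) - exchange the two
 128-bit halves of the first operand - encodes as imm8 0x01, so this
 returns 2. */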
30461
30462 int
30463 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30464 {
30465 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30466 unsigned mask = 0;
30467 unsigned char ipar[8];
30468
30469 if (XVECLEN (par, 0) != (int) nelt)
30470 return 0;
30471
30472 /* Validate that all of the elements are constants, and not totally
30473 out of range. Copy the data into an integral array to make the
30474 subsequent checks easier. */
30475 for (i = 0; i < nelt; ++i)
30476 {
30477 rtx er = XVECEXP (par, 0, i);
30478 unsigned HOST_WIDE_INT ei;
30479
30480 if (!CONST_INT_P (er))
30481 return 0;
30482 ei = INTVAL (er);
30483 if (ei >= 2 * nelt)
30484 return 0;
30485 ipar[i] = ei;
30486 }
30487
30488 /* Validate that each half of the permute selects consecutive elements. */
30489 for (i = 0; i < nelt2 - 1; ++i)
30490 if (ipar[i] + 1 != ipar[i + 1])
30491 return 0;
30492 for (i = nelt2; i < nelt - 1; ++i)
30493 if (ipar[i] + 1 != ipar[i + 1])
30494 return 0;
30495
30496 /* Reconstruct the mask. */
30497 for (i = 0; i < 2; ++i)
30498 {
30499 unsigned e = ipar[i * nelt2];
30500 if (e % nelt2)
30501 return 0;
30502 e /= nelt2;
30503 mask |= e << (i * 4);
30504 }
30505
30506 /* Make sure success has a non-zero value by adding one. */
30507 return mask + 1;
30508 }
30509 \f
30510 /* Store OPERAND to memory after reload is completed. This means
30511 that we can't easily use assign_stack_local. */
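/* The slot lives either in the red zone or is pushed below the stack
 pointer; the matching deallocation is done by ix86_free_from_memory. */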
30512 rtx
30513 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30514 {
30515 rtx result;
30516
30517 gcc_assert (reload_completed);
30518 if (ix86_using_red_zone ())
30519 {
30520 result = gen_rtx_MEM (mode,
30521 gen_rtx_PLUS (Pmode,
30522 stack_pointer_rtx,
30523 GEN_INT (-RED_ZONE_SIZE)));
30524 emit_move_insn (result, operand);
30525 }
30526 else if (TARGET_64BIT)
30527 {
30528 switch (mode)
30529 {
30530 case HImode:
30531 case SImode:
30532 operand = gen_lowpart (DImode, operand);
30533 /* FALLTHRU */
30534 case DImode:
30535 emit_insn (
30536 gen_rtx_SET (VOIDmode,
30537 gen_rtx_MEM (DImode,
30538 gen_rtx_PRE_DEC (DImode,
30539 stack_pointer_rtx)),
30540 operand));
30541 break;
30542 default:
30543 gcc_unreachable ();
30544 }
30545 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30546 }
30547 else
30548 {
30549 switch (mode)
30550 {
30551 case DImode:
30552 {
30553 rtx operands[2];
30554 split_double_mode (mode, &operand, 1, operands, operands + 1);
30555 emit_insn (
30556 gen_rtx_SET (VOIDmode,
30557 gen_rtx_MEM (SImode,
30558 gen_rtx_PRE_DEC (Pmode,
30559 stack_pointer_rtx)),
30560 operands[1]));
30561 emit_insn (
30562 gen_rtx_SET (VOIDmode,
30563 gen_rtx_MEM (SImode,
30564 gen_rtx_PRE_DEC (Pmode,
30565 stack_pointer_rtx)),
30566 operands[0]));
30567 }
30568 break;
30569 case HImode:
30570 /* Store HImodes as SImodes. */
30571 operand = gen_lowpart (SImode, operand);
30572 /* FALLTHRU */
30573 case SImode:
30574 emit_insn (
30575 gen_rtx_SET (VOIDmode,
30576 gen_rtx_MEM (GET_MODE (operand),
30577 gen_rtx_PRE_DEC (SImode,
30578 stack_pointer_rtx)),
30579 operand));
30580 break;
30581 default:
30582 gcc_unreachable ();
30583 }
30584 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30585 }
30586 return result;
30587 }
30588
30589 /* Free the memory allocated by ix86_force_to_memory. */
30590 void
30591 ix86_free_from_memory (enum machine_mode mode)
30592 {
30593 if (!ix86_using_red_zone ())
30594 {
30595 int size;
30596
30597 if (mode == DImode || TARGET_64BIT)
30598 size = 8;
30599 else
30600 size = 4;
30601 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30602 to a pop or add instruction if registers are available. */
30603 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30604 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30605 GEN_INT (size))));
30606 }
30607 }
30608
30609 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30610
30611 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30612 QImode must go into class Q_REGS.
30613 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
30614 movdf to do mem-to-mem moves through integer regs. */
30615
30616 static reg_class_t
30617 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30618 {
30619 enum machine_mode mode = GET_MODE (x);
30620
30621 /* We're only allowed to return a subclass of CLASS. Many of the
30622 following checks fail for NO_REGS, so eliminate that early. */
30623 if (regclass == NO_REGS)
30624 return NO_REGS;
30625
30626 /* All classes can load zeros. */
30627 if (x == CONST0_RTX (mode))
30628 return regclass;
30629
30630 /* Force constants into memory if we are loading a (nonzero) constant into
30631 an MMX or SSE register. This is because there are no MMX/SSE instructions
30632 to load from a constant. */
30633 if (CONSTANT_P (x)
30634 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30635 return NO_REGS;
30636
30637 /* Prefer SSE regs only, if we can use them for math. */
30638 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30639 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30640
30641 /* Floating-point constants need more complex checks. */
30642 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30643 {
30644 /* General regs can load everything. */
30645 if (reg_class_subset_p (regclass, GENERAL_REGS))
30646 return regclass;
30647
30648 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30649 zero above. We only want to wind up preferring 80387 registers if
30650 we plan on doing computation with them. */
30651 if (TARGET_80387
30652 && standard_80387_constant_p (x) > 0)
30653 {
30654 /* Limit class to non-sse. */
30655 if (regclass == FLOAT_SSE_REGS)
30656 return FLOAT_REGS;
30657 if (regclass == FP_TOP_SSE_REGS)
30658 return FP_TOP_REG;
30659 if (regclass == FP_SECOND_SSE_REGS)
30660 return FP_SECOND_REG;
30661 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30662 return regclass;
30663 }
30664
30665 return NO_REGS;
30666 }
30667
30668 /* Generally when we see PLUS here, it's the function invariant
30669 (plus soft-fp const_int). Which can only be computed into general
30670 regs. */
30671 if (GET_CODE (x) == PLUS)
30672 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30673
30674 /* QImode constants are easy to load, but non-constant QImode data
30675 must go into Q_REGS. */
30676 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30677 {
30678 if (reg_class_subset_p (regclass, Q_REGS))
30679 return regclass;
30680 if (reg_class_subset_p (Q_REGS, regclass))
30681 return Q_REGS;
30682 return NO_REGS;
30683 }
30684
30685 return regclass;
30686 }
30687
30688 /* Discourage putting floating-point values in SSE registers unless
30689 SSE math is being used, and likewise for the 387 registers. */
30690 static reg_class_t
30691 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30692 {
30693 enum machine_mode mode = GET_MODE (x);
30694
30695 /* Restrict the output reload class to the register bank that we are
30696 doing math on. If we would have to return something that is not a
30697 subset of CLASS, reject this alternative by returning NO_REGS; if
30698 reload cannot do this, it will still use its own choice. */
30699 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30700 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30701
30702 if (X87_FLOAT_MODE_P (mode))
30703 {
30704 if (regclass == FP_TOP_SSE_REGS)
30705 return FP_TOP_REG;
30706 else if (regclass == FP_SECOND_SSE_REGS)
30707 return FP_SECOND_REG;
30708 else
30709 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30710 }
30711
30712 return regclass;
30713 }
30714
30715 static reg_class_t
30716 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30717 enum machine_mode mode, secondary_reload_info *sri)
30718 {
30719 /* Double-word spills from general registers to non-offsettable memory
30720 references (zero-extended addresses) require special handling. */
30721 if (TARGET_64BIT
30722 && MEM_P (x)
30723 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30724 && rclass == GENERAL_REGS
30725 && !offsettable_memref_p (x))
30726 {
30727 sri->icode = (in_p
30728 ? CODE_FOR_reload_noff_load
30729 : CODE_FOR_reload_noff_store);
30730 /* Add the cost of moving address to a temporary. */
30731 sri->extra_cost = 1;
30732
30733 return NO_REGS;
30734 }
30735
30736 /* QImode spills from non-QI registers require an
30737 intermediate register on 32bit targets. */
30738 if (!TARGET_64BIT
30739 && !in_p && mode == QImode
30740 && (rclass == GENERAL_REGS
30741 || rclass == LEGACY_REGS
30742 || rclass == INDEX_REGS))
30743 {
30744 int regno;
30745
30746 if (REG_P (x))
30747 regno = REGNO (x);
30748 else
30749 regno = -1;
30750
30751 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30752 regno = true_regnum (x);
30753
30754 /* Return Q_REGS if the operand is in memory. */
30755 if (regno == -1)
30756 return Q_REGS;
30757 }
30758
30759 /* This condition handles corner case where an expression involving
30760 pointers gets vectorized. We're trying to use the address of a
30761 stack slot as a vector initializer.
30762
30763 (set (reg:V2DI 74 [ vect_cst_.2 ])
30764 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30765
30766 Eventually frame gets turned into sp+offset like this:
30767
30768 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30769 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30770 (const_int 392 [0x188]))))
30771
30772 That later gets turned into:
30773
30774 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30775 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30776 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30777
30778 We'll have the following reload recorded:
30779
30780 Reload 0: reload_in (DI) =
30781 (plus:DI (reg/f:DI 7 sp)
30782 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30783 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30784 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30785 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30786 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30787 reload_reg_rtx: (reg:V2DI 22 xmm1)
30788
30789 Which isn't going to work since SSE instructions can't handle scalar
30790 additions. Returning GENERAL_REGS forces the addition into integer
30791 register and reload can handle subsequent reloads without problems. */
30792
30793 if (in_p && GET_CODE (x) == PLUS
30794 && SSE_CLASS_P (rclass)
30795 && SCALAR_INT_MODE_P (mode))
30796 return GENERAL_REGS;
30797
30798 return NO_REGS;
30799 }
30800
30801 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30802
30803 static bool
30804 ix86_class_likely_spilled_p (reg_class_t rclass)
30805 {
30806 switch (rclass)
30807 {
30808 case AREG:
30809 case DREG:
30810 case CREG:
30811 case BREG:
30812 case AD_REGS:
30813 case SIREG:
30814 case DIREG:
30815 case SSE_FIRST_REG:
30816 case FP_TOP_REG:
30817 case FP_SECOND_REG:
30818 return true;
30819
30820 default:
30821 break;
30822 }
30823
30824 return false;
30825 }
30826
30827 /* If we are copying between general and FP registers, we need a memory
30828 location. The same is true for SSE and MMX registers.
30829
30830 To optimize register_move_cost performance, allow inline variant.
30831
30832 The macro can't work reliably when one of the CLASSES is a class containing
30833 registers from multiple units (SSE, MMX, integer). We avoid this by never
30834 combining those units in a single alternative in the machine description.
30835 Ensure that this constraint holds to avoid unexpected surprises.
30836
30837 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30838 enforce these sanity checks. */
30839
30840 static inline bool
30841 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30842 enum machine_mode mode, int strict)
30843 {
30844 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30845 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30846 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30847 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30848 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30849 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30850 {
30851 gcc_assert (!strict);
30852 return true;
30853 }
30854
30855 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30856 return true;
30857
30858 /* ??? This is a lie. We do have moves between mmx/general and between
30859 mmx/sse2. But by saying we need secondary memory we discourage the
30860 register allocator from using the mmx registers unless needed. */
30861 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30862 return true;
30863
30864 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30865 {
30866 /* SSE1 doesn't have any direct moves from other classes. */
30867 if (!TARGET_SSE2)
30868 return true;
30869
30870 /* If the target says that inter-unit moves are more expensive
30871 than moving through memory, then don't generate them. */
30872 if (!TARGET_INTER_UNIT_MOVES)
30873 return true;
30874
30875 /* Between SSE and general, we have moves no larger than word size. */
30876 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30877 return true;
30878 }
30879
30880 return false;
30881 }
30882
30883 bool
30884 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30885 enum machine_mode mode, int strict)
30886 {
30887 return inline_secondary_memory_needed (class1, class2, mode, strict);
30888 }
30889
30890 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30891
30892 On the 80386, this is the size of MODE in words,
30893 except in the FP regs, where a single reg is always enough. */
30894
30895 static unsigned char
30896 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30897 {
30898 if (MAYBE_INTEGER_CLASS_P (rclass))
30899 {
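/* XFmode occupies 12 bytes with a 32-bit word size and 16 bytes with a
 64-bit word size, i.e. 3 or 2 integer registers (twice that for
 XCmode). */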
30900 if (mode == XFmode)
30901 return (TARGET_64BIT ? 2 : 3);
30902 else if (mode == XCmode)
30903 return (TARGET_64BIT ? 4 : 6);
30904 else
30905 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30906 }
30907 else
30908 {
30909 if (COMPLEX_MODE_P (mode))
30910 return 2;
30911 else
30912 return 1;
30913 }
30914 }
30915
30916 /* Return true if the registers in CLASS cannot represent the change from
30917 modes FROM to TO. */
30918
30919 bool
30920 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30921 enum reg_class regclass)
30922 {
30923 if (from == to)
30924 return false;
30925
30926 /* x87 registers can't do subreg at all, as all values are reformatted
30927 to extended precision. */
30928 if (MAYBE_FLOAT_CLASS_P (regclass))
30929 return true;
30930
30931 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30932 {
30933 /* Vector registers do not support QI or HImode loads. If we don't
30934 disallow a change to these modes, reload will assume it's ok to
30935 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30936 the vec_dupv4hi pattern. */
30937 if (GET_MODE_SIZE (from) < 4)
30938 return true;
30939
30940 /* Vector registers do not support subreg with nonzero offsets, which
30941 are otherwise valid for integer registers. Since we can't see
30942 whether we have a nonzero offset from here, prohibit all
30943 nonparadoxical subregs changing size. */
30944 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30945 return true;
30946 }
30947
30948 return false;
30949 }
30950
30951 /* Return the cost of moving data of mode M between a
30952 register and memory. A value of 2 is the default; this cost is
30953 relative to those in `REGISTER_MOVE_COST'.
30954
30955 This function is used extensively by register_move_cost, which is used to
30956 build tables at startup, so keep it inline.
30957 When IN is 2, return maximum of in and out move cost.
30958
30959 If moving between registers and memory is more expensive than
30960 between two registers, you should define this macro to express the
30961 relative cost.
30962
30963 Also model the increased cost of moving QImode values in non-Q_REGS
30964 classes.
30965 */
30966 static inline int
30967 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30968 int in)
30969 {
30970 int cost;
30971 if (FLOAT_CLASS_P (regclass))
30972 {
30973 int index;
30974 switch (mode)
30975 {
30976 case SFmode:
30977 index = 0;
30978 break;
30979 case DFmode:
30980 index = 1;
30981 break;
30982 case XFmode:
30983 index = 2;
30984 break;
30985 default:
30986 return 100;
30987 }
30988 if (in == 2)
30989 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30990 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30991 }
30992 if (SSE_CLASS_P (regclass))
30993 {
30994 int index;
30995 switch (GET_MODE_SIZE (mode))
30996 {
30997 case 4:
30998 index = 0;
30999 break;
31000 case 8:
31001 index = 1;
31002 break;
31003 case 16:
31004 index = 2;
31005 break;
31006 default:
31007 return 100;
31008 }
31009 if (in == 2)
31010 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31011 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31012 }
31013 if (MMX_CLASS_P (regclass))
31014 {
31015 int index;
31016 switch (GET_MODE_SIZE (mode))
31017 {
31018 case 4:
31019 index = 0;
31020 break;
31021 case 8:
31022 index = 1;
31023 break;
31024 default:
31025 return 100;
31026 }
31027 if (in == 2)
31028 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31029 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31030 }
31031 switch (GET_MODE_SIZE (mode))
31032 {
31033 case 1:
31034 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31035 {
31036 if (!in)
31037 return ix86_cost->int_store[0];
31038 if (TARGET_PARTIAL_REG_DEPENDENCY
31039 && optimize_function_for_speed_p (cfun))
31040 cost = ix86_cost->movzbl_load;
31041 else
31042 cost = ix86_cost->int_load[0];
31043 if (in == 2)
31044 return MAX (cost, ix86_cost->int_store[0]);
31045 return cost;
31046 }
31047 else
31048 {
31049 if (in == 2)
31050 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31051 if (in)
31052 return ix86_cost->movzbl_load;
31053 else
31054 return ix86_cost->int_store[0] + 4;
31055 }
31056 break;
31057 case 2:
31058 if (in == 2)
31059 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31060 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31061 default:
31062 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31063 if (mode == TFmode)
31064 mode = XFmode;
31065 if (in == 2)
31066 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31067 else if (in)
31068 cost = ix86_cost->int_load[2];
31069 else
31070 cost = ix86_cost->int_store[2];
31071 return (cost * (((int) GET_MODE_SIZE (mode)
31072 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31073 }
31074 }
31075
31076 static int
31077 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31078 bool in)
31079 {
31080 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31081 }
31082
31083
31084 /* Return the cost of moving data from a register in class CLASS1 to
31085 one in class CLASS2.
31086
31087 It is not required that the cost always equal 2 when FROM is the same as TO;
31088 on some machines it is expensive to move between registers if they are not
31089 general registers. */
31090
31091 static int
31092 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31093 reg_class_t class2_i)
31094 {
31095 enum reg_class class1 = (enum reg_class) class1_i;
31096 enum reg_class class2 = (enum reg_class) class2_i;
31097
31098 /* In case we require secondary memory, compute the cost of the store followed
31099 by the load. To avoid bad register allocation choices, we need this
31100 to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31101
31102 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31103 {
31104 int cost = 1;
31105
31106 cost += inline_memory_move_cost (mode, class1, 2);
31107 cost += inline_memory_move_cost (mode, class2, 2);
31108
31109 /* In the case of copying from a general-purpose register we may emit multiple
31110 stores followed by a single load, causing a memory-size-mismatch stall.
31111 Count this as an arbitrarily high cost of 20. */
31112 if (targetm.class_max_nregs (class1, mode)
31113 > targetm.class_max_nregs (class2, mode))
31114 cost += 20;
31115
31116 /* In the case of FP/MMX moves, the registers actually overlap, and we
31117 have to switch modes in order to treat them differently. */
31118 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31119 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31120 cost += 20;
31121
31122 return cost;
31123 }
31124
31125 /* Moves between SSE/MMX and integer unit are expensive. */
31126 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31127 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31128
31129 /* ??? By keeping the returned value relatively high, we limit the number
31130 of moves between integer and MMX/SSE registers for all targets.
31131 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
31132 where integer modes in MMX/SSE registers are not tieable
31133 because of missing QImode and HImode moves to, from or between
31134 MMX/SSE registers. */
31135 return MAX (8, ix86_cost->mmxsse_to_integer);
31136
31137 if (MAYBE_FLOAT_CLASS_P (class1))
31138 return ix86_cost->fp_move;
31139 if (MAYBE_SSE_CLASS_P (class1))
31140 return ix86_cost->sse_move;
31141 if (MAYBE_MMX_CLASS_P (class1))
31142 return ix86_cost->mmx_move;
31143 return 2;
31144 }
31145
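/* Editorial sketch (not part of the build): the shape of the secondary-memory
   path above - a store plus a load through memory, with fixed penalties for
   the size-mismatch and FP/MMX-overlap cases.  All names are hypothetical.  */
#if 0
static int
example_secondary_memory_cost (int store_cost, int load_cost,
                               bool needs_more_source_regs, bool fp_mmx_overlap)
{
  int cost = 1 + store_cost + load_cost;
  if (needs_more_source_regs)   /* multiple stores feeding a single load */
    cost += 20;
  if (fp_mmx_overlap)           /* FP and MMX registers overlap; mode switch */
    cost += 20;
  return cost;
}
#endif
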
31146 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31147 MODE. */
31148
31149 bool
31150 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31151 {
31152 /* The flags register can hold only CCmode values, and only it can hold them. */
31153 if (CC_REGNO_P (regno))
31154 return GET_MODE_CLASS (mode) == MODE_CC;
31155 if (GET_MODE_CLASS (mode) == MODE_CC
31156 || GET_MODE_CLASS (mode) == MODE_RANDOM
31157 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31158 return false;
31159 if (FP_REGNO_P (regno))
31160 return VALID_FP_MODE_P (mode);
31161 if (SSE_REGNO_P (regno))
31162 {
31163 /* We implement the move patterns for all vector modes into and
31164 out of SSE registers, even when no operation instructions
31165 are available. OImode move is available only when AVX is
31166 enabled. */
31167 return ((TARGET_AVX && mode == OImode)
31168 || VALID_AVX256_REG_MODE (mode)
31169 || VALID_SSE_REG_MODE (mode)
31170 || VALID_SSE2_REG_MODE (mode)
31171 || VALID_MMX_REG_MODE (mode)
31172 || VALID_MMX_REG_MODE_3DNOW (mode));
31173 }
31174 if (MMX_REGNO_P (regno))
31175 {
31176 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31177 so if the register is available at all, then we can move data of
31178 the given mode into or out of it. */
31179 return (VALID_MMX_REG_MODE (mode)
31180 || VALID_MMX_REG_MODE_3DNOW (mode));
31181 }
31182
31183 if (mode == QImode)
31184 {
31185 /* Take care with QImode values - they can be in non-QI regs,
31186 but then they do cause partial register stalls. */
31187 if (regno <= BX_REG || TARGET_64BIT)
31188 return true;
31189 if (!TARGET_PARTIAL_REG_STALL)
31190 return true;
31191 return !can_create_pseudo_p ();
31192 }
31193 /* We handle both integer and floats in the general purpose registers. */
31194 else if (VALID_INT_MODE_P (mode))
31195 return true;
31196 else if (VALID_FP_MODE_P (mode))
31197 return true;
31198 else if (VALID_DFP_MODE_P (mode))
31199 return true;
31200 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31201 on to use that value in smaller contexts, this can easily force a
31202 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31203 supporting DImode, allow it. */
31204 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31205 return true;
31206
31207 return false;
31208 }
31209
31210 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31211 tieable integer mode. */
31212
31213 static bool
31214 ix86_tieable_integer_mode_p (enum machine_mode mode)
31215 {
31216 switch (mode)
31217 {
31218 case HImode:
31219 case SImode:
31220 return true;
31221
31222 case QImode:
31223 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31224
31225 case DImode:
31226 return TARGET_64BIT;
31227
31228 default:
31229 return false;
31230 }
31231 }
31232
31233 /* Return true if MODE1 is accessible in a register that can hold MODE2
31234 without copying. That is, all register classes that can hold MODE2
31235 can also hold MODE1. */
31236
31237 bool
31238 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31239 {
31240 if (mode1 == mode2)
31241 return true;
31242
31243 if (ix86_tieable_integer_mode_p (mode1)
31244 && ix86_tieable_integer_mode_p (mode2))
31245 return true;
31246
31247 /* MODE2 being XFmode implies fp stack or general regs, which means we
31248 can tie any smaller floating point modes to it. Note that we do not
31249 tie this with TFmode. */
31250 if (mode2 == XFmode)
31251 return mode1 == SFmode || mode1 == DFmode;
31252
31253 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31254 that we can tie it with SFmode. */
31255 if (mode2 == DFmode)
31256 return mode1 == SFmode;
31257
31258 /* If MODE2 is only appropriate for an SSE register, then tie with
31259 any other mode acceptable to SSE registers. */
31260 if (GET_MODE_SIZE (mode2) == 16
31261 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31262 return (GET_MODE_SIZE (mode1) == 16
31263 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31264
31265 /* If MODE2 is appropriate for an MMX register, then tie
31266 with any other mode acceptable to MMX registers. */
31267 if (GET_MODE_SIZE (mode2) == 8
31268 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31269 return (GET_MODE_SIZE (mode1) == 8
31270 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31271
31272 return false;
31273 }
31274
31275 /* Compute a (partial) cost for rtx X. Return true if the complete
31276 cost has been computed, and false if subexpressions should be
31277 scanned. In either case, *TOTAL contains the cost result. */
31278
31279 static bool
31280 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31281 bool speed)
31282 {
31283 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31284 enum machine_mode mode = GET_MODE (x);
31285 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31286
31287 switch (code)
31288 {
31289 case CONST_INT:
31290 case CONST:
31291 case LABEL_REF:
31292 case SYMBOL_REF:
31293 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31294 *total = 3;
31295 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31296 *total = 2;
31297 else if (flag_pic && SYMBOLIC_CONST (x)
31298 && (!TARGET_64BIT
31299 || (GET_CODE (x) != LABEL_REF
31300 && (GET_CODE (x) != SYMBOL_REF
31301 || !SYMBOL_REF_LOCAL_P (x)))))
31302 *total = 1;
31303 else
31304 *total = 0;
31305 return true;
31306
31307 case CONST_DOUBLE:
31308 if (mode == VOIDmode)
31309 *total = 0;
31310 else
31311 switch (standard_80387_constant_p (x))
31312 {
31313 case 1: /* 0.0 */
31314 *total = 1;
31315 break;
31316 default: /* Other constants */
31317 *total = 2;
31318 break;
31319 case 0:
31320 case -1:
31321 /* Start with (MEM (SYMBOL_REF)), since that's where
31322 it'll probably end up. Add a penalty for size. */
31323 *total = (COSTS_N_INSNS (1)
31324 + (flag_pic != 0 && !TARGET_64BIT)
31325 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31326 break;
31327 }
31328 return true;
31329
31330 case ZERO_EXTEND:
31331 /* The zero extension is often completely free on x86_64, so make
31332 it as cheap as possible. */
31333 if (TARGET_64BIT && mode == DImode
31334 && GET_MODE (XEXP (x, 0)) == SImode)
31335 *total = 1;
31336 else if (TARGET_ZERO_EXTEND_WITH_AND)
31337 *total = cost->add;
31338 else
31339 *total = cost->movzx;
31340 return false;
31341
31342 case SIGN_EXTEND:
31343 *total = cost->movsx;
31344 return false;
31345
31346 case ASHIFT:
31347 if (CONST_INT_P (XEXP (x, 1))
31348 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31349 {
31350 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31351 if (value == 1)
31352 {
31353 *total = cost->add;
31354 return false;
31355 }
31356 if ((value == 2 || value == 3)
31357 && cost->lea <= cost->shift_const)
31358 {
31359 *total = cost->lea;
31360 return false;
31361 }
31362 }
31363 /* FALLTHRU */
31364
31365 case ROTATE:
31366 case ASHIFTRT:
31367 case LSHIFTRT:
31368 case ROTATERT:
31369 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31370 {
31371 if (CONST_INT_P (XEXP (x, 1)))
31372 {
31373 if (INTVAL (XEXP (x, 1)) > 32)
31374 *total = cost->shift_const + COSTS_N_INSNS (2);
31375 else
31376 *total = cost->shift_const * 2;
31377 }
31378 else
31379 {
31380 if (GET_CODE (XEXP (x, 1)) == AND)
31381 *total = cost->shift_var * 2;
31382 else
31383 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31384 }
31385 }
31386 else
31387 {
31388 if (CONST_INT_P (XEXP (x, 1)))
31389 *total = cost->shift_const;
31390 else
31391 *total = cost->shift_var;
31392 }
31393 return false;
31394
31395 case FMA:
31396 {
31397 rtx sub;
31398
31399 gcc_assert (FLOAT_MODE_P (mode));
31400 gcc_assert (TARGET_FMA || TARGET_FMA4);
31401
31402 /* ??? SSE scalar/vector cost should be used here. */
31403 /* ??? Bald assumption that fma has the same cost as fmul. */
31404 *total = cost->fmul;
31405 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31406
31407 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31408 sub = XEXP (x, 0);
31409 if (GET_CODE (sub) == NEG)
31410 sub = XEXP (sub, 0);
31411 *total += rtx_cost (sub, FMA, 0, speed);
31412
31413 sub = XEXP (x, 2);
31414 if (GET_CODE (sub) == NEG)
31415 sub = XEXP (sub, 0);
31416 *total += rtx_cost (sub, FMA, 2, speed);
31417 return true;
31418 }
31419
31420 case MULT:
31421 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31422 {
31423 /* ??? SSE scalar cost should be used here. */
31424 *total = cost->fmul;
31425 return false;
31426 }
31427 else if (X87_FLOAT_MODE_P (mode))
31428 {
31429 *total = cost->fmul;
31430 return false;
31431 }
31432 else if (FLOAT_MODE_P (mode))
31433 {
31434 /* ??? SSE vector cost should be used here. */
31435 *total = cost->fmul;
31436 return false;
31437 }
31438 else
31439 {
31440 rtx op0 = XEXP (x, 0);
31441 rtx op1 = XEXP (x, 1);
31442 int nbits;
31443 if (CONST_INT_P (XEXP (x, 1)))
31444 {
31445 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31446 for (nbits = 0; value != 0; value &= value - 1)
31447 nbits++;
31448 }
31449 else
31450 /* This is arbitrary. */
31451 nbits = 7;
31452
31453 /* Compute costs correctly for widening multiplication. */
31454 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31455 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31456 == GET_MODE_SIZE (mode))
31457 {
31458 int is_mulwiden = 0;
31459 enum machine_mode inner_mode = GET_MODE (op0);
31460
31461 if (GET_CODE (op0) == GET_CODE (op1))
31462 is_mulwiden = 1, op1 = XEXP (op1, 0);
31463 else if (CONST_INT_P (op1))
31464 {
31465 if (GET_CODE (op0) == SIGN_EXTEND)
31466 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31467 == INTVAL (op1);
31468 else
31469 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31470 }
31471
31472 if (is_mulwiden)
31473 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31474 }
31475
31476 *total = (cost->mult_init[MODE_INDEX (mode)]
31477 + nbits * cost->mult_bit
31478 + rtx_cost (op0, outer_code, opno, speed)
31479 + rtx_cost (op1, outer_code, opno, speed));
31480
31481 return true;
31482 }
31483
31484 case DIV:
31485 case UDIV:
31486 case MOD:
31487 case UMOD:
31488 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31489 /* ??? SSE cost should be used here. */
31490 *total = cost->fdiv;
31491 else if (X87_FLOAT_MODE_P (mode))
31492 *total = cost->fdiv;
31493 else if (FLOAT_MODE_P (mode))
31494 /* ??? SSE vector cost should be used here. */
31495 *total = cost->fdiv;
31496 else
31497 *total = cost->divide[MODE_INDEX (mode)];
31498 return false;
31499
31500 case PLUS:
31501 if (GET_MODE_CLASS (mode) == MODE_INT
31502 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31503 {
31504 if (GET_CODE (XEXP (x, 0)) == PLUS
31505 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31506 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31507 && CONSTANT_P (XEXP (x, 1)))
31508 {
31509 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31510 if (val == 2 || val == 4 || val == 8)
31511 {
31512 *total = cost->lea;
31513 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31514 outer_code, opno, speed);
31515 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31516 outer_code, opno, speed);
31517 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31518 return true;
31519 }
31520 }
31521 else if (GET_CODE (XEXP (x, 0)) == MULT
31522 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31523 {
31524 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31525 if (val == 2 || val == 4 || val == 8)
31526 {
31527 *total = cost->lea;
31528 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31529 outer_code, opno, speed);
31530 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31531 return true;
31532 }
31533 }
31534 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31535 {
31536 *total = cost->lea;
31537 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31538 outer_code, opno, speed);
31539 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31540 outer_code, opno, speed);
31541 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31542 return true;
31543 }
31544 }
31545 /* FALLTHRU */
31546
31547 case MINUS:
31548 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31549 {
31550 /* ??? SSE cost should be used here. */
31551 *total = cost->fadd;
31552 return false;
31553 }
31554 else if (X87_FLOAT_MODE_P (mode))
31555 {
31556 *total = cost->fadd;
31557 return false;
31558 }
31559 else if (FLOAT_MODE_P (mode))
31560 {
31561 /* ??? SSE vector cost should be used here. */
31562 *total = cost->fadd;
31563 return false;
31564 }
31565 /* FALLTHRU */
31566
31567 case AND:
31568 case IOR:
31569 case XOR:
31570 if (!TARGET_64BIT && mode == DImode)
31571 {
31572 *total = (cost->add * 2
31573 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31574 << (GET_MODE (XEXP (x, 0)) != DImode))
31575 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31576 << (GET_MODE (XEXP (x, 1)) != DImode)));
31577 return true;
31578 }
31579 /* FALLTHRU */
31580
31581 case NEG:
31582 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31583 {
31584 /* ??? SSE cost should be used here. */
31585 *total = cost->fchs;
31586 return false;
31587 }
31588 else if (X87_FLOAT_MODE_P (mode))
31589 {
31590 *total = cost->fchs;
31591 return false;
31592 }
31593 else if (FLOAT_MODE_P (mode))
31594 {
31595 /* ??? SSE vector cost should be used here. */
31596 *total = cost->fchs;
31597 return false;
31598 }
31599 /* FALLTHRU */
31600
31601 case NOT:
31602 if (!TARGET_64BIT && mode == DImode)
31603 *total = cost->add * 2;
31604 else
31605 *total = cost->add;
31606 return false;
31607
31608 case COMPARE:
31609 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31610 && XEXP (XEXP (x, 0), 1) == const1_rtx
31611 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31612 && XEXP (x, 1) == const0_rtx)
31613 {
31614 /* This kind of construct is implemented using test[bwl].
31615 Treat it as if we had an AND. */
31616 *total = (cost->add
31617 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31618 + rtx_cost (const1_rtx, outer_code, opno, speed));
31619 return true;
31620 }
31621 return false;
31622
31623 case FLOAT_EXTEND:
31624 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31625 *total = 0;
31626 return false;
31627
31628 case ABS:
31629 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31630 /* ??? SSE cost should be used here. */
31631 *total = cost->fabs;
31632 else if (X87_FLOAT_MODE_P (mode))
31633 *total = cost->fabs;
31634 else if (FLOAT_MODE_P (mode))
31635 /* ??? SSE vector cost should be used here. */
31636 *total = cost->fabs;
31637 return false;
31638
31639 case SQRT:
31640 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31641 /* ??? SSE cost should be used here. */
31642 *total = cost->fsqrt;
31643 else if (X87_FLOAT_MODE_P (mode))
31644 *total = cost->fsqrt;
31645 else if (FLOAT_MODE_P (mode))
31646 /* ??? SSE vector cost should be used here. */
31647 *total = cost->fsqrt;
31648 return false;
31649
31650 case UNSPEC:
31651 if (XINT (x, 1) == UNSPEC_TP)
31652 *total = 0;
31653 return false;
31654
31655 case VEC_SELECT:
31656 case VEC_CONCAT:
31657 case VEC_MERGE:
31658 case VEC_DUPLICATE:
31659 /* ??? Assume all of these vector manipulation patterns are
31660 recognizable. In which case they all pretty much have the
31661 same cost. */
31662 *total = COSTS_N_INSNS (1);
31663 return true;
31664
31665 default:
31666 return false;
31667 }
31668 }
31669
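/* Editorial sketch (not part of the build): the constant-multiply cost in the
   MULT case above counts the set bits of the constant with the classic
   clear-lowest-bit loop.  Hypothetical helper, illustration only.  */
#if 0
static int
example_set_bit_count (unsigned long value)
{
  int nbits = 0;
  while (value != 0)
    {
      value &= value - 1;       /* clear the lowest set bit */
      nbits++;
    }
  return nbits;
}
/* E.g. example_set_bit_count (10) == 2, so a multiply by 10 is charged
   mult_init[MODE_INDEX (mode)] + 2 * mult_bit plus the operand costs.  */
#endif
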
31670 #if TARGET_MACHO
31671
31672 static int current_machopic_label_num;
31673
31674 /* Given a symbol name and its associated stub, write out the
31675 definition of the stub. */
31676
31677 void
31678 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31679 {
31680 unsigned int length;
31681 char *binder_name, *symbol_name, lazy_ptr_name[32];
31682 int label = ++current_machopic_label_num;
31683
31684 /* For 64-bit we shouldn't get here. */
31685 gcc_assert (!TARGET_64BIT);
31686
31687 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31688 symb = targetm.strip_name_encoding (symb);
31689
31690 length = strlen (stub);
31691 binder_name = XALLOCAVEC (char, length + 32);
31692 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31693
31694 length = strlen (symb);
31695 symbol_name = XALLOCAVEC (char, length + 32);
31696 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31697
31698 sprintf (lazy_ptr_name, "L%d$lz", label);
31699
31700 if (MACHOPIC_ATT_STUB)
31701 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31702 else if (MACHOPIC_PURE)
31703 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31704 else
31705 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31706
31707 fprintf (file, "%s:\n", stub);
31708 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31709
31710 if (MACHOPIC_ATT_STUB)
31711 {
31712 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31713 }
31714 else if (MACHOPIC_PURE)
31715 {
31716 /* PIC stub. */
31717 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31718 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31719 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31720 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31721 label, lazy_ptr_name, label);
31722 fprintf (file, "\tjmp\t*%%ecx\n");
31723 }
31724 else
31725 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31726
31727 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31728 it needs no stub-binding-helper. */
31729 if (MACHOPIC_ATT_STUB)
31730 return;
31731
31732 fprintf (file, "%s:\n", binder_name);
31733
31734 if (MACHOPIC_PURE)
31735 {
31736 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31737 fprintf (file, "\tpushl\t%%ecx\n");
31738 }
31739 else
31740 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31741
31742 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31743
31744 /* N.B. Keep the correspondence of these
31745 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31746 old-pic/new-pic/non-pic stubs; altering this will break
31747 compatibility with existing dylibs. */
31748 if (MACHOPIC_PURE)
31749 {
31750 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31751 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31752 }
31753 else
31754 /* 16-byte -mdynamic-no-pic stub. */
31755 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31756
31757 fprintf (file, "%s:\n", lazy_ptr_name);
31758 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31759 fprintf (file, ASM_LONG "%s\n", binder_name);
31760 }
31761 #endif /* TARGET_MACHO */
31762
31763 /* Order the registers for register allocator. */
31764
31765 void
31766 x86_order_regs_for_local_alloc (void)
31767 {
31768 int pos = 0;
31769 int i;
31770
31771 /* First allocate the local general purpose registers. */
31772 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31773 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31774 reg_alloc_order [pos++] = i;
31775
31776 /* Global general purpose registers. */
31777 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31778 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31779 reg_alloc_order [pos++] = i;
31780
31781 /* x87 registers come first in case we are doing FP math
31782 using them. */
31783 if (!TARGET_SSE_MATH)
31784 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31785 reg_alloc_order [pos++] = i;
31786
31787 /* SSE registers. */
31788 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31789 reg_alloc_order [pos++] = i;
31790 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31791 reg_alloc_order [pos++] = i;
31792
31793 /* x87 registers. */
31794 if (TARGET_SSE_MATH)
31795 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31796 reg_alloc_order [pos++] = i;
31797
31798 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31799 reg_alloc_order [pos++] = i;
31800
31801 /* Initialize the rest of the array, as we do not allocate some registers
31802 at all. */
31803 while (pos < FIRST_PSEUDO_REGISTER)
31804 reg_alloc_order [pos++] = 0;
31805 }
31806
31807 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31808 in struct attribute_spec handler. */
31809 static tree
31810 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31811 tree args,
31812 int flags ATTRIBUTE_UNUSED,
31813 bool *no_add_attrs)
31814 {
31815 if (TREE_CODE (*node) != FUNCTION_TYPE
31816 && TREE_CODE (*node) != METHOD_TYPE
31817 && TREE_CODE (*node) != FIELD_DECL
31818 && TREE_CODE (*node) != TYPE_DECL)
31819 {
31820 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31821 name);
31822 *no_add_attrs = true;
31823 return NULL_TREE;
31824 }
31825 if (TARGET_64BIT)
31826 {
31827 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31828 name);
31829 *no_add_attrs = true;
31830 return NULL_TREE;
31831 }
31832 if (is_attribute_p ("callee_pop_aggregate_return", name))
31833 {
31834 tree cst;
31835
31836 cst = TREE_VALUE (args);
31837 if (TREE_CODE (cst) != INTEGER_CST)
31838 {
31839 warning (OPT_Wattributes,
31840 "%qE attribute requires an integer constant argument",
31841 name);
31842 *no_add_attrs = true;
31843 }
31844 else if (compare_tree_int (cst, 0) != 0
31845 && compare_tree_int (cst, 1) != 0)
31846 {
31847 warning (OPT_Wattributes,
31848 "argument to %qE attribute is neither zero, nor one",
31849 name);
31850 *no_add_attrs = true;
31851 }
31852
31853 return NULL_TREE;
31854 }
31855
31856 return NULL_TREE;
31857 }
31858
31859 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
31860 struct attribute_spec.handler. */
31861 static tree
31862 ix86_handle_abi_attribute (tree *node, tree name,
31863 tree args ATTRIBUTE_UNUSED,
31864 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31865 {
31866 if (TREE_CODE (*node) != FUNCTION_TYPE
31867 && TREE_CODE (*node) != METHOD_TYPE
31868 && TREE_CODE (*node) != FIELD_DECL
31869 && TREE_CODE (*node) != TYPE_DECL)
31870 {
31871 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31872 name);
31873 *no_add_attrs = true;
31874 return NULL_TREE;
31875 }
31876
31877 /* Can combine regparm with all attributes but fastcall. */
31878 if (is_attribute_p ("ms_abi", name))
31879 {
31880 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31881 {
31882 error ("ms_abi and sysv_abi attributes are not compatible");
31883 }
31884
31885 return NULL_TREE;
31886 }
31887 else if (is_attribute_p ("sysv_abi", name))
31888 {
31889 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31890 {
31891 error ("ms_abi and sysv_abi attributes are not compatible");
31892 }
31893
31894 return NULL_TREE;
31895 }
31896
31897 return NULL_TREE;
31898 }
31899
31900 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31901 struct attribute_spec.handler. */
31902 static tree
31903 ix86_handle_struct_attribute (tree *node, tree name,
31904 tree args ATTRIBUTE_UNUSED,
31905 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31906 {
31907 tree *type = NULL;
31908 if (DECL_P (*node))
31909 {
31910 if (TREE_CODE (*node) == TYPE_DECL)
31911 type = &TREE_TYPE (*node);
31912 }
31913 else
31914 type = node;
31915
31916 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31917 || TREE_CODE (*type) == UNION_TYPE)))
31918 {
31919 warning (OPT_Wattributes, "%qE attribute ignored",
31920 name);
31921 *no_add_attrs = true;
31922 }
31923
31924 else if ((is_attribute_p ("ms_struct", name)
31925 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31926 || ((is_attribute_p ("gcc_struct", name)
31927 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31928 {
31929 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31930 name);
31931 *no_add_attrs = true;
31932 }
31933
31934 return NULL_TREE;
31935 }
31936
31937 static tree
31938 ix86_handle_fndecl_attribute (tree *node, tree name,
31939 tree args ATTRIBUTE_UNUSED,
31940 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31941 {
31942 if (TREE_CODE (*node) != FUNCTION_DECL)
31943 {
31944 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31945 name);
31946 *no_add_attrs = true;
31947 }
31948 return NULL_TREE;
31949 }
31950
31951 static bool
31952 ix86_ms_bitfield_layout_p (const_tree record_type)
31953 {
31954 return ((TARGET_MS_BITFIELD_LAYOUT
31955 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31956 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31957 }
31958
31959 /* Return an expression indicating where the this parameter is
31960 located on entry to FUNCTION. */
31961
31962 static rtx
31963 x86_this_parameter (tree function)
31964 {
31965 tree type = TREE_TYPE (function);
31966 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31967 int nregs;
31968
31969 if (TARGET_64BIT)
31970 {
31971 const int *parm_regs;
31972
31973 if (ix86_function_type_abi (type) == MS_ABI)
31974 parm_regs = x86_64_ms_abi_int_parameter_registers;
31975 else
31976 parm_regs = x86_64_int_parameter_registers;
31977 return gen_rtx_REG (Pmode, parm_regs[aggr]);
31978 }
31979
31980 nregs = ix86_function_regparm (type, function);
31981
31982 if (nregs > 0 && !stdarg_p (type))
31983 {
31984 int regno;
31985 unsigned int ccvt = ix86_get_callcvt (type);
31986
31987 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31988 regno = aggr ? DX_REG : CX_REG;
31989 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31990 {
31991 regno = CX_REG;
31992 if (aggr)
31993 return gen_rtx_MEM (SImode,
31994 plus_constant (stack_pointer_rtx, 4));
31995 }
31996 else
31997 {
31998 regno = AX_REG;
31999 if (aggr)
32000 {
32001 regno = DX_REG;
32002 if (nregs == 1)
32003 return gen_rtx_MEM (SImode,
32004 plus_constant (stack_pointer_rtx, 4));
32005 }
32006 }
32007 return gen_rtx_REG (SImode, regno);
32008 }
32009
32010 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32011 }
32012
32013 /* Determine whether x86_output_mi_thunk can succeed. */
32014
32015 static bool
32016 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32017 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32018 HOST_WIDE_INT vcall_offset, const_tree function)
32019 {
32020 /* 64-bit can handle anything. */
32021 if (TARGET_64BIT)
32022 return true;
32023
32024 /* For 32-bit, everything's fine if we have one free register. */
32025 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32026 return true;
32027
32028 /* Need a free register for vcall_offset. */
32029 if (vcall_offset)
32030 return false;
32031
32032 /* Need a free register for GOT references. */
32033 if (flag_pic && !targetm.binds_local_p (function))
32034 return false;
32035
32036 /* Otherwise ok. */
32037 return true;
32038 }
32039
32040 /* Output the assembler code for a thunk function. THUNK_DECL is the
32041 declaration for the thunk function itself, FUNCTION is the decl for
32042 the target function. DELTA is an immediate constant offset to be
32043 added to THIS. If VCALL_OFFSET is nonzero, the word at
32044 *(*this + vcall_offset) should be added to THIS. */
32045
32046 static void
32047 x86_output_mi_thunk (FILE *file,
32048 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32049 HOST_WIDE_INT vcall_offset, tree function)
32050 {
32051 rtx this_param = x86_this_parameter (function);
32052 rtx this_reg, tmp, fnaddr;
32053
32054 emit_note (NOTE_INSN_PROLOGUE_END);
32055
32056 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32057 pull it in now and let DELTA benefit. */
32058 if (REG_P (this_param))
32059 this_reg = this_param;
32060 else if (vcall_offset)
32061 {
32062 /* Put the this parameter into %eax. */
32063 this_reg = gen_rtx_REG (Pmode, AX_REG);
32064 emit_move_insn (this_reg, this_param);
32065 }
32066 else
32067 this_reg = NULL_RTX;
32068
32069 /* Adjust the this parameter by a fixed constant. */
32070 if (delta)
32071 {
32072 rtx delta_rtx = GEN_INT (delta);
32073 rtx delta_dst = this_reg ? this_reg : this_param;
32074
32075 if (TARGET_64BIT)
32076 {
32077 if (!x86_64_general_operand (delta_rtx, Pmode))
32078 {
32079 tmp = gen_rtx_REG (Pmode, R10_REG);
32080 emit_move_insn (tmp, delta_rtx);
32081 delta_rtx = tmp;
32082 }
32083 }
32084
32085 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32086 }
32087
32088 /* Adjust the this parameter by a value stored in the vtable. */
32089 if (vcall_offset)
32090 {
32091 rtx vcall_addr, vcall_mem, this_mem;
32092 unsigned int tmp_regno;
32093
32094 if (TARGET_64BIT)
32095 tmp_regno = R10_REG;
32096 else
32097 {
32098 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32099 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32100 tmp_regno = AX_REG;
32101 else
32102 tmp_regno = CX_REG;
32103 }
32104 tmp = gen_rtx_REG (Pmode, tmp_regno);
32105
32106 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32107 if (Pmode != ptr_mode)
32108 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32109 emit_move_insn (tmp, this_mem);
32110
32111 /* Adjust the this parameter. */
32112 vcall_addr = plus_constant (tmp, vcall_offset);
32113 if (TARGET_64BIT
32114 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32115 {
32116 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32117 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32118 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32119 }
32120
32121 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32122 if (Pmode != ptr_mode)
32123 emit_insn (gen_addsi_1_zext (this_reg,
32124 gen_rtx_REG (ptr_mode,
32125 REGNO (this_reg)),
32126 vcall_mem));
32127 else
32128 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32129 }
32130
32131 /* If necessary, drop THIS back to its stack slot. */
32132 if (this_reg && this_reg != this_param)
32133 emit_move_insn (this_param, this_reg);
32134
32135 fnaddr = XEXP (DECL_RTL (function), 0);
32136 if (TARGET_64BIT)
32137 {
32138 if (!flag_pic || targetm.binds_local_p (function)
32139 || cfun->machine->call_abi == MS_ABI)
32140 ;
32141 else
32142 {
32143 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32144 tmp = gen_rtx_CONST (Pmode, tmp);
32145 fnaddr = gen_rtx_MEM (Pmode, tmp);
32146 }
32147 }
32148 else
32149 {
32150 if (!flag_pic || targetm.binds_local_p (function))
32151 ;
32152 #if TARGET_MACHO
32153 else if (TARGET_MACHO)
32154 {
32155 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32156 fnaddr = XEXP (fnaddr, 0);
32157 }
32158 #endif /* TARGET_MACHO */
32159 else
32160 {
32161 tmp = gen_rtx_REG (Pmode, CX_REG);
32162 output_set_got (tmp, NULL_RTX);
32163
32164 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32165 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32166 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32167 }
32168 }
32169
32170 /* Our sibling call patterns do not allow memories, because we have no
32171 predicate that can distinguish between frame and non-frame memory.
32172 For our purposes here, we can get away with (ab)using a jump pattern,
32173 because we're going to do no optimization. */
32174 if (MEM_P (fnaddr))
32175 emit_jump_insn (gen_indirect_jump (fnaddr));
32176 else
32177 {
32178 tmp = gen_rtx_MEM (QImode, fnaddr);
32179 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32180 tmp = emit_call_insn (tmp);
32181 SIBLING_CALL_P (tmp) = 1;
32182 }
32183 emit_barrier ();
32184
32185 /* Emit just enough of rest_of_compilation to get the insns emitted.
32186 Note that use_thunk calls assemble_start_function et al. */
32187 tmp = get_insns ();
32188 insn_locators_alloc ();
32189 shorten_branches (tmp);
32190 final_start_function (tmp, file, 1);
32191 final (tmp, file, 1);
32192 final_end_function ();
32193 }
32194
32195 static void
32196 x86_file_start (void)
32197 {
32198 default_file_start ();
32199 #if TARGET_MACHO
32200 darwin_file_start ();
32201 #endif
32202 if (X86_FILE_START_VERSION_DIRECTIVE)
32203 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32204 if (X86_FILE_START_FLTUSED)
32205 fputs ("\t.global\t__fltused\n", asm_out_file);
32206 if (ix86_asm_dialect == ASM_INTEL)
32207 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32208 }
32209
32210 int
32211 x86_field_alignment (tree field, int computed)
32212 {
32213 enum machine_mode mode;
32214 tree type = TREE_TYPE (field);
32215
32216 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32217 return computed;
32218 mode = TYPE_MODE (strip_array_types (type));
32219 if (mode == DFmode || mode == DCmode
32220 || GET_MODE_CLASS (mode) == MODE_INT
32221 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32222 return MIN (32, computed);
32223 return computed;
32224 }
32225
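/* Editorial sketch (not part of the build): x86_field_alignment above caps
   the alignment of double and integer-class fields at 32 bits on 32-bit
   targets unless -malign-double is given.  Hypothetical helper.  */
#if 0
static int
example_ia32_field_alignment (int computed_bits, bool caps_at_32)
{
  return (caps_at_32 && computed_bits > 32) ? 32 : computed_bits;
}
/* E.g. a double field computed at 64-bit alignment is laid out with 32-bit
   alignment on ia32 by default.  */
#endif
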
32226 /* Output assembler code to FILE to increment profiler label # LABELNO
32227 for profiling a function entry. */
32228 void
32229 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32230 {
32231 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32232 : MCOUNT_NAME);
32233
32234 if (TARGET_64BIT)
32235 {
32236 #ifndef NO_PROFILE_COUNTERS
32237 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32238 #endif
32239
32240 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32241 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32242 else
32243 fprintf (file, "\tcall\t%s\n", mcount_name);
32244 }
32245 else if (flag_pic)
32246 {
32247 #ifndef NO_PROFILE_COUNTERS
32248 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32249 LPREFIX, labelno);
32250 #endif
32251 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32252 }
32253 else
32254 {
32255 #ifndef NO_PROFILE_COUNTERS
32256 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32257 LPREFIX, labelno);
32258 #endif
32259 fprintf (file, "\tcall\t%s\n", mcount_name);
32260 }
32261 }
32262
32263 /* We don't have exact information about the insn sizes, but we may assume
32264 quite safely that we are informed about all 1 byte insns and memory
32265 address sizes. This is enough to eliminate unnecessary padding in
32266 99% of cases. */
32267
32268 static int
32269 min_insn_size (rtx insn)
32270 {
32271 int l = 0, len;
32272
32273 if (!INSN_P (insn) || !active_insn_p (insn))
32274 return 0;
32275
32276 /* Discard the alignments we have emitted and jump table data. */
32277 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32278 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32279 return 0;
32280 if (JUMP_TABLE_DATA_P (insn))
32281 return 0;
32282
32283 /* Important case - calls are always 5 bytes.
32284 It is common to have many calls in a row. */
32285 if (CALL_P (insn)
32286 && symbolic_reference_mentioned_p (PATTERN (insn))
32287 && !SIBLING_CALL_P (insn))
32288 return 5;
32289 len = get_attr_length (insn);
32290 if (len <= 1)
32291 return 1;
32292
32293 /* For normal instructions we rely on get_attr_length being exact,
32294 with a few exceptions. */
32295 if (!JUMP_P (insn))
32296 {
32297 enum attr_type type = get_attr_type (insn);
32298
32299 switch (type)
32300 {
32301 case TYPE_MULTI:
32302 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32303 || asm_noperands (PATTERN (insn)) >= 0)
32304 return 0;
32305 break;
32306 case TYPE_OTHER:
32307 case TYPE_FCMP:
32308 break;
32309 default:
32310 /* Otherwise trust get_attr_length. */
32311 return len;
32312 }
32313
32314 l = get_attr_length_address (insn);
32315 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32316 l = 4;
32317 }
32318 if (l)
32319 return 1+l;
32320 else
32321 return 2;
32322 }
32323
32324 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32325
32326 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
32327 16 byte window. */
32328
32329 static void
32330 ix86_avoid_jump_mispredicts (void)
32331 {
32332 rtx insn, start = get_insns ();
32333 int nbytes = 0, njumps = 0;
32334 int isjump = 0;
32335
32336 /* Look for all minimal intervals of instructions containing 4 jumps.
32337 The intervals are bounded by START and INSN. NBYTES is the total
32338 size of instructions in the interval including INSN and not including
32339 START. When the NBYTES is smaller than 16 bytes, it is possible
32340 that the end of START and INSN ends up in the same 16byte page.
32341
32342 The smallest offset in the page INSN can start is the case where START
32343 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
32344 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
32345 */
32346 for (insn = start; insn; insn = NEXT_INSN (insn))
32347 {
32348 int min_size;
32349
32350 if (LABEL_P (insn))
32351 {
32352 int align = label_to_alignment (insn);
32353 int max_skip = label_to_max_skip (insn);
32354
32355 if (max_skip > 15)
32356 max_skip = 15;
32357 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32358 already in the current 16 byte page, because otherwise
32359 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32360 bytes to reach 16 byte boundary. */
32361 if (align <= 0
32362 || (align <= 3 && max_skip != (1 << align) - 1))
32363 max_skip = 0;
32364 if (dump_file)
32365 fprintf (dump_file, "Label %i with max_skip %i\n",
32366 INSN_UID (insn), max_skip);
32367 if (max_skip)
32368 {
32369 while (nbytes + max_skip >= 16)
32370 {
32371 start = NEXT_INSN (start);
32372 if ((JUMP_P (start)
32373 && GET_CODE (PATTERN (start)) != ADDR_VEC
32374 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32375 || CALL_P (start))
32376 njumps--, isjump = 1;
32377 else
32378 isjump = 0;
32379 nbytes -= min_insn_size (start);
32380 }
32381 }
32382 continue;
32383 }
32384
32385 min_size = min_insn_size (insn);
32386 nbytes += min_size;
32387 if (dump_file)
32388 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32389 INSN_UID (insn), min_size);
32390 if ((JUMP_P (insn)
32391 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32392 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32393 || CALL_P (insn))
32394 njumps++;
32395 else
32396 continue;
32397
32398 while (njumps > 3)
32399 {
32400 start = NEXT_INSN (start);
32401 if ((JUMP_P (start)
32402 && GET_CODE (PATTERN (start)) != ADDR_VEC
32403 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32404 || CALL_P (start))
32405 njumps--, isjump = 1;
32406 else
32407 isjump = 0;
32408 nbytes -= min_insn_size (start);
32409 }
32410 gcc_assert (njumps >= 0);
32411 if (dump_file)
32412 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32413 INSN_UID (start), INSN_UID (insn), nbytes);
32414
32415 if (njumps == 3 && isjump && nbytes < 16)
32416 {
32417 int padsize = 15 - nbytes + min_insn_size (insn);
32418
32419 if (dump_file)
32420 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32421 INSN_UID (insn), padsize);
32422 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32423 }
32424 }
32425 }
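
/* Editorial sketch (not part of the build): the padding amount emitted above
   pushes the fourth branch out of the current 16 byte window.  NBYTES and the
   size of the last insn come from the loop above; the helper is hypothetical.  */
#if 0
static int
example_pad_amount (int nbytes, int size_of_last_insn)
{
  /* After padding, the last branch starts in the next 16 byte window, so at
     most three of the four branches share one window.  */
  return 15 - nbytes + size_of_last_insn;
}
#endif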
32426 #endif
32427
32428 /* AMD Athlon works faster
32429 when RET is not the destination of a conditional jump or directly preceded
32430 by another jump instruction. We avoid the penalty by inserting a NOP just
32431 before the RET instruction in such cases. */
32432 static void
32433 ix86_pad_returns (void)
32434 {
32435 edge e;
32436 edge_iterator ei;
32437
32438 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32439 {
32440 basic_block bb = e->src;
32441 rtx ret = BB_END (bb);
32442 rtx prev;
32443 bool replace = false;
32444
32445 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32446 || optimize_bb_for_size_p (bb))
32447 continue;
32448 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32449 if (active_insn_p (prev) || LABEL_P (prev))
32450 break;
32451 if (prev && LABEL_P (prev))
32452 {
32453 edge e;
32454 edge_iterator ei;
32455
32456 FOR_EACH_EDGE (e, ei, bb->preds)
32457 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32458 && !(e->flags & EDGE_FALLTHRU))
32459 replace = true;
32460 }
32461 if (!replace)
32462 {
32463 prev = prev_active_insn (ret);
32464 if (prev
32465 && ((JUMP_P (prev) && any_condjump_p (prev))
32466 || CALL_P (prev)))
32467 replace = true;
32468 /* Empty functions get a branch mispredict even when
32469 the jump destination is not visible to us. */
32470 if (!prev && !optimize_function_for_size_p (cfun))
32471 replace = true;
32472 }
32473 if (replace)
32474 {
32475 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32476 delete_insn (ret);
32477 }
32478 }
32479 }
32480
32481 /* Count the minimum number of instructions in BB. Return 4 if the
32482 number of instructions >= 4. */
32483
32484 static int
32485 ix86_count_insn_bb (basic_block bb)
32486 {
32487 rtx insn;
32488 int insn_count = 0;
32489
32490 /* Count number of instructions in this block. Return 4 if the number
32491 of instructions >= 4. */
32492 FOR_BB_INSNS (bb, insn)
32493 {
32494 /* This only happens in exit blocks. */
32495 if (JUMP_P (insn)
32496 && ANY_RETURN_P (PATTERN (insn)))
32497 break;
32498
32499 if (NONDEBUG_INSN_P (insn)
32500 && GET_CODE (PATTERN (insn)) != USE
32501 && GET_CODE (PATTERN (insn)) != CLOBBER)
32502 {
32503 insn_count++;
32504 if (insn_count >= 4)
32505 return insn_count;
32506 }
32507 }
32508
32509 return insn_count;
32510 }
32511
32512
32513 /* Count the minimum number of instructions in code path in BB.
32514 Return 4 if the number of instructions >= 4. */
32515
32516 static int
32517 ix86_count_insn (basic_block bb)
32518 {
32519 edge e;
32520 edge_iterator ei;
32521 int min_prev_count;
32522
32523 /* Only bother counting instructions along paths with no
32524 more than 2 basic blocks between entry and exit. Given
32525 that BB has an edge to exit, determine if a predecessor
32526 of BB has an edge from entry. If so, compute the number
32527 of instructions in the predecessor block. If there
32528 happen to be multiple such blocks, compute the minimum. */
32529 min_prev_count = 4;
32530 FOR_EACH_EDGE (e, ei, bb->preds)
32531 {
32532 edge prev_e;
32533 edge_iterator prev_ei;
32534
32535 if (e->src == ENTRY_BLOCK_PTR)
32536 {
32537 min_prev_count = 0;
32538 break;
32539 }
32540 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32541 {
32542 if (prev_e->src == ENTRY_BLOCK_PTR)
32543 {
32544 int count = ix86_count_insn_bb (e->src);
32545 if (count < min_prev_count)
32546 min_prev_count = count;
32547 break;
32548 }
32549 }
32550 }
32551
32552 if (min_prev_count < 4)
32553 min_prev_count += ix86_count_insn_bb (bb);
32554
32555 return min_prev_count;
32556 }
32557
32558 /* Pad a short function out to 4 instructions. */
32559
32560 static void
32561 ix86_pad_short_function (void)
32562 {
32563 edge e;
32564 edge_iterator ei;
32565
32566 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32567 {
32568 rtx ret = BB_END (e->src);
32569 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32570 {
32571 int insn_count = ix86_count_insn (e->src);
32572
32573 /* Pad short function. */
32574 if (insn_count < 4)
32575 {
32576 rtx insn = ret;
32577
32578 /* Find epilogue. */
32579 while (insn
32580 && (!NOTE_P (insn)
32581 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32582 insn = PREV_INSN (insn);
32583
32584 if (!insn)
32585 insn = ret;
32586
32587 /* Two NOPs count as one instruction. */
32588 insn_count = 2 * (4 - insn_count);
32589 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32590 }
32591 }
32592 }
32593 }
32594
32595 /* Implement machine specific optimizations. We implement padding of returns
32596 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
32597 static void
32598 ix86_reorg (void)
32599 {
32600 /* We are freeing block_for_insn in the toplev to keep compatibility
32601 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32602 compute_bb_for_insn ();
32603
32604 /* Run the vzeroupper optimization if needed. */
32605 if (TARGET_VZEROUPPER)
32606 move_or_delete_vzeroupper ();
32607
32608 if (optimize && optimize_function_for_speed_p (cfun))
32609 {
32610 if (TARGET_PAD_SHORT_FUNCTION)
32611 ix86_pad_short_function ();
32612 else if (TARGET_PAD_RETURNS)
32613 ix86_pad_returns ();
32614 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32615 if (TARGET_FOUR_JUMP_LIMIT)
32616 ix86_avoid_jump_mispredicts ();
32617 #endif
32618 }
32619 }
32620
32621 /* Return nonzero when a QImode register that must be represented via a REX
32622 prefix is used. */
32623 bool
32624 x86_extended_QIreg_mentioned_p (rtx insn)
32625 {
32626 int i;
32627 extract_insn_cached (insn);
32628 for (i = 0; i < recog_data.n_operands; i++)
32629 if (REG_P (recog_data.operand[i])
32630 && REGNO (recog_data.operand[i]) > BX_REG)
32631 return true;
32632 return false;
32633 }
32634
32635 /* Return nonzero when P points to a register encoded via a REX prefix.
32636 Called via for_each_rtx. */
32637 static int
32638 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32639 {
32640 unsigned int regno;
32641 if (!REG_P (*p))
32642 return 0;
32643 regno = REGNO (*p);
32644 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32645 }
32646
32647 /* Return true when INSN mentions a register that must be encoded using a
32648 REX prefix. */
32649 bool
32650 x86_extended_reg_mentioned_p (rtx insn)
32651 {
32652 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32653 extended_reg_mentioned_1, NULL);
32654 }
32655
32656 /* If profitable, negate (without causing overflow) integer constant
32657 of mode MODE at location LOC. Return true in this case. */
32658 bool
32659 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32660 {
32661 HOST_WIDE_INT val;
32662
32663 if (!CONST_INT_P (*loc))
32664 return false;
32665
32666 switch (mode)
32667 {
32668 case DImode:
32669 /* DImode x86_64 constants must fit in 32 bits. */
32670 gcc_assert (x86_64_immediate_operand (*loc, mode));
32671
32672 mode = SImode;
32673 break;
32674
32675 case SImode:
32676 case HImode:
32677 case QImode:
32678 break;
32679
32680 default:
32681 gcc_unreachable ();
32682 }
32683
32684 /* Avoid overflows. */
32685 if (mode_signbit_p (mode, *loc))
32686 return false;
32687
32688 val = INTVAL (*loc);
32689
32690 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
32691 Exception: -128 encodes smaller than 128, so swap sign and operation. */
32692 if ((val < 0 && val != -128)
32693 || val == 128)
32694 {
32695 *loc = GEN_INT (-val);
32696 return true;
32697 }
32698
32699 return false;
32700 }
32701
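/* Editorial sketch (not part of the build): the profitability test above,
   written out as plain C.  Hypothetical helper, illustration only.  */
#if 0
static bool
example_should_negate (long val)
{
  /* Prefer `subl $4,%eax' over `addl $-4,%eax'; but -128 has a shorter
     encoding than 128, so leave -128 alone and do negate 128.  */
  return (val < 0 && val != -128) || val == 128;
}
#endif
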
32702 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32703 optabs would emit if we didn't have TFmode patterns. */
32704
32705 void
32706 x86_emit_floatuns (rtx operands[2])
32707 {
32708 rtx neglab, donelab, i0, i1, f0, in, out;
32709 enum machine_mode mode, inmode;
32710
32711 inmode = GET_MODE (operands[1]);
32712 gcc_assert (inmode == SImode || inmode == DImode);
32713
32714 out = operands[0];
32715 in = force_reg (inmode, operands[1]);
32716 mode = GET_MODE (out);
32717 neglab = gen_label_rtx ();
32718 donelab = gen_label_rtx ();
32719 f0 = gen_reg_rtx (mode);
32720
32721 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32722
32723 expand_float (out, in, 0);
32724
32725 emit_jump_insn (gen_jump (donelab));
32726 emit_barrier ();
32727
32728 emit_label (neglab);
32729
32730 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32731 1, OPTAB_DIRECT);
32732 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32733 1, OPTAB_DIRECT);
32734 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32735
32736 expand_float (f0, i0, 0);
32737
32738 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32739
32740 emit_label (donelab);
32741 }
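
/* Editorial sketch (not part of the build): the sequence emitted above for a
   "negative" (i.e. large unsigned) input, written as plain C.  Assumes a
   64-bit long; the helper is hypothetical, illustration only.  */
#if 0
static double
example_floatuns (unsigned long in)
{
  unsigned long half;
  double f;

  if ((long) in >= 0)
    return (double) (long) in;         /* fast path: signed convert */
  half = (in >> 1) | (in & 1);         /* halve, keeping a sticky bit */
  f = (double) (long) half;
  return f + f;                        /* double it back up */
}
#endif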
32742 \f
32743 /* AVX2 does support 32-byte integer vector operations,
32744 thus the longest vector we are faced with is V32QImode. */
32745 #define MAX_VECT_LEN 32
32746
32747 struct expand_vec_perm_d
32748 {
32749 rtx target, op0, op1;
32750 unsigned char perm[MAX_VECT_LEN];
32751 enum machine_mode vmode;
32752 unsigned char nelt;
32753 bool testing_p;
32754 };
32755
32756 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32757 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32758
32759 /* Get a vector mode of the same size as the original but with elements
32760 twice as wide. This is only guaranteed to apply to integral vectors. */
32761
32762 static inline enum machine_mode
32763 get_mode_wider_vector (enum machine_mode o)
32764 {
32765 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32766 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32767 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32768 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32769 return n;
32770 }
32771
32772 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32773 with all elements equal to VAR. Return true if successful. */
32774
32775 static bool
32776 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32777 rtx target, rtx val)
32778 {
32779 bool ok;
32780
32781 switch (mode)
32782 {
32783 case V2SImode:
32784 case V2SFmode:
32785 if (!mmx_ok)
32786 return false;
32787 /* FALLTHRU */
32788
32789 case V4DFmode:
32790 case V4DImode:
32791 case V8SFmode:
32792 case V8SImode:
32793 case V2DFmode:
32794 case V2DImode:
32795 case V4SFmode:
32796 case V4SImode:
32797 {
32798 rtx insn, dup;
32799
32800 /* First attempt to recognize VAL as-is. */
32801 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32802 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32803 if (recog_memoized (insn) < 0)
32804 {
32805 rtx seq;
32806 /* If that fails, force VAL into a register. */
32807
32808 start_sequence ();
32809 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32810 seq = get_insns ();
32811 end_sequence ();
32812 if (seq)
32813 emit_insn_before (seq, insn);
32814
32815 ok = recog_memoized (insn) >= 0;
32816 gcc_assert (ok);
32817 }
32818 }
32819 return true;
32820
32821 case V4HImode:
32822 if (!mmx_ok)
32823 return false;
32824 if (TARGET_SSE || TARGET_3DNOW_A)
32825 {
32826 rtx x;
32827
32828 val = gen_lowpart (SImode, val);
32829 x = gen_rtx_TRUNCATE (HImode, val);
32830 x = gen_rtx_VEC_DUPLICATE (mode, x);
32831 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32832 return true;
32833 }
32834 goto widen;
32835
32836 case V8QImode:
32837 if (!mmx_ok)
32838 return false;
32839 goto widen;
32840
32841 case V8HImode:
32842 if (TARGET_SSE2)
32843 {
32844 struct expand_vec_perm_d dperm;
32845 rtx tmp1, tmp2;
32846
32847 permute:
32848 memset (&dperm, 0, sizeof (dperm));
32849 dperm.target = target;
32850 dperm.vmode = mode;
32851 dperm.nelt = GET_MODE_NUNITS (mode);
32852 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32853
32854 /* Extend to SImode using a paradoxical SUBREG. */
32855 tmp1 = gen_reg_rtx (SImode);
32856 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32857
32858 /* Insert the SImode value as low element of a V4SImode vector. */
32859 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32860 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32861
32862 ok = (expand_vec_perm_1 (&dperm)
32863 || expand_vec_perm_broadcast_1 (&dperm));
32864 gcc_assert (ok);
32865 return ok;
32866 }
32867 goto widen;
32868
32869 case V16QImode:
32870 if (TARGET_SSE2)
32871 goto permute;
32872 goto widen;
32873
32874 widen:
32875 /* Replicate the value once into the next wider mode and recurse. */
32876 {
32877 enum machine_mode smode, wsmode, wvmode;
32878 rtx x;
32879
32880 smode = GET_MODE_INNER (mode);
32881 wvmode = get_mode_wider_vector (mode);
32882 wsmode = GET_MODE_INNER (wvmode);
32883
32884 val = convert_modes (wsmode, smode, val, true);
32885 x = expand_simple_binop (wsmode, ASHIFT, val,
32886 GEN_INT (GET_MODE_BITSIZE (smode)),
32887 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32888 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32889
32890 x = gen_lowpart (wvmode, target);
32891 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32892 gcc_assert (ok);
32893 return ok;
32894 }
32895
32896 case V16HImode:
32897 case V32QImode:
32898 {
32899 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32900 rtx x = gen_reg_rtx (hvmode);
32901
32902 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32903 gcc_assert (ok);
32904
32905 x = gen_rtx_VEC_CONCAT (mode, x, x);
32906 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32907 }
32908 return true;
32909
32910 default:
32911 return false;
32912 }
32913 }
32914
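/* Editorial sketch (not part of the build): the "widen" step above replicates
   a narrow scalar into the next wider one with a shift and an IOR before
   recursing; here is the QImode -> HImode case in plain C.  Hypothetical
   helper, illustration only.  */
#if 0
static unsigned short
example_replicate_byte (unsigned char v)
{
  return (unsigned short) ((unsigned short) v | ((unsigned short) v << 8));
}
/* E.g. example_replicate_byte (0xab) == 0xabab.  */
#endif
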
32915 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32916 whose ONE_VAR element is VAR, and other elements are zero. Return true
32917 if successful. */
32918
32919 static bool
32920 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32921 rtx target, rtx var, int one_var)
32922 {
32923 enum machine_mode vsimode;
32924 rtx new_target;
32925 rtx x, tmp;
32926 bool use_vector_set = false;
32927
32928 switch (mode)
32929 {
32930 case V2DImode:
32931 /* For SSE4.1, we normally use vector set. But if the second
32932 element is zero and inter-unit moves are OK, we use movq
32933 instead. */
32934 use_vector_set = (TARGET_64BIT
32935 && TARGET_SSE4_1
32936 && !(TARGET_INTER_UNIT_MOVES
32937 && one_var == 0));
32938 break;
32939 case V16QImode:
32940 case V4SImode:
32941 case V4SFmode:
32942 use_vector_set = TARGET_SSE4_1;
32943 break;
32944 case V8HImode:
32945 use_vector_set = TARGET_SSE2;
32946 break;
32947 case V4HImode:
32948 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32949 break;
32950 case V32QImode:
32951 case V16HImode:
32952 case V8SImode:
32953 case V8SFmode:
32954 case V4DFmode:
32955 use_vector_set = TARGET_AVX;
32956 break;
32957 case V4DImode:
32958 /* Use ix86_expand_vector_set in 64bit mode only. */
32959 use_vector_set = TARGET_AVX && TARGET_64BIT;
32960 break;
32961 default:
32962 break;
32963 }
32964
32965 if (use_vector_set)
32966 {
32967 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32968 var = force_reg (GET_MODE_INNER (mode), var);
32969 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32970 return true;
32971 }
32972
32973 switch (mode)
32974 {
32975 case V2SFmode:
32976 case V2SImode:
32977 if (!mmx_ok)
32978 return false;
32979 /* FALLTHRU */
32980
32981 case V2DFmode:
32982 case V2DImode:
32983 if (one_var != 0)
32984 return false;
32985 var = force_reg (GET_MODE_INNER (mode), var);
32986 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32987 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32988 return true;
32989
32990 case V4SFmode:
32991 case V4SImode:
32992 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32993 new_target = gen_reg_rtx (mode);
32994 else
32995 new_target = target;
32996 var = force_reg (GET_MODE_INNER (mode), var);
32997 x = gen_rtx_VEC_DUPLICATE (mode, var);
32998 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32999 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33000 if (one_var != 0)
33001 {
33002 /* We need to shuffle the value to the correct position, so
33003 create a new pseudo to store the intermediate result. */
33004
33005 /* With SSE2, we can use the integer shuffle insns. */
33006 if (mode != V4SFmode && TARGET_SSE2)
33007 {
33008 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33009 const1_rtx,
33010 GEN_INT (one_var == 1 ? 0 : 1),
33011 GEN_INT (one_var == 2 ? 0 : 1),
33012 GEN_INT (one_var == 3 ? 0 : 1)));
33013 if (target != new_target)
33014 emit_move_insn (target, new_target);
33015 return true;
33016 }
33017
33018 /* Otherwise convert the intermediate result to V4SFmode and
33019 use the SSE1 shuffle instructions. */
33020 if (mode != V4SFmode)
33021 {
33022 tmp = gen_reg_rtx (V4SFmode);
33023 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33024 }
33025 else
33026 tmp = new_target;
33027
33028 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33029 const1_rtx,
33030 GEN_INT (one_var == 1 ? 0 : 1),
33031 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33032 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33033
33034 if (mode != V4SFmode)
33035 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33036 else if (tmp != target)
33037 emit_move_insn (target, tmp);
33038 }
33039 else if (target != new_target)
33040 emit_move_insn (target, new_target);
33041 return true;
33042
33043 case V8HImode:
33044 case V16QImode:
33045 vsimode = V4SImode;
33046 goto widen;
33047 case V4HImode:
33048 case V8QImode:
33049 if (!mmx_ok)
33050 return false;
33051 vsimode = V2SImode;
33052 goto widen;
33053 widen:
33054 if (one_var != 0)
33055 return false;
33056
33057 /* Zero extend the variable element to SImode and recurse. */
33058 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33059
33060 x = gen_reg_rtx (vsimode);
33061 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33062 var, one_var))
33063 gcc_unreachable ();
33064
33065 emit_move_insn (target, gen_lowpart (mode, x));
33066 return true;
33067
33068 default:
33069 return false;
33070 }
33071 }
33072
33073 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33074 consisting of the values in VALS. It is known that all elements
33075 except ONE_VAR are constants. Return true if successful. */
33076
33077 static bool
33078 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33079 rtx target, rtx vals, int one_var)
33080 {
33081 rtx var = XVECEXP (vals, 0, one_var);
33082 enum machine_mode wmode;
33083 rtx const_vec, x;
33084
33085 const_vec = copy_rtx (vals);
33086 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33087 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33088
33089 switch (mode)
33090 {
33091 case V2DFmode:
33092 case V2DImode:
33093 case V2SFmode:
33094 case V2SImode:
33095 /* For the two element vectors, it's just as easy to use
33096 the general case. */
33097 return false;
33098
33099 case V4DImode:
33100 /* Use ix86_expand_vector_set in 64bit mode only. */
33101 if (!TARGET_64BIT)
33102 return false;
33103 case V4DFmode:
33104 case V8SFmode:
33105 case V8SImode:
33106 case V16HImode:
33107 case V32QImode:
33108 case V4SFmode:
33109 case V4SImode:
33110 case V8HImode:
33111 case V4HImode:
33112 break;
33113
33114 case V16QImode:
33115 if (TARGET_SSE4_1)
33116 break;
33117 wmode = V8HImode;
33118 goto widen;
33119 case V8QImode:
33120 wmode = V4HImode;
33121 goto widen;
33122 widen:
33123 /* There's no way to set one QImode entry easily. Combine
33124 the variable value with its adjacent constant value, and
33125 promote to an HImode set. */
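/* E.g. (illustrative values): for V8QImode with one_var == 3, the variable
   byte is shifted left by 8, OR-ed with the low byte of the constant at
   index 2, and the combined HImode value is stored into element 1
   (one_var >> 1) of the V4HImode view of the constant vector.  */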
33126 x = XVECEXP (vals, 0, one_var ^ 1);
33127 if (one_var & 1)
33128 {
33129 var = convert_modes (HImode, QImode, var, true);
33130 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33131 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33132 x = GEN_INT (INTVAL (x) & 0xff);
33133 }
33134 else
33135 {
33136 var = convert_modes (HImode, QImode, var, true);
33137 x = gen_int_mode (INTVAL (x) << 8, HImode);
33138 }
33139 if (x != const0_rtx)
33140 var = expand_simple_binop (HImode, IOR, var, x, var,
33141 1, OPTAB_LIB_WIDEN);
33142
33143 x = gen_reg_rtx (wmode);
33144 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33145 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33146
33147 emit_move_insn (target, gen_lowpart (mode, x));
33148 return true;
33149
33150 default:
33151 return false;
33152 }
33153
33154 emit_move_insn (target, const_vec);
33155 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33156 return true;
33157 }
33158
33159 /* A subroutine of ix86_expand_vector_init_general. Use vector
33160 concatenate to handle the most general case: all values variable,
33161 and none identical. */
33162
33163 static void
33164 ix86_expand_vector_init_concat (enum machine_mode mode,
33165 rtx target, rtx *ops, int n)
33166 {
33167 enum machine_mode cmode, hmode = VOIDmode;
33168 rtx first[8], second[4];
33169 rtvec v;
33170 int i, j;
33171
33172 switch (n)
33173 {
33174 case 2:
33175 switch (mode)
33176 {
33177 case V8SImode:
33178 cmode = V4SImode;
33179 break;
33180 case V8SFmode:
33181 cmode = V4SFmode;
33182 break;
33183 case V4DImode:
33184 cmode = V2DImode;
33185 break;
33186 case V4DFmode:
33187 cmode = V2DFmode;
33188 break;
33189 case V4SImode:
33190 cmode = V2SImode;
33191 break;
33192 case V4SFmode:
33193 cmode = V2SFmode;
33194 break;
33195 case V2DImode:
33196 cmode = DImode;
33197 break;
33198 case V2SImode:
33199 cmode = SImode;
33200 break;
33201 case V2DFmode:
33202 cmode = DFmode;
33203 break;
33204 case V2SFmode:
33205 cmode = SFmode;
33206 break;
33207 default:
33208 gcc_unreachable ();
33209 }
33210
33211 if (!register_operand (ops[1], cmode))
33212 ops[1] = force_reg (cmode, ops[1]);
33213 if (!register_operand (ops[0], cmode))
33214 ops[0] = force_reg (cmode, ops[0]);
33215 emit_insn (gen_rtx_SET (VOIDmode, target,
33216 gen_rtx_VEC_CONCAT (mode, ops[0],
33217 ops[1])));
33218 break;
33219
33220 case 4:
33221 switch (mode)
33222 {
33223 case V4DImode:
33224 cmode = V2DImode;
33225 break;
33226 case V4DFmode:
33227 cmode = V2DFmode;
33228 break;
33229 case V4SImode:
33230 cmode = V2SImode;
33231 break;
33232 case V4SFmode:
33233 cmode = V2SFmode;
33234 break;
33235 default:
33236 gcc_unreachable ();
33237 }
33238 goto half;
33239
33240 case 8:
33241 switch (mode)
33242 {
33243 case V8SImode:
33244 cmode = V2SImode;
33245 hmode = V4SImode;
33246 break;
33247 case V8SFmode:
33248 cmode = V2SFmode;
33249 hmode = V4SFmode;
33250 break;
33251 default:
33252 gcc_unreachable ();
33253 }
33254 goto half;
33255
33256 half:
33257 /* FIXME: We process inputs backward to help RA. PR 36222. */
33258 i = n - 1;
33259 j = (n >> 1) - 1;
33260 for (; i > 0; i -= 2, j--)
33261 {
33262 first[j] = gen_reg_rtx (cmode);
33263 v = gen_rtvec (2, ops[i - 1], ops[i]);
33264 ix86_expand_vector_init (false, first[j],
33265 gen_rtx_PARALLEL (cmode, v));
33266 }
33267
33268 n >>= 1;
33269 if (n > 2)
33270 {
33271 gcc_assert (hmode != VOIDmode);
33272 for (i = j = 0; i < n; i += 2, j++)
33273 {
33274 second[j] = gen_reg_rtx (hmode);
33275 ix86_expand_vector_init_concat (hmode, second [j],
33276 &first [i], 2);
33277 }
33278 n >>= 1;
33279 ix86_expand_vector_init_concat (mode, target, second, n);
33280 }
33281 else
33282 ix86_expand_vector_init_concat (mode, target, first, n);
33283 break;
33284
33285 default:
33286 gcc_unreachable ();
33287 }
33288 }
33289
33290 /* A subroutine of ix86_expand_vector_init_general. Use vector
33291 interleave to handle the most general case: all values variable,
33292 and none identical. */
33293
33294 static void
33295 ix86_expand_vector_init_interleave (enum machine_mode mode,
33296 rtx target, rtx *ops, int n)
33297 {
33298 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33299 int i, j;
33300 rtx op0, op1;
33301 rtx (*gen_load_even) (rtx, rtx, rtx);
33302 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33303 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33304
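/* Strategy (as implemented below): build one vector per pair of inputs,
   with ops[2*i] inserted as element 0 via a zero-extended SImode move and
   ops[2*i + 1] inserted as element 1 via gen_load_even, then repeatedly
   interleave the low halves of adjacent vectors at successively wider
   element sizes until a single full-width vector remains in TARGET.  */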
33305 switch (mode)
33306 {
33307 case V8HImode:
33308 gen_load_even = gen_vec_setv8hi;
33309 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33310 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33311 inner_mode = HImode;
33312 first_imode = V4SImode;
33313 second_imode = V2DImode;
33314 third_imode = VOIDmode;
33315 break;
33316 case V16QImode:
33317 gen_load_even = gen_vec_setv16qi;
33318 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33319 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33320 inner_mode = QImode;
33321 first_imode = V8HImode;
33322 second_imode = V4SImode;
33323 third_imode = V2DImode;
33324 break;
33325 default:
33326 gcc_unreachable ();
33327 }
33328
33329 for (i = 0; i < n; i++)
33330 {
33331 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33332 op0 = gen_reg_rtx (SImode);
33333 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33334
33335 /* Insert the SImode value as low element of V4SImode vector. */
33336 op1 = gen_reg_rtx (V4SImode);
33337 op0 = gen_rtx_VEC_MERGE (V4SImode,
33338 gen_rtx_VEC_DUPLICATE (V4SImode,
33339 op0),
33340 CONST0_RTX (V4SImode),
33341 const1_rtx);
33342 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33343
33344 /* Cast the V4SImode vector back to a vector in the original mode. */
33345 op0 = gen_reg_rtx (mode);
33346 emit_move_insn (op0, gen_lowpart (mode, op1));
33347
33348 /* Load even elements into the second position. */
33349 emit_insn (gen_load_even (op0,
33350 force_reg (inner_mode,
33351 ops [i + i + 1]),
33352 const1_rtx));
33353
33354 /* Cast vector to FIRST_IMODE vector. */
33355 ops[i] = gen_reg_rtx (first_imode);
33356 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33357 }
33358
33359 /* Interleave low FIRST_IMODE vectors. */
33360 for (i = j = 0; i < n; i += 2, j++)
33361 {
33362 op0 = gen_reg_rtx (first_imode);
33363 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33364
33365 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33366 ops[j] = gen_reg_rtx (second_imode);
33367 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33368 }
33369
33370 /* Interleave low SECOND_IMODE vectors. */
33371 switch (second_imode)
33372 {
33373 case V4SImode:
33374 for (i = j = 0; i < n / 2; i += 2, j++)
33375 {
33376 op0 = gen_reg_rtx (second_imode);
33377 emit_insn (gen_interleave_second_low (op0, ops[i],
33378 ops[i + 1]));
33379
33380 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33381 vector. */
33382 ops[j] = gen_reg_rtx (third_imode);
33383 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33384 }
33385 second_imode = V2DImode;
33386 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33387 /* FALLTHRU */
33388
33389 case V2DImode:
33390 op0 = gen_reg_rtx (second_imode);
33391 emit_insn (gen_interleave_second_low (op0, ops[0],
33392 ops[1]));
33393
33394 /* Cast the SECOND_IMODE vector back to a vector in the original
33395 mode. */
33396 emit_insn (gen_rtx_SET (VOIDmode, target,
33397 gen_lowpart (mode, op0)));
33398 break;
33399
33400 default:
33401 gcc_unreachable ();
33402 }
33403 }
33404
33405 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33406 all values variable, and none identical. */
33407
33408 static void
33409 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33410 rtx target, rtx vals)
33411 {
33412 rtx ops[32], op0, op1;
33413 enum machine_mode half_mode = VOIDmode;
33414 int n, i;
33415
33416 switch (mode)
33417 {
33418 case V2SFmode:
33419 case V2SImode:
33420 if (!mmx_ok && !TARGET_SSE)
33421 break;
33422 /* FALLTHRU */
33423
33424 case V8SFmode:
33425 case V8SImode:
33426 case V4DFmode:
33427 case V4DImode:
33428 case V4SFmode:
33429 case V4SImode:
33430 case V2DFmode:
33431 case V2DImode:
33432 n = GET_MODE_NUNITS (mode);
33433 for (i = 0; i < n; i++)
33434 ops[i] = XVECEXP (vals, 0, i);
33435 ix86_expand_vector_init_concat (mode, target, ops, n);
33436 return;
33437
33438 case V32QImode:
33439 half_mode = V16QImode;
33440 goto half;
33441
33442 case V16HImode:
33443 half_mode = V8HImode;
33444 goto half;
33445
33446 half:
33447 n = GET_MODE_NUNITS (mode);
33448 for (i = 0; i < n; i++)
33449 ops[i] = XVECEXP (vals, 0, i);
33450 op0 = gen_reg_rtx (half_mode);
33451 op1 = gen_reg_rtx (half_mode);
33452 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33453 n >> 2);
33454 ix86_expand_vector_init_interleave (half_mode, op1,
33455 &ops [n >> 1], n >> 2);
33456 emit_insn (gen_rtx_SET (VOIDmode, target,
33457 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33458 return;
33459
33460 case V16QImode:
33461 if (!TARGET_SSE4_1)
33462 break;
33463 /* FALLTHRU */
33464
33465 case V8HImode:
33466 if (!TARGET_SSE2)
33467 break;
33468
33469 /* Don't use ix86_expand_vector_init_interleave if we can't
33470 move from GPR to SSE register directly. */
33471 if (!TARGET_INTER_UNIT_MOVES)
33472 break;
33473
33474 n = GET_MODE_NUNITS (mode);
33475 for (i = 0; i < n; i++)
33476 ops[i] = XVECEXP (vals, 0, i);
33477 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33478 return;
33479
33480 case V4HImode:
33481 case V8QImode:
33482 break;
33483
33484 default:
33485 gcc_unreachable ();
33486 }
33487
33488 {
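/* Fallback for the remaining MMX/SSE modes: pack the elements of each
   word_mode chunk into an integer (first element in the low bits), then
   assemble the vector from those words -- directly, through its low/high
   word_mode parts, or by recursing as a V4SImode initialization.  */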
33489 int i, j, n_elts, n_words, n_elt_per_word;
33490 enum machine_mode inner_mode;
33491 rtx words[4], shift;
33492
33493 inner_mode = GET_MODE_INNER (mode);
33494 n_elts = GET_MODE_NUNITS (mode);
33495 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33496 n_elt_per_word = n_elts / n_words;
33497 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33498
33499 for (i = 0; i < n_words; ++i)
33500 {
33501 rtx word = NULL_RTX;
33502
33503 for (j = 0; j < n_elt_per_word; ++j)
33504 {
33505 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33506 elt = convert_modes (word_mode, inner_mode, elt, true);
33507
33508 if (j == 0)
33509 word = elt;
33510 else
33511 {
33512 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33513 word, 1, OPTAB_LIB_WIDEN);
33514 word = expand_simple_binop (word_mode, IOR, word, elt,
33515 word, 1, OPTAB_LIB_WIDEN);
33516 }
33517 }
33518
33519 words[i] = word;
33520 }
33521
33522 if (n_words == 1)
33523 emit_move_insn (target, gen_lowpart (mode, words[0]));
33524 else if (n_words == 2)
33525 {
33526 rtx tmp = gen_reg_rtx (mode);
33527 emit_clobber (tmp);
33528 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33529 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33530 emit_move_insn (target, tmp);
33531 }
33532 else if (n_words == 4)
33533 {
33534 rtx tmp = gen_reg_rtx (V4SImode);
33535 gcc_assert (word_mode == SImode);
33536 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33537 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33538 emit_move_insn (target, gen_lowpart (mode, tmp));
33539 }
33540 else
33541 gcc_unreachable ();
33542 }
33543 }
33544
33545 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33546 instructions unless MMX_OK is true. */
33547
33548 void
33549 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33550 {
33551 enum machine_mode mode = GET_MODE (target);
33552 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33553 int n_elts = GET_MODE_NUNITS (mode);
33554 int n_var = 0, one_var = -1;
33555 bool all_same = true, all_const_zero = true;
33556 int i;
33557 rtx x;
33558
33559 for (i = 0; i < n_elts; ++i)
33560 {
33561 x = XVECEXP (vals, 0, i);
33562 if (!(CONST_INT_P (x)
33563 || GET_CODE (x) == CONST_DOUBLE
33564 || GET_CODE (x) == CONST_FIXED))
33565 n_var++, one_var = i;
33566 else if (x != CONST0_RTX (inner_mode))
33567 all_const_zero = false;
33568 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33569 all_same = false;
33570 }
33571
33572 /* Constants are best loaded from the constant pool. */
33573 if (n_var == 0)
33574 {
33575 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33576 return;
33577 }
33578
33579 /* If all values are identical, broadcast the value. */
33580 if (all_same
33581 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33582 XVECEXP (vals, 0, 0)))
33583 return;
33584
33585 /* Values where only one field is non-constant are best loaded from
33586 the pool and overwritten via move later. */
33587 if (n_var == 1)
33588 {
33589 if (all_const_zero
33590 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33591 XVECEXP (vals, 0, one_var),
33592 one_var))
33593 return;
33594
33595 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33596 return;
33597 }
33598
33599 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33600 }
33601
33602 void
33603 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33604 {
33605 enum machine_mode mode = GET_MODE (target);
33606 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33607 enum machine_mode half_mode;
33608 bool use_vec_merge = false;
33609 rtx tmp;
33610 static rtx (*gen_extract[6][2]) (rtx, rtx)
33611 = {
33612 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33613 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33614 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33615 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33616 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33617 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33618 };
33619 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33620 = {
33621 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33622 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33623 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33624 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33625 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33626 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33627 };
33628 int i, j, n;
33629
33630 switch (mode)
33631 {
33632 case V2SFmode:
33633 case V2SImode:
33634 if (mmx_ok)
33635 {
33636 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33637 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33638 if (elt == 0)
33639 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33640 else
33641 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33642 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33643 return;
33644 }
33645 break;
33646
33647 case V2DImode:
33648 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33649 if (use_vec_merge)
33650 break;
33651
33652 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33653 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33654 if (elt == 0)
33655 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33656 else
33657 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33658 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33659 return;
33660
33661 case V2DFmode:
33662 {
33663 rtx op0, op1;
33664
33665 /* For the two element vectors, we implement a VEC_CONCAT with
33666 the extraction of the other element. */
33667
33668 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33669 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33670
33671 if (elt == 0)
33672 op0 = val, op1 = tmp;
33673 else
33674 op0 = tmp, op1 = val;
33675
33676 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33677 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33678 }
33679 return;
33680
33681 case V4SFmode:
33682 use_vec_merge = TARGET_SSE4_1;
33683 if (use_vec_merge)
33684 break;
33685
33686 switch (elt)
33687 {
33688 case 0:
33689 use_vec_merge = true;
33690 break;
33691
33692 case 1:
33693 /* tmp = target = A B C D */
33694 tmp = copy_to_reg (target);
33695 /* target = A A B B */
33696 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33697 /* target = X A B B */
33698 ix86_expand_vector_set (false, target, val, 0);
33699 /* target = A X C D */
33700 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33701 const1_rtx, const0_rtx,
33702 GEN_INT (2+4), GEN_INT (3+4)));
33703 return;
33704
33705 case 2:
33706 /* tmp = target = A B C D */
33707 tmp = copy_to_reg (target);
33708 /* tmp = X B C D */
33709 ix86_expand_vector_set (false, tmp, val, 0);
33710 /* target = A B X D */
33711 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33712 const0_rtx, const1_rtx,
33713 GEN_INT (0+4), GEN_INT (3+4)));
33714 return;
33715
33716 case 3:
33717 /* tmp = target = A B C D */
33718 tmp = copy_to_reg (target);
33719 /* tmp = X B C D */
33720 ix86_expand_vector_set (false, tmp, val, 0);
33721 /* target = A B C X */
33722 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33723 const0_rtx, const1_rtx,
33724 GEN_INT (2+4), GEN_INT (0+4)));
33725 return;
33726
33727 default:
33728 gcc_unreachable ();
33729 }
33730 break;
33731
33732 case V4SImode:
33733 use_vec_merge = TARGET_SSE4_1;
33734 if (use_vec_merge)
33735 break;
33736
33737 /* Element 0 handled by vec_merge below. */
33738 if (elt == 0)
33739 {
33740 use_vec_merge = true;
33741 break;
33742 }
33743
33744 if (TARGET_SSE2)
33745 {
33746 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33747 store into element 0, then shuffle them back. */
33748
33749 rtx order[4];
33750
33751 order[0] = GEN_INT (elt);
33752 order[1] = const1_rtx;
33753 order[2] = const2_rtx;
33754 order[3] = GEN_INT (3);
33755 order[elt] = const0_rtx;
33756
33757 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33758 order[1], order[2], order[3]));
33759
33760 ix86_expand_vector_set (false, target, val, 0);
33761
33762 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33763 order[1], order[2], order[3]));
33764 }
33765 else
33766 {
33767 /* For SSE1, we have to reuse the V4SF code. */
33768 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33769 gen_lowpart (SFmode, val), elt);
33770 }
33771 return;
33772
33773 case V8HImode:
33774 use_vec_merge = TARGET_SSE2;
33775 break;
33776 case V4HImode:
33777 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33778 break;
33779
33780 case V16QImode:
33781 use_vec_merge = TARGET_SSE4_1;
33782 break;
33783
33784 case V8QImode:
33785 break;
33786
33787 case V32QImode:
33788 half_mode = V16QImode;
33789 j = 0;
33790 n = 16;
33791 goto half;
33792
33793 case V16HImode:
33794 half_mode = V8HImode;
33795 j = 1;
33796 n = 8;
33797 goto half;
33798
33799 case V8SImode:
33800 half_mode = V4SImode;
33801 j = 2;
33802 n = 4;
33803 goto half;
33804
33805 case V4DImode:
33806 half_mode = V2DImode;
33807 j = 3;
33808 n = 2;
33809 goto half;
33810
33811 case V8SFmode:
33812 half_mode = V4SFmode;
33813 j = 4;
33814 n = 4;
33815 goto half;
33816
33817 case V4DFmode:
33818 half_mode = V2DFmode;
33819 j = 5;
33820 n = 2;
33821 goto half;
33822
33823 half:
33824 /* Compute offset. */
33825 i = elt / n;
33826 elt %= n;
33827
33828 gcc_assert (i <= 1);
33829
33830 /* Extract the half. */
33831 tmp = gen_reg_rtx (half_mode);
33832 emit_insn (gen_extract[j][i] (tmp, target));
33833
33834 /* Put val in tmp at elt. */
33835 ix86_expand_vector_set (false, tmp, val, elt);
33836
33837 /* Put it back. */
33838 emit_insn (gen_insert[j][i] (target, target, tmp));
33839 return;
33840
33841 default:
33842 break;
33843 }
33844
33845 if (use_vec_merge)
33846 {
33847 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33848 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33849 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33850 }
33851 else
33852 {
33853 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33854
33855 emit_move_insn (mem, target);
33856
33857 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33858 emit_move_insn (tmp, val);
33859
33860 emit_move_insn (target, mem);
33861 }
33862 }
33863
33864 void
33865 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33866 {
33867 enum machine_mode mode = GET_MODE (vec);
33868 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33869 bool use_vec_extr = false;
33870 rtx tmp;
33871
33872 switch (mode)
33873 {
33874 case V2SImode:
33875 case V2SFmode:
33876 if (!mmx_ok)
33877 break;
33878 /* FALLTHRU */
33879
33880 case V2DFmode:
33881 case V2DImode:
33882 use_vec_extr = true;
33883 break;
33884
33885 case V4SFmode:
33886 use_vec_extr = TARGET_SSE4_1;
33887 if (use_vec_extr)
33888 break;
33889
33890 switch (elt)
33891 {
33892 case 0:
33893 tmp = vec;
33894 break;
33895
33896 case 1:
33897 case 3:
33898 tmp = gen_reg_rtx (mode);
33899 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33900 GEN_INT (elt), GEN_INT (elt),
33901 GEN_INT (elt+4), GEN_INT (elt+4)));
33902 break;
33903
33904 case 2:
33905 tmp = gen_reg_rtx (mode);
33906 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33907 break;
33908
33909 default:
33910 gcc_unreachable ();
33911 }
33912 vec = tmp;
33913 use_vec_extr = true;
33914 elt = 0;
33915 break;
33916
33917 case V4SImode:
33918 use_vec_extr = TARGET_SSE4_1;
33919 if (use_vec_extr)
33920 break;
33921
33922 if (TARGET_SSE2)
33923 {
33924 switch (elt)
33925 {
33926 case 0:
33927 tmp = vec;
33928 break;
33929
33930 case 1:
33931 case 3:
33932 tmp = gen_reg_rtx (mode);
33933 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33934 GEN_INT (elt), GEN_INT (elt),
33935 GEN_INT (elt), GEN_INT (elt)));
33936 break;
33937
33938 case 2:
33939 tmp = gen_reg_rtx (mode);
33940 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33941 break;
33942
33943 default:
33944 gcc_unreachable ();
33945 }
33946 vec = tmp;
33947 use_vec_extr = true;
33948 elt = 0;
33949 }
33950 else
33951 {
33952 /* For SSE1, we have to reuse the V4SF code. */
33953 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33954 gen_lowpart (V4SFmode, vec), elt);
33955 return;
33956 }
33957 break;
33958
33959 case V8HImode:
33960 use_vec_extr = TARGET_SSE2;
33961 break;
33962 case V4HImode:
33963 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33964 break;
33965
33966 case V16QImode:
33967 use_vec_extr = TARGET_SSE4_1;
33968 break;
33969
33970 case V8SFmode:
33971 if (TARGET_AVX)
33972 {
33973 tmp = gen_reg_rtx (V4SFmode);
33974 if (elt < 4)
33975 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33976 else
33977 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33978 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33979 return;
33980 }
33981 break;
33982
33983 case V4DFmode:
33984 if (TARGET_AVX)
33985 {
33986 tmp = gen_reg_rtx (V2DFmode);
33987 if (elt < 2)
33988 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33989 else
33990 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33991 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33992 return;
33993 }
33994 break;
33995
33996 case V32QImode:
33997 if (TARGET_AVX)
33998 {
33999 tmp = gen_reg_rtx (V16QImode);
34000 if (elt < 16)
34001 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34002 else
34003 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34004 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34005 return;
34006 }
34007 break;
34008
34009 case V16HImode:
34010 if (TARGET_AVX)
34011 {
34012 tmp = gen_reg_rtx (V8HImode);
34013 if (elt < 8)
34014 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34015 else
34016 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34017 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34018 return;
34019 }
34020 break;
34021
34022 case V8SImode:
34023 if (TARGET_AVX)
34024 {
34025 tmp = gen_reg_rtx (V4SImode);
34026 if (elt < 4)
34027 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34028 else
34029 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34030 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34031 return;
34032 }
34033 break;
34034
34035 case V4DImode:
34036 if (TARGET_AVX)
34037 {
34038 tmp = gen_reg_rtx (V2DImode);
34039 if (elt < 2)
34040 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34041 else
34042 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34043 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34044 return;
34045 }
34046 break;
34047
34048 case V8QImode:
34049 /* ??? Could extract the appropriate HImode element and shift. */
34050 default:
34051 break;
34052 }
34053
34054 if (use_vec_extr)
34055 {
34056 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34057 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34058
34059 /* Let the rtl optimizers know about the zero extension performed. */
34060 if (inner_mode == QImode || inner_mode == HImode)
34061 {
34062 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34063 target = gen_lowpart (SImode, target);
34064 }
34065
34066 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34067 }
34068 else
34069 {
34070 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34071
34072 emit_move_insn (mem, vec);
34073
34074 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34075 emit_move_insn (target, tmp);
34076 }
34077 }
34078
34079 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34080 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34081 The upper bits of DEST are undefined, though they shouldn't cause
34082 exceptions (some bits from src or all zeros are ok). */
34083
34084 static void
34085 emit_reduc_half (rtx dest, rtx src, int i)
34086 {
34087 rtx tem;
34088 switch (GET_MODE (src))
34089 {
34090 case V4SFmode:
34091 if (i == 128)
34092 tem = gen_sse_movhlps (dest, src, src);
34093 else
34094 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34095 GEN_INT (1 + 4), GEN_INT (1 + 4));
34096 break;
34097 case V2DFmode:
34098 tem = gen_vec_interleave_highv2df (dest, src, src);
34099 break;
34100 case V16QImode:
34101 case V8HImode:
34102 case V4SImode:
34103 case V2DImode:
34104 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34105 gen_lowpart (V1TImode, src),
34106 GEN_INT (i / 2));
34107 break;
34108 case V8SFmode:
34109 if (i == 256)
34110 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34111 else
34112 tem = gen_avx_shufps256 (dest, src, src,
34113 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34114 break;
34115 case V4DFmode:
34116 if (i == 256)
34117 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34118 else
34119 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34120 break;
34121 case V32QImode:
34122 case V16HImode:
34123 case V8SImode:
34124 case V4DImode:
34125 if (i == 256)
34126 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34127 gen_lowpart (V4DImode, src),
34128 gen_lowpart (V4DImode, src),
34129 const1_rtx);
34130 else
34131 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34132 gen_lowpart (V2TImode, src),
34133 GEN_INT (i / 2));
34134 break;
34135 default:
34136 gcc_unreachable ();
34137 }
34138 emit_insn (tem);
34139 }
34140
34141 /* Expand a vector reduction. FN is the binary pattern to reduce;
34142 DEST is the destination; IN is the input vector. */
34143
34144 void
34145 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34146 {
34147 rtx half, dst, vec = in;
34148 enum machine_mode mode = GET_MODE (in);
34149 int i;
34150
34151 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34152 if (TARGET_SSE4_1
34153 && mode == V8HImode
34154 && fn == gen_uminv8hi3)
34155 {
34156 emit_insn (gen_sse4_1_phminposuw (dest, in));
34157 return;
34158 }
34159
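/* Fold the vector in halves: emit_reduc_half brings the upper half of
   VEC down into the low bits of HALF, FN combines the two, and the width
   that still matters is halved each iteration until only the low element
   is live; the last combination is written directly to DEST.  */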
34160 for (i = GET_MODE_BITSIZE (mode);
34161 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34162 i >>= 1)
34163 {
34164 half = gen_reg_rtx (mode);
34165 emit_reduc_half (half, vec, i);
34166 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34167 dst = dest;
34168 else
34169 dst = gen_reg_rtx (mode);
34170 emit_insn (fn (dst, half, vec));
34171 vec = dst;
34172 }
34173 }
34174 \f
34175 /* Target hook for scalar_mode_supported_p. */
34176 static bool
34177 ix86_scalar_mode_supported_p (enum machine_mode mode)
34178 {
34179 if (DECIMAL_FLOAT_MODE_P (mode))
34180 return default_decimal_float_supported_p ();
34181 else if (mode == TFmode)
34182 return true;
34183 else
34184 return default_scalar_mode_supported_p (mode);
34185 }
34186
34187 /* Implements target hook vector_mode_supported_p. */
34188 static bool
34189 ix86_vector_mode_supported_p (enum machine_mode mode)
34190 {
34191 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34192 return true;
34193 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34194 return true;
34195 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34196 return true;
34197 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34198 return true;
34199 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34200 return true;
34201 return false;
34202 }
34203
34204 /* Target hook for c_mode_for_suffix. */
34205 static enum machine_mode
34206 ix86_c_mode_for_suffix (char suffix)
34207 {
34208 if (suffix == 'q')
34209 return TFmode;
34210 if (suffix == 'w')
34211 return XFmode;
34212
34213 return VOIDmode;
34214 }
34215
34216 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34217
34218 We do this in the new i386 backend to maintain source compatibility
34219 with the old cc0-based compiler. */
34220
34221 static tree
34222 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34223 tree inputs ATTRIBUTE_UNUSED,
34224 tree clobbers)
34225 {
34226 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34227 clobbers);
34228 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34229 clobbers);
34230 return clobbers;
34231 }
34232
34233 /* Implements the targetm.encode_section_info target hook. */
34234
34235 static void ATTRIBUTE_UNUSED
34236 ix86_encode_section_info (tree decl, rtx rtl, int first)
34237 {
34238 default_encode_section_info (decl, rtl, first);
34239
34240 if (TREE_CODE (decl) == VAR_DECL
34241 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34242 && ix86_in_large_data_p (decl))
34243 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34244 }
34245
34246 /* Worker function for REVERSE_CONDITION. */
34247
34248 enum rtx_code
34249 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34250 {
34251 return (mode != CCFPmode && mode != CCFPUmode
34252 ? reverse_condition (code)
34253 : reverse_condition_maybe_unordered (code));
34254 }
34255
34256 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34257 to OPERANDS[0]. */
34258
34259 const char *
34260 output_387_reg_move (rtx insn, rtx *operands)
34261 {
34262 if (REG_P (operands[0]))
34263 {
34264 if (REG_P (operands[1])
34265 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34266 {
34267 if (REGNO (operands[0]) == FIRST_STACK_REG)
34268 return output_387_ffreep (operands, 0);
34269 return "fstp\t%y0";
34270 }
34271 if (STACK_TOP_P (operands[0]))
34272 return "fld%Z1\t%y1";
34273 return "fst\t%y0";
34274 }
34275 else if (MEM_P (operands[0]))
34276 {
34277 gcc_assert (REG_P (operands[1]));
34278 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34279 return "fstp%Z0\t%y0";
34280 else
34281 {
34282 /* There is no non-popping store to memory for XFmode.
34283 So if we need one, follow the store with a load. */
34284 if (GET_MODE (operands[0]) == XFmode)
34285 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34286 else
34287 return "fst%Z0\t%y0";
34288 }
34289 }
34290 else
34291 gcc_unreachable ();
34292 }
34293
34294 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34295 the FP status register is set. */
34296
34297 void
34298 ix86_emit_fp_unordered_jump (rtx label)
34299 {
34300 rtx reg = gen_reg_rtx (HImode);
34301 rtx temp;
34302
34303 emit_insn (gen_x86_fnstsw_1 (reg));
34304
34305 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34306 {
34307 emit_insn (gen_x86_sahf_1 (reg));
34308
34309 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34310 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34311 }
34312 else
34313 {
34314 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34315
34316 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34317 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34318 }
34319
34320 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34321 gen_rtx_LABEL_REF (VOIDmode, label),
34322 pc_rtx);
34323 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34324
34325 emit_jump_insn (temp);
34326 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34327 }
34328
34329 /* Output code to perform a log1p XFmode calculation. */
34330
34331 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34332 {
34333 rtx label1 = gen_label_rtx ();
34334 rtx label2 = gen_label_rtx ();
34335
34336 rtx tmp = gen_reg_rtx (XFmode);
34337 rtx tmp2 = gen_reg_rtx (XFmode);
34338 rtx test;
34339
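/* The i387 fyl2xp1 instruction is only specified for arguments with
   |x| < 1 - sqrt(2)/2 (about 0.29289), so compare |op1| against that
   constant and fall back to computing log (1 + op1) with fyl2x when op1
   is outside that range.  */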
34340 emit_insn (gen_absxf2 (tmp, op1));
34341 test = gen_rtx_GE (VOIDmode, tmp,
34342 CONST_DOUBLE_FROM_REAL_VALUE (
34343 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34344 XFmode));
34345 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34346
34347 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34348 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34349 emit_jump (label2);
34350
34351 emit_label (label1);
34352 emit_move_insn (tmp, CONST1_RTX (XFmode));
34353 emit_insn (gen_addxf3 (tmp, op1, tmp));
34354 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34355 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34356
34357 emit_label (label2);
34358 }
34359
34360 /* Emit i387 code for computing round (OP1), storing the result into OP0. */
34361 void ix86_emit_i387_round (rtx op0, rtx op1)
34362 {
34363 enum machine_mode inmode = GET_MODE (op1);
34364 enum machine_mode outmode = GET_MODE (op0);
34365 rtx e1, e2, res, tmp, tmp1, half;
34366 rtx scratch = gen_reg_rtx (HImode);
34367 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34368 rtx jump_label = gen_label_rtx ();
34369 rtx insn;
34370 rtx (*gen_abs) (rtx, rtx);
34371 rtx (*gen_neg) (rtx, rtx);
34372
34373 switch (inmode)
34374 {
34375 case SFmode:
34376 gen_abs = gen_abssf2;
34377 break;
34378 case DFmode:
34379 gen_abs = gen_absdf2;
34380 break;
34381 case XFmode:
34382 gen_abs = gen_absxf2;
34383 break;
34384 default:
34385 gcc_unreachable ();
34386 }
34387
34388 switch (outmode)
34389 {
34390 case SFmode:
34391 gen_neg = gen_negsf2;
34392 break;
34393 case DFmode:
34394 gen_neg = gen_negdf2;
34395 break;
34396 case XFmode:
34397 gen_neg = gen_negxf2;
34398 break;
34399 case HImode:
34400 gen_neg = gen_neghi2;
34401 break;
34402 case SImode:
34403 gen_neg = gen_negsi2;
34404 break;
34405 case DImode:
34406 gen_neg = gen_negdi2;
34407 break;
34408 default:
34409 gcc_unreachable ();
34410 }
34411
34412 e1 = gen_reg_rtx (inmode);
34413 e2 = gen_reg_rtx (inmode);
34414 res = gen_reg_rtx (outmode);
34415
34416 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34417
34418 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
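/* I.e. round to nearest, with halfway cases rounded away from zero.
   fxam captures the sign of op1 below so that the floor (|a| + 0.5)
   result can be negated when op1 is negative.  */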
34419
34420 /* scratch = fxam(op1) */
34421 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34422 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34423 UNSPEC_FXAM)));
34424 /* e1 = fabs(op1) */
34425 emit_insn (gen_abs (e1, op1));
34426
34427 /* e2 = e1 + 0.5 */
34428 half = force_reg (inmode, half);
34429 emit_insn (gen_rtx_SET (VOIDmode, e2,
34430 gen_rtx_PLUS (inmode, e1, half)));
34431
34432 /* res = floor(e2) */
34433 if (inmode != XFmode)
34434 {
34435 tmp1 = gen_reg_rtx (XFmode);
34436
34437 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34438 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34439 }
34440 else
34441 tmp1 = e2;
34442
34443 switch (outmode)
34444 {
34445 case SFmode:
34446 case DFmode:
34447 {
34448 rtx tmp0 = gen_reg_rtx (XFmode);
34449
34450 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34451
34452 emit_insn (gen_rtx_SET (VOIDmode, res,
34453 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34454 UNSPEC_TRUNC_NOOP)));
34455 }
34456 break;
34457 case XFmode:
34458 emit_insn (gen_frndintxf2_floor (res, tmp1));
34459 break;
34460 case HImode:
34461 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34462 break;
34463 case SImode:
34464 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34465 break;
34466 case DImode:
34467 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34468 break;
34469 default:
34470 gcc_unreachable ();
34471 }
34472
34473 /* flags = signbit(a) */
34474 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34475
34476 /* if (flags) then res = -res */
34477 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34478 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34479 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34480 pc_rtx);
34481 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34482 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34483 JUMP_LABEL (insn) = jump_label;
34484
34485 emit_insn (gen_neg (res, res));
34486
34487 emit_label (jump_label);
34488 LABEL_NUSES (jump_label) = 1;
34489
34490 emit_move_insn (op0, res);
34491 }
34492
34493 /* Output code to perform a Newton-Raphson approximation of a single precision
34494 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34495
34496 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34497 {
34498 rtx x0, x1, e0, e1;
34499
34500 x0 = gen_reg_rtx (mode);
34501 e0 = gen_reg_rtx (mode);
34502 e1 = gen_reg_rtx (mode);
34503 x1 = gen_reg_rtx (mode);
34504
34505 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
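/* This is one Newton-Raphson step for 1/b starting from x0 = rcp(b):
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0), and then
   a / b ~= a * x1.  */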
34506
34507 b = force_reg (mode, b);
34508
34509 /* x0 = rcp(b) estimate */
34510 emit_insn (gen_rtx_SET (VOIDmode, x0,
34511 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34512 UNSPEC_RCP)));
34513 /* e0 = x0 * b */
34514 emit_insn (gen_rtx_SET (VOIDmode, e0,
34515 gen_rtx_MULT (mode, x0, b)));
34516
34517 /* e0 = x0 * e0 */
34518 emit_insn (gen_rtx_SET (VOIDmode, e0,
34519 gen_rtx_MULT (mode, x0, e0)));
34520
34521 /* e1 = x0 + x0 */
34522 emit_insn (gen_rtx_SET (VOIDmode, e1,
34523 gen_rtx_PLUS (mode, x0, x0)));
34524
34525 /* x1 = e1 - e0 */
34526 emit_insn (gen_rtx_SET (VOIDmode, x1,
34527 gen_rtx_MINUS (mode, e1, e0)));
34528
34529 /* res = a * x1 */
34530 emit_insn (gen_rtx_SET (VOIDmode, res,
34531 gen_rtx_MULT (mode, a, x1)));
34532 }
34533
34534 /* Output code to perform a Newton-Raphson approximation of a
34535 single precision floating point [reciprocal] square root. */
34536
34537 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34538 bool recip)
34539 {
34540 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34541 REAL_VALUE_TYPE r;
34542
34543 x0 = gen_reg_rtx (mode);
34544 e0 = gen_reg_rtx (mode);
34545 e1 = gen_reg_rtx (mode);
34546 e2 = gen_reg_rtx (mode);
34547 e3 = gen_reg_rtx (mode);
34548
34549 real_from_integer (&r, VOIDmode, -3, -1, 0);
34550 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34551
34552 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34553 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34554
34555 if (VECTOR_MODE_P (mode))
34556 {
34557 mthree = ix86_build_const_vector (mode, true, mthree);
34558 mhalf = ix86_build_const_vector (mode, true, mhalf);
34559 }
34560
34561 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34562 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
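/* Both follow from one Newton-Raphson step for 1/sqrt(a) starting from
   x0 = rsqrtss(a): x1 = 0.5 * x0 * (3 - a * x0 * x0)
   = -0.5 * x0 * (a * x0 * x0 - 3), with sqrt(a) = a * rsqrt(a).  */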
34563
34564 a = force_reg (mode, a);
34565
34566 /* x0 = rsqrt(a) estimate */
34567 emit_insn (gen_rtx_SET (VOIDmode, x0,
34568 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34569 UNSPEC_RSQRT)));
34570
34571 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
34572 if (!recip)
34573 {
34574 rtx zero, mask;
34575
34576 zero = gen_reg_rtx (mode);
34577 mask = gen_reg_rtx (mode);
34578
34579 zero = force_reg (mode, CONST0_RTX(mode));
34580 emit_insn (gen_rtx_SET (VOIDmode, mask,
34581 gen_rtx_NE (mode, zero, a)));
34582
34583 emit_insn (gen_rtx_SET (VOIDmode, x0,
34584 gen_rtx_AND (mode, x0, mask)));
34585 }
34586
34587 /* e0 = x0 * a */
34588 emit_insn (gen_rtx_SET (VOIDmode, e0,
34589 gen_rtx_MULT (mode, x0, a)));
34590 /* e1 = e0 * x0 */
34591 emit_insn (gen_rtx_SET (VOIDmode, e1,
34592 gen_rtx_MULT (mode, e0, x0)));
34593
34594 /* e2 = e1 - 3. */
34595 mthree = force_reg (mode, mthree);
34596 emit_insn (gen_rtx_SET (VOIDmode, e2,
34597 gen_rtx_PLUS (mode, e1, mthree)));
34598
34599 mhalf = force_reg (mode, mhalf);
34600 if (recip)
34601 /* e3 = -.5 * x0 */
34602 emit_insn (gen_rtx_SET (VOIDmode, e3,
34603 gen_rtx_MULT (mode, x0, mhalf)));
34604 else
34605 /* e3 = -.5 * e0 */
34606 emit_insn (gen_rtx_SET (VOIDmode, e3,
34607 gen_rtx_MULT (mode, e0, mhalf)));
34608 /* ret = e2 * e3 */
34609 emit_insn (gen_rtx_SET (VOIDmode, res,
34610 gen_rtx_MULT (mode, e2, e3)));
34611 }
34612
34613 #ifdef TARGET_SOLARIS
34614 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34615
34616 static void
34617 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34618 tree decl)
34619 {
34620 /* With Binutils 2.15, the "@unwind" marker must be specified on
34621 every occurrence of the ".eh_frame" section, not just the first
34622 one. */
34623 if (TARGET_64BIT
34624 && strcmp (name, ".eh_frame") == 0)
34625 {
34626 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34627 flags & SECTION_WRITE ? "aw" : "a");
34628 return;
34629 }
34630
34631 #ifndef USE_GAS
34632 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34633 {
34634 solaris_elf_asm_comdat_section (name, flags, decl);
34635 return;
34636 }
34637 #endif
34638
34639 default_elf_asm_named_section (name, flags, decl);
34640 }
34641 #endif /* TARGET_SOLARIS */
34642
34643 /* Return the mangling of TYPE if it is an extended fundamental type. */
34644
34645 static const char *
34646 ix86_mangle_type (const_tree type)
34647 {
34648 type = TYPE_MAIN_VARIANT (type);
34649
34650 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34651 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34652 return NULL;
34653
34654 switch (TYPE_MODE (type))
34655 {
34656 case TFmode:
34657 /* __float128 is "g". */
34658 return "g";
34659 case XFmode:
34660 /* "long double" or __float80 is "e". */
34661 return "e";
34662 default:
34663 return NULL;
34664 }
34665 }
34666
34667 /* For 32-bit code we can save PIC register setup by using
34668 __stack_chk_fail_local hidden function instead of calling
34669 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34670 register, so it is better to call __stack_chk_fail directly. */
34671
34672 static tree ATTRIBUTE_UNUSED
34673 ix86_stack_protect_fail (void)
34674 {
34675 return TARGET_64BIT
34676 ? default_external_stack_protect_fail ()
34677 : default_hidden_stack_protect_fail ();
34678 }
34679
34680 /* Select a format to encode pointers in exception handling data. CODE
34681 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34682 true if the symbol may be affected by dynamic relocations.
34683
34684 ??? All x86 object file formats are capable of representing this.
34685 After all, the relocation needed is the same as for the call insn.
34686 Whether or not a particular assembler allows us to enter such, I
34687 guess we'll have to see. */
34688 int
34689 asm_preferred_eh_data_format (int code, int global)
34690 {
34691 if (flag_pic)
34692 {
34693 int type = DW_EH_PE_sdata8;
34694 if (!TARGET_64BIT
34695 || ix86_cmodel == CM_SMALL_PIC
34696 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34697 type = DW_EH_PE_sdata4;
34698 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34699 }
34700 if (ix86_cmodel == CM_SMALL
34701 || (ix86_cmodel == CM_MEDIUM && code))
34702 return DW_EH_PE_udata4;
34703 return DW_EH_PE_absptr;
34704 }
34705 \f
34706 /* Expand copysign from SIGN to the positive value ABS_VALUE
34707 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34708 the sign-bit. */
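/* In other words RESULT = ABS_VALUE | (SIGN & sign-bit), where the
   sign-bit mask is built here when MASK is null and otherwise obtained
   by inverting MASK.  */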
34709 static void
34710 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34711 {
34712 enum machine_mode mode = GET_MODE (sign);
34713 rtx sgn = gen_reg_rtx (mode);
34714 if (mask == NULL_RTX)
34715 {
34716 enum machine_mode vmode;
34717
34718 if (mode == SFmode)
34719 vmode = V4SFmode;
34720 else if (mode == DFmode)
34721 vmode = V2DFmode;
34722 else
34723 vmode = mode;
34724
34725 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34726 if (!VECTOR_MODE_P (mode))
34727 {
34728 /* We need to generate a scalar mode mask in this case. */
34729 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34730 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34731 mask = gen_reg_rtx (mode);
34732 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34733 }
34734 }
34735 else
34736 mask = gen_rtx_NOT (mode, mask);
34737 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34738 gen_rtx_AND (mode, mask, sign)));
34739 emit_insn (gen_rtx_SET (VOIDmode, result,
34740 gen_rtx_IOR (mode, abs_value, sgn)));
34741 }
34742
34743 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34744 mask for masking out the sign-bit is stored in *SMASK, if that is
34745 non-null. */
34746 static rtx
34747 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34748 {
34749 enum machine_mode vmode, mode = GET_MODE (op0);
34750 rtx xa, mask;
34751
34752 xa = gen_reg_rtx (mode);
34753 if (mode == SFmode)
34754 vmode = V4SFmode;
34755 else if (mode == DFmode)
34756 vmode = V2DFmode;
34757 else
34758 vmode = mode;
34759 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34760 if (!VECTOR_MODE_P (mode))
34761 {
34762 /* We need to generate a scalar mode mask in this case. */
34763 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34764 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34765 mask = gen_reg_rtx (mode);
34766 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34767 }
34768 emit_insn (gen_rtx_SET (VOIDmode, xa,
34769 gen_rtx_AND (mode, op0, mask)));
34770
34771 if (smask)
34772 *smask = mask;
34773
34774 return xa;
34775 }
34776
34777 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34778 swapping the operands if SWAP_OPERANDS is true. The expanded
34779 code is a forward jump to a newly created label in case the
34780 comparison is true. The generated label rtx is returned. */
34781 static rtx
34782 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34783 bool swap_operands)
34784 {
34785 rtx label, tmp;
34786
34787 if (swap_operands)
34788 {
34789 tmp = op0;
34790 op0 = op1;
34791 op1 = tmp;
34792 }
34793
34794 label = gen_label_rtx ();
34795 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34796 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34797 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34798 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34799 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34800 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34801 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34802 JUMP_LABEL (tmp) = label;
34803
34804 return label;
34805 }
34806
34807 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34808 using comparison code CODE. Operands are swapped for the comparison if
34809 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34810 static rtx
34811 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34812 bool swap_operands)
34813 {
34814 rtx (*insn)(rtx, rtx, rtx, rtx);
34815 enum machine_mode mode = GET_MODE (op0);
34816 rtx mask = gen_reg_rtx (mode);
34817
34818 if (swap_operands)
34819 {
34820 rtx tmp = op0;
34821 op0 = op1;
34822 op1 = tmp;
34823 }
34824
34825 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34826
34827 emit_insn (insn (mask, op0, op1,
34828 gen_rtx_fmt_ee (code, mode, op0, op1)));
34829 return mask;
34830 }
34831
34832 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34833 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34834 static rtx
34835 ix86_gen_TWO52 (enum machine_mode mode)
34836 {
34837 REAL_VALUE_TYPE TWO52r;
34838 rtx TWO52;
34839
34840 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34841 TWO52 = const_double_from_real_value (TWO52r, mode);
34842 TWO52 = force_reg (mode, TWO52);
34843
34844 return TWO52;
34845 }
34846
34847 /* Expand SSE sequence for computing lround from OP1 storing
34848 into OP0. */
34849 void
34850 ix86_expand_lround (rtx op0, rtx op1)
34851 {
34852 /* C code for the stuff we're doing below:
34853 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34854 return (long)tmp;
34855 */
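/* nextafter (0.5, 0.0) is used instead of 0.5 so that the rounding done
   by the floating-point addition cannot push a value that is just below
   an x.5 boundary over it, while exact halfway cases still end up being
   rounded away from zero.  */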
34856 enum machine_mode mode = GET_MODE (op1);
34857 const struct real_format *fmt;
34858 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34859 rtx adj;
34860
34861 /* load nextafter (0.5, 0.0) */
34862 fmt = REAL_MODE_FORMAT (mode);
34863 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34864 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34865
34866 /* adj = copysign (0.5, op1) */
34867 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34868 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34869
34870 /* adj = op1 + adj */
34871 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34872
34873 /* op0 = (imode)adj */
34874 expand_fix (op0, adj, 0);
34875 }
34876
34877 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34878 into OPERAND0. */
34879 void
34880 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34881 {
34882 /* C code for the stuff we're doing below (for do_floor):
34883 xi = (long)op1;
34884 xi -= (double)xi > op1 ? 1 : 0;
34885 return xi;
34886 */
34887 enum machine_mode fmode = GET_MODE (op1);
34888 enum machine_mode imode = GET_MODE (op0);
34889 rtx ireg, freg, label, tmp;
34890
34891 /* reg = (long)op1 */
34892 ireg = gen_reg_rtx (imode);
34893 expand_fix (ireg, op1, 0);
34894
34895 /* freg = (double)reg */
34896 freg = gen_reg_rtx (fmode);
34897 expand_float (freg, ireg, 0);
34898
34899 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34900 label = ix86_expand_sse_compare_and_jump (UNLE,
34901 freg, op1, !do_floor);
34902 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34903 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34904 emit_move_insn (ireg, tmp);
34905
34906 emit_label (label);
34907 LABEL_NUSES (label) = 1;
34908
34909 emit_move_insn (op0, ireg);
34910 }
34911
34912 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34913 result in OPERAND0. */
34914 void
34915 ix86_expand_rint (rtx operand0, rtx operand1)
34916 {
34917 /* C code for the stuff we're doing below:
34918 xa = fabs (operand1);
34919 if (!isless (xa, 2**52))
34920 return operand1;
34921 xa = xa + 2**52 - 2**52;
34922 return copysign (xa, operand1);
34923 */
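/* Adding and then subtracting TWO52 (2**52 for DFmode, 2**23 for SFmode)
   rounds xa to an integer: once TWO52 is added, no fraction bits fit in
   the mantissa, so the rounding is performed by the addition itself.
   The copysign at the end also keeps -0.0 intact.  */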
34924 enum machine_mode mode = GET_MODE (operand0);
34925 rtx res, xa, label, TWO52, mask;
34926
34927 res = gen_reg_rtx (mode);
34928 emit_move_insn (res, operand1);
34929
34930 /* xa = abs (operand1) */
34931 xa = ix86_expand_sse_fabs (res, &mask);
34932
34933 /* if (!isless (xa, TWO52)) goto label; */
34934 TWO52 = ix86_gen_TWO52 (mode);
34935 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34936
34937 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34938 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34939
34940 ix86_sse_copysign_to_positive (res, xa, res, mask);
34941
34942 emit_label (label);
34943 LABEL_NUSES (label) = 1;
34944
34945 emit_move_insn (operand0, res);
34946 }
34947
34948 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34949 into OPERAND0. */
34950 void
34951 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34952 {
34953 /* C code for the stuff we expand below.
34954 double xa = fabs (x), x2;
34955 if (!isless (xa, TWO52))
34956 return x;
34957 xa = xa + TWO52 - TWO52;
34958 x2 = copysign (xa, x);
34959 Compensate. Floor:
34960 if (x2 > x)
34961 x2 -= 1;
34962 Compensate. Ceil:
34963 if (x2 < x)
34964 x2 -= -1;
34965 return x2;
34966 */
34967 enum machine_mode mode = GET_MODE (operand0);
34968 rtx xa, TWO52, tmp, label, one, res, mask;
34969
34970 TWO52 = ix86_gen_TWO52 (mode);
34971
34972 /* Temporary for holding the result, initialized to the input
34973 operand to ease control flow. */
34974 res = gen_reg_rtx (mode);
34975 emit_move_insn (res, operand1);
34976
34977 /* xa = abs (operand1) */
34978 xa = ix86_expand_sse_fabs (res, &mask);
34979
34980 /* if (!isless (xa, TWO52)) goto label; */
34981 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34982
34983 /* xa = xa + TWO52 - TWO52; */
34984 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34985 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34986
34987 /* xa = copysign (xa, operand1) */
34988 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34989
34990 /* generate 1.0 or -1.0 */
34991 one = force_reg (mode,
34992 const_double_from_real_value (do_floor
34993 ? dconst1 : dconstm1, mode));
34994
34995 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34996 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34997 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34998 gen_rtx_AND (mode, one, tmp)));
34999 /* We always need to subtract here to preserve signed zero. */
35000 tmp = expand_simple_binop (mode, MINUS,
35001 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35002 emit_move_insn (res, tmp);
35003
35004 emit_label (label);
35005 LABEL_NUSES (label) = 1;
35006
35007 emit_move_insn (operand0, res);
35008 }
35009
35010 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35011 into OPERAND0. */
35012 void
35013 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35014 {
35015 /* C code for the stuff we expand below.
35016 double xa = fabs (x), x2;
35017 if (!isless (xa, TWO52))
35018 return x;
35019 x2 = (double)(long)x;
35020 Compensate. Floor:
35021 if (x2 > x)
35022 x2 -= 1;
35023 Compensate. Ceil:
35024 if (x2 < x)
35025 x2 += 1;
35026 if (HONOR_SIGNED_ZEROS (mode))
35027 return copysign (x2, x);
35028 return x2;
35029 */
35030 enum machine_mode mode = GET_MODE (operand0);
35031 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35032
35033 TWO52 = ix86_gen_TWO52 (mode);
35034
35035 /* Temporary for holding the result, initialized to the input
35036 operand to ease control flow. */
35037 res = gen_reg_rtx (mode);
35038 emit_move_insn (res, operand1);
35039
35040 /* xa = abs (operand1) */
35041 xa = ix86_expand_sse_fabs (res, &mask);
35042
35043 /* if (!isless (xa, TWO52)) goto label; */
35044 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35045
35046 /* xa = (double)(long)x */
35047 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35048 expand_fix (xi, res, 0);
35049 expand_float (xa, xi, 0);
35050
35051 /* generate 1.0 */
35052 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35053
35054 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35055 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35056 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35057 gen_rtx_AND (mode, one, tmp)));
35058 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35059 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35060 emit_move_insn (res, tmp);
35061
35062 if (HONOR_SIGNED_ZEROS (mode))
35063 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35064
35065 emit_label (label);
35066 LABEL_NUSES (label) = 1;
35067
35068 emit_move_insn (operand0, res);
35069 }
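/* Hedged scalar equivalent of the truncate-and-compensate path above
   (illustration only; |x| < 2**52, so the integer conversion cannot
   overflow a 64-bit type):

     double x2 = (double) (long long) x;   /* truncates towards zero */
     if (do_floor && x2 > x)
       x2 -= 1.0;			    /* e.g. floor (-1.25): -1 -> -2 */
     if (!do_floor && x2 < x)
       x2 += 1.0;			    /* e.g. ceil (1.25):   1 -> 2  */

   On 64-bit targets the DFmode conversion maps onto cvttsd2siq; the
   *df_32 variants above exist because that instruction is not
   available on 32-bit targets.  */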
35070
35071 /* Expand SSE sequence for computing round from OPERAND1 storing
35072 into OPERAND0. Sequence that works without relying on DImode truncation
35073 via cvttsd2siq that is only available on 64bit targets. */
35074 void
35075 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35076 {
35077 /* C code for the stuff we expand below.
35078 double xa = fabs (x), xa2, x2;
35079 if (!isless (xa, TWO52))
35080 return x;
35081 Using the absolute value and copying back sign makes
35082 -0.0 -> -0.0 correct.
35083 xa2 = xa + TWO52 - TWO52;
35084 Compensate.
35085 dxa = xa2 - xa;
35086 if (dxa <= -0.5)
35087 xa2 += 1;
35088 else if (dxa > 0.5)
35089 xa2 -= 1;
35090 x2 = copysign (xa2, x);
35091 return x2;
35092 */
35093 enum machine_mode mode = GET_MODE (operand0);
35094 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35095
35096 TWO52 = ix86_gen_TWO52 (mode);
35097
35098 /* Temporary for holding the result, initialized to the input
35099 operand to ease control flow. */
35100 res = gen_reg_rtx (mode);
35101 emit_move_insn (res, operand1);
35102
35103 /* xa = abs (operand1) */
35104 xa = ix86_expand_sse_fabs (res, &mask);
35105
35106 /* if (!isless (xa, TWO52)) goto label; */
35107 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35108
35109 /* xa2 = xa + TWO52 - TWO52; */
35110 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35111 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35112
35113 /* dxa = xa2 - xa; */
35114 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35115
35116 /* generate 0.5, 1.0 and -0.5 */
35117 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35118 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35119 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35120 0, OPTAB_DIRECT);
35121
35122 /* Compensate. */
35123 tmp = gen_reg_rtx (mode);
35124 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35125 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35126 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35127 gen_rtx_AND (mode, one, tmp)));
35128 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35129 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35130 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35131 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35132 gen_rtx_AND (mode, one, tmp)));
35133 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35134
35135 /* res = copysign (xa2, operand1) */
35136 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35137
35138 emit_label (label);
35139 LABEL_NUSES (label) = 1;
35140
35141 emit_move_insn (operand0, res);
35142 }
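/* Worked example of the compensation above (illustrative, DFmode,
   default round-to-nearest-even):

     x = 2.5:  xa = 2.5, xa2 = (2.5 + 2**52) - 2**52 = 2.0  (ties to even)
	       dxa = xa2 - xa = -0.5, dxa <= -0.5  =>  xa2 = 3.0
	       result = copysign (3.0, 2.5) = 3.0

     x = 3.5:  xa2 = 4.0, dxa = 0.5, no adjustment, result = 4.0

   So the dxa tests convert the FP unit's round-to-even behaviour into
   the round-half-away-from-zero semantics required by round ().  */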
35143
35144 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35145 into OPERAND0. */
35146 void
35147 ix86_expand_trunc (rtx operand0, rtx operand1)
35148 {
35149 /* C code for SSE variant we expand below.
35150 double xa = fabs (x), x2;
35151 if (!isless (xa, TWO52))
35152 return x;
35153 x2 = (double)(long)x;
35154 if (HONOR_SIGNED_ZEROS (mode))
35155 return copysign (x2, x);
35156 return x2;
35157 */
35158 enum machine_mode mode = GET_MODE (operand0);
35159 rtx xa, xi, TWO52, label, res, mask;
35160
35161 TWO52 = ix86_gen_TWO52 (mode);
35162
35163 /* Temporary for holding the result, initialized to the input
35164 operand to ease control flow. */
35165 res = gen_reg_rtx (mode);
35166 emit_move_insn (res, operand1);
35167
35168 /* xa = abs (operand1) */
35169 xa = ix86_expand_sse_fabs (res, &mask);
35170
35171 /* if (!isless (xa, TWO52)) goto label; */
35172 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35173
35174 /* x = (double)(long)x */
35175 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35176 expand_fix (xi, res, 0);
35177 expand_float (res, xi, 0);
35178
35179 if (HONOR_SIGNED_ZEROS (mode))
35180 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35181
35182 emit_label (label);
35183 LABEL_NUSES (label) = 1;
35184
35185 emit_move_insn (operand0, res);
35186 }
35187
35188 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35189 into OPERAND0. */
35190 void
35191 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35192 {
35193 enum machine_mode mode = GET_MODE (operand0);
35194 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35195
35196 /* C code for SSE variant we expand below.
35197 double xa = fabs (x), xa2, x2;
35198 if (!isless (xa, TWO52))
35199 return x;
35200 xa2 = xa + TWO52 - TWO52;
35201 Compensate:
35202 if (xa2 > xa)
35203 xa2 -= 1.0;
35204 x2 = copysign (xa2, x);
35205 return x2;
35206 */
35207
35208 TWO52 = ix86_gen_TWO52 (mode);
35209
35210 /* Temporary for holding the result, initialized to the input
35211 operand to ease control flow. */
35212 res = gen_reg_rtx (mode);
35213 emit_move_insn (res, operand1);
35214
35215 /* xa = abs (operand1) */
35216 xa = ix86_expand_sse_fabs (res, &smask);
35217
35218 /* if (!isless (xa, TWO52)) goto label; */
35219 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35220
35221 /* res = xa + TWO52 - TWO52; */
35222 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35223 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35224 emit_move_insn (res, tmp);
35225
35226 /* generate 1.0 */
35227 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35228
35229 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35230 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35231 emit_insn (gen_rtx_SET (VOIDmode, mask,
35232 gen_rtx_AND (mode, mask, one)));
35233 tmp = expand_simple_binop (mode, MINUS,
35234 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35235 emit_move_insn (res, tmp);
35236
35237 /* res = copysign (res, operand1) */
35238 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35239
35240 emit_label (label);
35241 LABEL_NUSES (label) = 1;
35242
35243 emit_move_insn (operand0, res);
35244 }
35245
35246 /* Expand SSE sequence for computing round from OPERAND1 storing
35247 into OPERAND0. */
35248 void
35249 ix86_expand_round (rtx operand0, rtx operand1)
35250 {
35251 /* C code for the stuff we're doing below:
35252 double xa = fabs (x);
35253 if (!isless (xa, TWO52))
35254 return x;
35255 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35256 return copysign (xa, x);
35257 */
35258 enum machine_mode mode = GET_MODE (operand0);
35259 rtx res, TWO52, xa, label, xi, half, mask;
35260 const struct real_format *fmt;
35261 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35262
35263 /* Temporary for holding the result, initialized to the input
35264 operand to ease control flow. */
35265 res = gen_reg_rtx (mode);
35266 emit_move_insn (res, operand1);
35267
35268 TWO52 = ix86_gen_TWO52 (mode);
35269 xa = ix86_expand_sse_fabs (res, &mask);
35270 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35271
35272 /* load nextafter (0.5, 0.0) */
35273 fmt = REAL_MODE_FORMAT (mode);
35274 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35275 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35276
35277 /* xa = xa + nextafter (0.5, 0.0) */
35278 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35279 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35280
35281 /* xa = (double)(int64_t)xa */
35282 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35283 expand_fix (xi, xa, 0);
35284 expand_float (xa, xi, 0);
35285
35286 /* res = copysign (xa, operand1) */
35287 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35288
35289 emit_label (label);
35290 LABEL_NUSES (label) = 1;
35291
35292 emit_move_insn (operand0, res);
35293 }
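/* Why nextafter (0.5, 0.0) rather than 0.5 (hedged sketch, DFmode):
   adding exactly 0.5 can round up across the integer boundary for
   inputs just below n + 0.5.  For example, with x = 0.49999999999999994
   (the largest double below 0.5), x + 0.5 rounds to 1.0, so truncating
   would give 1.0 instead of 0.0.  Adding the predecessor of 0.5 instead
   gives exactly 1 - 2**-53, which truncates to 0.0, while an exact
   halfway case such as 2.5 still becomes 3.0 after the addition and
   truncation.  */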
35294
35295 /* Expand SSE sequence for computing round
35296 from OP1 storing into OP0 using sse4 round insn. */
35297 void
35298 ix86_expand_round_sse4 (rtx op0, rtx op1)
35299 {
35300 enum machine_mode mode = GET_MODE (op0);
35301 rtx e1, e2, res, half;
35302 const struct real_format *fmt;
35303 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35304 rtx (*gen_copysign) (rtx, rtx, rtx);
35305 rtx (*gen_round) (rtx, rtx, rtx);
35306
35307 switch (mode)
35308 {
35309 case SFmode:
35310 gen_copysign = gen_copysignsf3;
35311 gen_round = gen_sse4_1_roundsf2;
35312 break;
35313 case DFmode:
35314 gen_copysign = gen_copysigndf3;
35315 gen_round = gen_sse4_1_rounddf2;
35316 break;
35317 default:
35318 gcc_unreachable ();
35319 }
35320
35321 /* round (a) = trunc (a + copysign (0.5, a)) */
35322
35323 /* load nextafter (0.5, 0.0) */
35324 fmt = REAL_MODE_FORMAT (mode);
35325 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35326 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35327 half = const_double_from_real_value (pred_half, mode);
35328
35329 /* e1 = copysign (0.5, op1) */
35330 e1 = gen_reg_rtx (mode);
35331 emit_insn (gen_copysign (e1, half, op1));
35332
35333 /* e2 = op1 + e1 */
35334 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35335
35336 /* res = trunc (e2) */
35337 res = gen_reg_rtx (mode);
35338 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35339
35340 emit_move_insn (op0, res);
35341 }
35342 \f
35343
35344 /* Table of valid machine attributes. */
35345 static const struct attribute_spec ix86_attribute_table[] =
35346 {
35347 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35348 affects_type_identity } */
35349 /* Stdcall attribute says callee is responsible for popping arguments
35350 if they are not variable. */
35351 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35352 true },
35353 /* Fastcall attribute says callee is responsible for popping arguments
35354 if they are not variable. */
35355 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35356 true },
35357 /* Thiscall attribute says callee is responsible for popping arguments
35358 if they are not variable. */
35359 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35360 true },
35361 /* Cdecl attribute says the callee is a normal C declaration */
35362 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35363 true },
35364 /* Regparm attribute specifies how many integer arguments are to be
35365 passed in registers. */
35366 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35367 true },
35368 /* Sseregparm attribute says we are using x86_64 calling conventions
35369 for FP arguments. */
35370 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35371 true },
35372 /* The transactional memory builtins are implicitly regparm or fastcall
35373 depending on the ABI. Override the generic do-nothing attribute that
35374 these builtins were declared with. */
35375 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35376 true },
35377 /* force_align_arg_pointer says this function realigns the stack at entry. */
35378 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35379 false, true, true, ix86_handle_cconv_attribute, false },
35380 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35381 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35382 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35383 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35384 false },
35385 #endif
35386 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35387 false },
35388 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35389 false },
35390 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35391 SUBTARGET_ATTRIBUTE_TABLE,
35392 #endif
35393 /* ms_abi and sysv_abi calling convention function attributes. */
35394 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35395 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35396 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35397 false },
35398 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35399 ix86_handle_callee_pop_aggregate_return, true },
35400 /* End element. */
35401 { NULL, 0, 0, false, false, false, NULL, false }
35402 };
35403
35404 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35405 static int
35406 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35407 tree vectype ATTRIBUTE_UNUSED,
35408 int misalign ATTRIBUTE_UNUSED)
35409 {
35410 switch (type_of_cost)
35411 {
35412 case scalar_stmt:
35413 return ix86_cost->scalar_stmt_cost;
35414
35415 case scalar_load:
35416 return ix86_cost->scalar_load_cost;
35417
35418 case scalar_store:
35419 return ix86_cost->scalar_store_cost;
35420
35421 case vector_stmt:
35422 return ix86_cost->vec_stmt_cost;
35423
35424 case vector_load:
35425 return ix86_cost->vec_align_load_cost;
35426
35427 case vector_store:
35428 return ix86_cost->vec_store_cost;
35429
35430 case vec_to_scalar:
35431 return ix86_cost->vec_to_scalar_cost;
35432
35433 case scalar_to_vec:
35434 return ix86_cost->scalar_to_vec_cost;
35435
35436 case unaligned_load:
35437 case unaligned_store:
35438 return ix86_cost->vec_unalign_load_cost;
35439
35440 case cond_branch_taken:
35441 return ix86_cost->cond_taken_branch_cost;
35442
35443 case cond_branch_not_taken:
35444 return ix86_cost->cond_not_taken_branch_cost;
35445
35446 case vec_perm:
35447 case vec_promote_demote:
35448 return ix86_cost->vec_stmt_cost;
35449
35450 default:
35451 gcc_unreachable ();
35452 }
35453 }
35454
35455 /* Construct (set target (vec_select op0 (parallel perm))) and
35456 return true if that's a valid instruction in the active ISA. */
35457
35458 static bool
35459 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35460 {
35461 rtx rperm[MAX_VECT_LEN], x;
35462 unsigned i;
35463
35464 for (i = 0; i < nelt; ++i)
35465 rperm[i] = GEN_INT (perm[i]);
35466
35467 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35468 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35469 x = gen_rtx_SET (VOIDmode, target, x);
35470
35471 x = emit_insn (x);
35472 if (recog_memoized (x) < 0)
35473 {
35474 remove_insn (x);
35475 return false;
35476 }
35477 return true;
35478 }
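/* For illustration (hypothetical operands), a single-operand V4SImode
   permutation {2, 3, 0, 1} would be emitted as

     (set (reg:V4SI target)
	  (vec_select:V4SI (reg:V4SI op0)
			   (parallel [(const_int 2) (const_int 3)
				      (const_int 0) (const_int 1)])))

   and kept only if recog_memoized finds a matching pattern in sse.md
   (a pshufd-style shuffle in this case) for the active ISA; otherwise
   the insn is removed again and false is returned.  */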
35479
35480 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35481
35482 static bool
35483 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35484 const unsigned char *perm, unsigned nelt)
35485 {
35486 enum machine_mode v2mode;
35487 rtx x;
35488
35489 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35490 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35491 return expand_vselect (target, x, perm, nelt);
35492 }
35493
35494 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35495 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35496
35497 static bool
35498 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35499 {
35500 enum machine_mode vmode = d->vmode;
35501 unsigned i, mask, nelt = d->nelt;
35502 rtx target, op0, op1, x;
35503 rtx rperm[32], vperm;
35504
35505 if (d->op0 == d->op1)
35506 return false;
35507 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35508 ;
35509 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35510 ;
35511 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35512 ;
35513 else
35514 return false;
35515
35516 /* This is a blend, not a permute. Elements must stay in their
35517 respective lanes. */
35518 for (i = 0; i < nelt; ++i)
35519 {
35520 unsigned e = d->perm[i];
35521 if (!(e == i || e == i + nelt))
35522 return false;
35523 }
35524
35525 if (d->testing_p)
35526 return true;
35527
35528 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35529 decision should be extracted elsewhere, so that we only try that
35530 sequence once all budget==3 options have been tried. */
35531 target = d->target;
35532 op0 = d->op0;
35533 op1 = d->op1;
35534 mask = 0;
35535
35536 switch (vmode)
35537 {
35538 case V4DFmode:
35539 case V8SFmode:
35540 case V2DFmode:
35541 case V4SFmode:
35542 case V8HImode:
35543 case V8SImode:
35544 for (i = 0; i < nelt; ++i)
35545 mask |= (d->perm[i] >= nelt) << i;
35546 break;
35547
35548 case V2DImode:
35549 for (i = 0; i < 2; ++i)
35550 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35551 vmode = V8HImode;
35552 goto do_subreg;
35553
35554 case V4SImode:
35555 for (i = 0; i < 4; ++i)
35556 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35557 vmode = V8HImode;
35558 goto do_subreg;
35559
35560 case V16QImode:
35561 /* See if bytes move in pairs so we can use pblendw with
35562 an immediate argument, rather than pblendvb with a vector
35563 argument. */
35564 for (i = 0; i < 16; i += 2)
35565 if (d->perm[i] + 1 != d->perm[i + 1])
35566 {
35567 use_pblendvb:
35568 for (i = 0; i < nelt; ++i)
35569 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35570
35571 finish_pblendvb:
35572 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35573 vperm = force_reg (vmode, vperm);
35574
35575 if (GET_MODE_SIZE (vmode) == 16)
35576 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35577 else
35578 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35579 return true;
35580 }
35581
35582 for (i = 0; i < 8; ++i)
35583 mask |= (d->perm[i * 2] >= 16) << i;
35584 vmode = V8HImode;
35585 /* FALLTHRU */
35586
35587 do_subreg:
35588 target = gen_lowpart (vmode, target);
35589 op0 = gen_lowpart (vmode, op0);
35590 op1 = gen_lowpart (vmode, op1);
35591 break;
35592
35593 case V32QImode:
35594 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35595 for (i = 0; i < 32; i += 2)
35596 if (d->perm[i] + 1 != d->perm[i + 1])
35597 goto use_pblendvb;
35598 /* See if bytes move in quadruplets. If yes, vpblendd
35599 with immediate can be used. */
35600 for (i = 0; i < 32; i += 4)
35601 if (d->perm[i] + 2 != d->perm[i + 2])
35602 break;
35603 if (i < 32)
35604 {
35605 /* See if bytes move the same in both lanes. If yes,
35606 vpblendw with immediate can be used. */
35607 for (i = 0; i < 16; i += 2)
35608 if (d->perm[i] + 16 != d->perm[i + 16])
35609 goto use_pblendvb;
35610
35611 /* Use vpblendw. */
35612 for (i = 0; i < 16; ++i)
35613 mask |= (d->perm[i * 2] >= 32) << i;
35614 vmode = V16HImode;
35615 goto do_subreg;
35616 }
35617
35618 /* Use vpblendd. */
35619 for (i = 0; i < 8; ++i)
35620 mask |= (d->perm[i * 4] >= 32) << i;
35621 vmode = V8SImode;
35622 goto do_subreg;
35623
35624 case V16HImode:
35625 /* See if words move in pairs. If yes, vpblendd can be used. */
35626 for (i = 0; i < 16; i += 2)
35627 if (d->perm[i] + 1 != d->perm[i + 1])
35628 break;
35629 if (i < 16)
35630 {
35631 /* See if words move the same in both lanes. If not,
35632 vpblendvb must be used. */
35633 for (i = 0; i < 8; i++)
35634 if (d->perm[i] + 8 != d->perm[i + 8])
35635 {
35636 /* Use vpblendvb. */
35637 for (i = 0; i < 32; ++i)
35638 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35639
35640 vmode = V32QImode;
35641 nelt = 32;
35642 target = gen_lowpart (vmode, target);
35643 op0 = gen_lowpart (vmode, op0);
35644 op1 = gen_lowpart (vmode, op1);
35645 goto finish_pblendvb;
35646 }
35647
35648 /* Use vpblendw. */
35649 for (i = 0; i < 16; ++i)
35650 mask |= (d->perm[i] >= 16) << i;
35651 break;
35652 }
35653
35654 /* Use vpblendd. */
35655 for (i = 0; i < 8; ++i)
35656 mask |= (d->perm[i * 2] >= 16) << i;
35657 vmode = V8SImode;
35658 goto do_subreg;
35659
35660 case V4DImode:
35661 /* Use vpblendd. */
35662 for (i = 0; i < 4; ++i)
35663 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35664 vmode = V8SImode;
35665 goto do_subreg;
35666
35667 default:
35668 gcc_unreachable ();
35669 }
35670
35671 /* This matches one of five different patterns, depending on the mode. */
35672 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35673 x = gen_rtx_SET (VOIDmode, target, x);
35674 emit_insn (x);
35675
35676 return true;
35677 }
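/* Worked example of the mask construction above (hypothetical input):
   for V4SFmode with perm = {0, 5, 2, 7}, elements 1 and 3 come from
   op1, so mask = (1 << 1) | (1 << 3) = 0xa, and the emitted vec_merge
   corresponds to blendps $0xa.  */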
35678
35679 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35680 in terms of the variable form of vpermilps.
35681
35682 Note that we will have already failed the immediate input vpermilps,
35683 which requires that the high and low part shuffle be identical; the
35684 variable form doesn't require that. */
35685
35686 static bool
35687 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35688 {
35689 rtx rperm[8], vperm;
35690 unsigned i;
35691
35692 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35693 return false;
35694
35695 /* We can only permute within the 128-bit lane. */
35696 for (i = 0; i < 8; ++i)
35697 {
35698 unsigned e = d->perm[i];
35699 if (i < 4 ? e >= 4 : e < 4)
35700 return false;
35701 }
35702
35703 if (d->testing_p)
35704 return true;
35705
35706 for (i = 0; i < 8; ++i)
35707 {
35708 unsigned e = d->perm[i];
35709
35710 /* Within each 128-bit lane, the elements of op0 are numbered
35711 from 0 and the elements of op1 are numbered from 4. */
35712 if (e >= 8 + 4)
35713 e -= 8;
35714 else if (e >= 4)
35715 e -= 4;
35716
35717 rperm[i] = GEN_INT (e);
35718 }
35719
35720 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35721 vperm = force_reg (V8SImode, vperm);
35722 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35723
35724 return true;
35725 }
35726
35727 /* Return true if permutation D can be performed as VMODE permutation
35728 instead. */
35729
35730 static bool
35731 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35732 {
35733 unsigned int i, j, chunk;
35734
35735 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35736 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35737 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35738 return false;
35739
35740 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35741 return true;
35742
35743 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35744 for (i = 0; i < d->nelt; i += chunk)
35745 if (d->perm[i] & (chunk - 1))
35746 return false;
35747 else
35748 for (j = 1; j < chunk; ++j)
35749 if (d->perm[i] + j != d->perm[i + j])
35750 return false;
35751
35752 return true;
35753 }
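/* Example (hypothetical): the V16QImode permutation
     { 4 5 6 7  0 1 2 3  12 13 14 15  8 9 10 11 }
   moves whole 4-byte chunks, so it is valid as the V4SImode
   permutation { 1, 0, 3, 2 }; a byte-level permutation such as
   { 1 0 3 2 ... } is rejected because perm[0] & (chunk - 1) != 0.  */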
35754
35755 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35756 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35757
35758 static bool
35759 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35760 {
35761 unsigned i, nelt, eltsz, mask;
35762 unsigned char perm[32];
35763 enum machine_mode vmode = V16QImode;
35764 rtx rperm[32], vperm, target, op0, op1;
35765
35766 nelt = d->nelt;
35767
35768 if (d->op0 != d->op1)
35769 {
35770 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35771 {
35772 if (TARGET_AVX2
35773 && valid_perm_using_mode_p (V2TImode, d))
35774 {
35775 if (d->testing_p)
35776 return true;
35777
35778 /* Use vperm2i128 insn. The pattern uses
35779 V4DImode instead of V2TImode. */
35780 target = gen_lowpart (V4DImode, d->target);
35781 op0 = gen_lowpart (V4DImode, d->op0);
35782 op1 = gen_lowpart (V4DImode, d->op1);
35783 rperm[0]
35784 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
35785 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
35786 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35787 return true;
35788 }
35789 return false;
35790 }
35791 }
35792 else
35793 {
35794 if (GET_MODE_SIZE (d->vmode) == 16)
35795 {
35796 if (!TARGET_SSSE3)
35797 return false;
35798 }
35799 else if (GET_MODE_SIZE (d->vmode) == 32)
35800 {
35801 if (!TARGET_AVX2)
35802 return false;
35803
35804 /* V4DImode should be already handled through
35805 expand_vselect by vpermq instruction. */
35806 gcc_assert (d->vmode != V4DImode);
35807
35808 vmode = V32QImode;
35809 if (d->vmode == V8SImode
35810 || d->vmode == V16HImode
35811 || d->vmode == V32QImode)
35812 {
35813 /* First see if vpermq can be used for
35814 V8SImode/V16HImode/V32QImode. */
35815 if (valid_perm_using_mode_p (V4DImode, d))
35816 {
35817 for (i = 0; i < 4; i++)
35818 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35819 if (d->testing_p)
35820 return true;
35821 return expand_vselect (gen_lowpart (V4DImode, d->target),
35822 gen_lowpart (V4DImode, d->op0),
35823 perm, 4);
35824 }
35825
35826 /* Next see if vpermd can be used. */
35827 if (valid_perm_using_mode_p (V8SImode, d))
35828 vmode = V8SImode;
35829 }
35830
35831 if (vmode == V32QImode)
35832 {
35833 /* vpshufb only works within 128-bit lanes; it is not
35834 possible to shuffle bytes across the lanes. */
35835 for (i = 0; i < nelt; ++i)
35836 if ((d->perm[i] ^ i) & (nelt / 2))
35837 return false;
35838 }
35839 }
35840 else
35841 return false;
35842 }
35843
35844 if (d->testing_p)
35845 return true;
35846
35847 if (vmode == V8SImode)
35848 for (i = 0; i < 8; ++i)
35849 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35850 else
35851 {
35852 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35853 if (d->op0 != d->op1)
35854 mask = 2 * nelt - 1;
35855 else if (vmode == V16QImode)
35856 mask = nelt - 1;
35857 else
35858 mask = nelt / 2 - 1;
35859
35860 for (i = 0; i < nelt; ++i)
35861 {
35862 unsigned j, e = d->perm[i] & mask;
35863 for (j = 0; j < eltsz; ++j)
35864 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35865 }
35866 }
35867
35868 vperm = gen_rtx_CONST_VECTOR (vmode,
35869 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35870 vperm = force_reg (vmode, vperm);
35871
35872 target = gen_lowpart (vmode, d->target);
35873 op0 = gen_lowpart (vmode, d->op0);
35874 if (d->op0 == d->op1)
35875 {
35876 if (vmode == V16QImode)
35877 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35878 else if (vmode == V32QImode)
35879 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35880 else
35881 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35882 }
35883 else
35884 {
35885 op1 = gen_lowpart (vmode, d->op1);
35886 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35887 }
35888
35889 return true;
35890 }
35891
35892 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35893 in a single instruction. */
35894
35895 static bool
35896 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35897 {
35898 unsigned i, nelt = d->nelt;
35899 unsigned char perm2[MAX_VECT_LEN];
35900
35901 /* Check plain VEC_SELECT first, because AVX has instructions that could
35902 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35903 input where SEL+CONCAT may not. */
35904 if (d->op0 == d->op1)
35905 {
35906 int mask = nelt - 1;
35907 bool identity_perm = true;
35908 bool broadcast_perm = true;
35909
35910 for (i = 0; i < nelt; i++)
35911 {
35912 perm2[i] = d->perm[i] & mask;
35913 if (perm2[i] != i)
35914 identity_perm = false;
35915 if (perm2[i])
35916 broadcast_perm = false;
35917 }
35918
35919 if (identity_perm)
35920 {
35921 if (!d->testing_p)
35922 emit_move_insn (d->target, d->op0);
35923 return true;
35924 }
35925 else if (broadcast_perm && TARGET_AVX2)
35926 {
35927 /* Use vpbroadcast{b,w,d}. */
35928 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35929 switch (d->vmode)
35930 {
35931 case V32QImode:
35932 op = gen_lowpart (V16QImode, op);
35933 gen = gen_avx2_pbroadcastv32qi;
35934 break;
35935 case V16HImode:
35936 op = gen_lowpart (V8HImode, op);
35937 gen = gen_avx2_pbroadcastv16hi;
35938 break;
35939 case V8SImode:
35940 op = gen_lowpart (V4SImode, op);
35941 gen = gen_avx2_pbroadcastv8si;
35942 break;
35943 case V16QImode:
35944 gen = gen_avx2_pbroadcastv16qi;
35945 break;
35946 case V8HImode:
35947 gen = gen_avx2_pbroadcastv8hi;
35948 break;
35949 /* For other modes, prefer the other shuffles this function creates. */
35950 default: break;
35951 }
35952 if (gen != NULL)
35953 {
35954 if (!d->testing_p)
35955 emit_insn (gen (d->target, op));
35956 return true;
35957 }
35958 }
35959
35960 if (expand_vselect (d->target, d->op0, perm2, nelt))
35961 return true;
35962
35963 /* There are plenty of patterns in sse.md that are written for
35964 SEL+CONCAT and are not replicated for a single op. Perhaps
35965 that should be changed, to avoid the nastiness here. */
35966
35967 /* Recognize interleave style patterns, which means incrementing
35968 every other permutation operand. */
35969 for (i = 0; i < nelt; i += 2)
35970 {
35971 perm2[i] = d->perm[i] & mask;
35972 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35973 }
35974 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35975 return true;
35976
35977 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35978 if (nelt >= 4)
35979 {
35980 for (i = 0; i < nelt; i += 4)
35981 {
35982 perm2[i + 0] = d->perm[i + 0] & mask;
35983 perm2[i + 1] = d->perm[i + 1] & mask;
35984 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35985 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35986 }
35987
35988 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35989 return true;
35990 }
35991 }
35992
35993 /* Finally, try the fully general two operand permute. */
35994 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35995 return true;
35996
35997 /* Recognize interleave style patterns with reversed operands. */
35998 if (d->op0 != d->op1)
35999 {
36000 for (i = 0; i < nelt; ++i)
36001 {
36002 unsigned e = d->perm[i];
36003 if (e >= nelt)
36004 e -= nelt;
36005 else
36006 e += nelt;
36007 perm2[i] = e;
36008 }
36009
36010 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36011 return true;
36012 }
36013
36014 /* Try the SSE4.1 blend variable merge instructions. */
36015 if (expand_vec_perm_blend (d))
36016 return true;
36017
36018 /* Try one of the AVX vpermil variable permutations. */
36019 if (expand_vec_perm_vpermil (d))
36020 return true;
36021
36022 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36023 vpshufb, vpermd or vpermq variable permutation. */
36024 if (expand_vec_perm_pshufb (d))
36025 return true;
36026
36027 return false;
36028 }
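/* Example of the interleave recognition above (hypothetical input):
   a single-operand V4SImode permutation {0, 0, 1, 1} is rewritten as
   perm2 = {0, 4, 1, 5}, i.e. a vec_concat of op0 with itself followed
   by a vec_select, which matches the punpckldq pattern.  */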
36029
36030 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36031 in terms of a pair of pshuflw + pshufhw instructions. */
36032
36033 static bool
36034 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36035 {
36036 unsigned char perm2[MAX_VECT_LEN];
36037 unsigned i;
36038 bool ok;
36039
36040 if (d->vmode != V8HImode || d->op0 != d->op1)
36041 return false;
36042
36043 /* The two permutations only operate in 64-bit lanes. */
36044 for (i = 0; i < 4; ++i)
36045 if (d->perm[i] >= 4)
36046 return false;
36047 for (i = 4; i < 8; ++i)
36048 if (d->perm[i] < 4)
36049 return false;
36050
36051 if (d->testing_p)
36052 return true;
36053
36054 /* Emit the pshuflw. */
36055 memcpy (perm2, d->perm, 4);
36056 for (i = 4; i < 8; ++i)
36057 perm2[i] = i;
36058 ok = expand_vselect (d->target, d->op0, perm2, 8);
36059 gcc_assert (ok);
36060
36061 /* Emit the pshufhw. */
36062 memcpy (perm2 + 4, d->perm + 4, 4);
36063 for (i = 0; i < 4; ++i)
36064 perm2[i] = i;
36065 ok = expand_vselect (d->target, d->target, perm2, 8);
36066 gcc_assert (ok);
36067
36068 return true;
36069 }
36070
36071 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36072 the permutation using the SSSE3 palignr instruction. This succeeds
36073 when all of the elements in PERM fit within one vector and we merely
36074 need to shift them down so that a single vector permutation has a
36075 chance to succeed. */
36076
36077 static bool
36078 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36079 {
36080 unsigned i, nelt = d->nelt;
36081 unsigned min, max;
36082 bool in_order, ok;
36083 rtx shift;
36084
36085 /* Even with AVX, palignr only operates on 128-bit vectors. */
36086 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36087 return false;
36088
36089 min = nelt, max = 0;
36090 for (i = 0; i < nelt; ++i)
36091 {
36092 unsigned e = d->perm[i];
36093 if (e < min)
36094 min = e;
36095 if (e > max)
36096 max = e;
36097 }
36098 if (min == 0 || max - min >= nelt)
36099 return false;
36100
36101 /* Given that we have SSSE3, we know we'll be able to implement the
36102 single operand permutation after the palignr with pshufb. */
36103 if (d->testing_p)
36104 return true;
36105
36106 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36107 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36108 gen_lowpart (TImode, d->op1),
36109 gen_lowpart (TImode, d->op0), shift));
36110
36111 d->op0 = d->op1 = d->target;
36112
36113 in_order = true;
36114 for (i = 0; i < nelt; ++i)
36115 {
36116 unsigned e = d->perm[i] - min;
36117 if (e != i)
36118 in_order = false;
36119 d->perm[i] = e;
36120 }
36121
36122 /* Test for the degenerate case where the alignment by itself
36123 produces the desired permutation. */
36124 if (in_order)
36125 return true;
36126
36127 ok = expand_vec_perm_1 (d);
36128 gcc_assert (ok);
36129
36130 return ok;
36131 }
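/* Example (hypothetical): for a V16QImode permutation selecting bytes
   {5, 6, ..., 20}, min = 5 and max = 20, so a palignr by 5 bytes of
   the op1:op0 pair leaves the wanted bytes at positions 0..15 and the
   residual permutation becomes the identity.  For non-contiguous
   selections the residual single-operand permutation is handled by
   pshufb, which is why TARGET_SSSE3 guarantees success here.  */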
36132
36133 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36134
36135 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36136 a two vector permutation into a single vector permutation by using
36137 an interleave operation to merge the vectors. */
36138
36139 static bool
36140 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36141 {
36142 struct expand_vec_perm_d dremap, dfinal;
36143 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36144 unsigned HOST_WIDE_INT contents;
36145 unsigned char remap[2 * MAX_VECT_LEN];
36146 rtx seq;
36147 bool ok, same_halves = false;
36148
36149 if (GET_MODE_SIZE (d->vmode) == 16)
36150 {
36151 if (d->op0 == d->op1)
36152 return false;
36153 }
36154 else if (GET_MODE_SIZE (d->vmode) == 32)
36155 {
36156 if (!TARGET_AVX)
36157 return false;
36158 /* For 32-byte modes allow even d->op0 == d->op1.
36159 The lack of cross-lane shuffling in some instructions
36160 might prevent a single insn shuffle. */
36161 dfinal = *d;
36162 dfinal.testing_p = true;
36163 /* If expand_vec_perm_interleave3 can expand this into
36164 a 3 insn sequence, give up and let it be expanded as
36165 a 3 insn sequence. While that is one insn longer,
36166 it doesn't need a memory operand, and in the common
36167 case where the interleave low and interleave high
36168 permutations with the same operands are adjacent, it
36169 needs only 4 insns for both after CSE. */
36170 if (expand_vec_perm_interleave3 (&dfinal))
36171 return false;
36172 }
36173 else
36174 return false;
36175
36176 /* Examine from whence the elements come. */
36177 contents = 0;
36178 for (i = 0; i < nelt; ++i)
36179 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36180
36181 memset (remap, 0xff, sizeof (remap));
36182 dremap = *d;
36183
36184 if (GET_MODE_SIZE (d->vmode) == 16)
36185 {
36186 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36187
36188 /* Split the two input vectors into 4 halves. */
36189 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36190 h2 = h1 << nelt2;
36191 h3 = h2 << nelt2;
36192 h4 = h3 << nelt2;
36193
36194 /* If the elements all come from the low halves, use interleave low;
36195 similarly for interleave high. If the elements are from mis-matched
36196 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36197 if ((contents & (h1 | h3)) == contents)
36198 {
36199 /* punpckl* */
36200 for (i = 0; i < nelt2; ++i)
36201 {
36202 remap[i] = i * 2;
36203 remap[i + nelt] = i * 2 + 1;
36204 dremap.perm[i * 2] = i;
36205 dremap.perm[i * 2 + 1] = i + nelt;
36206 }
36207 if (!TARGET_SSE2 && d->vmode == V4SImode)
36208 dremap.vmode = V4SFmode;
36209 }
36210 else if ((contents & (h2 | h4)) == contents)
36211 {
36212 /* punpckh* */
36213 for (i = 0; i < nelt2; ++i)
36214 {
36215 remap[i + nelt2] = i * 2;
36216 remap[i + nelt + nelt2] = i * 2 + 1;
36217 dremap.perm[i * 2] = i + nelt2;
36218 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36219 }
36220 if (!TARGET_SSE2 && d->vmode == V4SImode)
36221 dremap.vmode = V4SFmode;
36222 }
36223 else if ((contents & (h1 | h4)) == contents)
36224 {
36225 /* shufps */
36226 for (i = 0; i < nelt2; ++i)
36227 {
36228 remap[i] = i;
36229 remap[i + nelt + nelt2] = i + nelt2;
36230 dremap.perm[i] = i;
36231 dremap.perm[i + nelt2] = i + nelt + nelt2;
36232 }
36233 if (nelt != 4)
36234 {
36235 /* shufpd */
36236 dremap.vmode = V2DImode;
36237 dremap.nelt = 2;
36238 dremap.perm[0] = 0;
36239 dremap.perm[1] = 3;
36240 }
36241 }
36242 else if ((contents & (h2 | h3)) == contents)
36243 {
36244 /* shufps */
36245 for (i = 0; i < nelt2; ++i)
36246 {
36247 remap[i + nelt2] = i;
36248 remap[i + nelt] = i + nelt2;
36249 dremap.perm[i] = i + nelt2;
36250 dremap.perm[i + nelt2] = i + nelt;
36251 }
36252 if (nelt != 4)
36253 {
36254 /* shufpd */
36255 dremap.vmode = V2DImode;
36256 dremap.nelt = 2;
36257 dremap.perm[0] = 1;
36258 dremap.perm[1] = 2;
36259 }
36260 }
36261 else
36262 return false;
36263 }
36264 else
36265 {
36266 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36267 unsigned HOST_WIDE_INT q[8];
36268 unsigned int nonzero_halves[4];
36269
36270 /* Split the two input vectors into 8 quarters. */
36271 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36272 for (i = 1; i < 8; ++i)
36273 q[i] = q[0] << (nelt4 * i);
36274 for (i = 0; i < 4; ++i)
36275 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36276 {
36277 nonzero_halves[nzcnt] = i;
36278 ++nzcnt;
36279 }
36280
36281 if (nzcnt == 1)
36282 {
36283 gcc_assert (d->op0 == d->op1);
36284 nonzero_halves[1] = nonzero_halves[0];
36285 same_halves = true;
36286 }
36287 else if (d->op0 == d->op1)
36288 {
36289 gcc_assert (nonzero_halves[0] == 0);
36290 gcc_assert (nonzero_halves[1] == 1);
36291 }
36292
36293 if (nzcnt <= 2)
36294 {
36295 if (d->perm[0] / nelt2 == nonzero_halves[1])
36296 {
36297 /* Attempt to increase the likelihood that the dfinal
36298 shuffle will be intra-lane. */
36299 char tmph = nonzero_halves[0];
36300 nonzero_halves[0] = nonzero_halves[1];
36301 nonzero_halves[1] = tmph;
36302 }
36303
36304 /* vperm2f128 or vperm2i128. */
36305 for (i = 0; i < nelt2; ++i)
36306 {
36307 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36308 remap[i + nonzero_halves[0] * nelt2] = i;
36309 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36310 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36311 }
36312
36313 if (d->vmode != V8SFmode
36314 && d->vmode != V4DFmode
36315 && d->vmode != V8SImode)
36316 {
36317 dremap.vmode = V8SImode;
36318 dremap.nelt = 8;
36319 for (i = 0; i < 4; ++i)
36320 {
36321 dremap.perm[i] = i + nonzero_halves[0] * 4;
36322 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36323 }
36324 }
36325 }
36326 else if (d->op0 == d->op1)
36327 return false;
36328 else if (TARGET_AVX2
36329 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36330 {
36331 /* vpunpckl* */
36332 for (i = 0; i < nelt4; ++i)
36333 {
36334 remap[i] = i * 2;
36335 remap[i + nelt] = i * 2 + 1;
36336 remap[i + nelt2] = i * 2 + nelt2;
36337 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36338 dremap.perm[i * 2] = i;
36339 dremap.perm[i * 2 + 1] = i + nelt;
36340 dremap.perm[i * 2 + nelt2] = i + nelt2;
36341 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36342 }
36343 }
36344 else if (TARGET_AVX2
36345 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36346 {
36347 /* vpunpckh* */
36348 for (i = 0; i < nelt4; ++i)
36349 {
36350 remap[i + nelt4] = i * 2;
36351 remap[i + nelt + nelt4] = i * 2 + 1;
36352 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36353 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36354 dremap.perm[i * 2] = i + nelt4;
36355 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36356 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36357 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36358 }
36359 }
36360 else
36361 return false;
36362 }
36363
36364 /* Use the remapping array set up above to move the elements from their
36365 swizzled locations into their final destinations. */
36366 dfinal = *d;
36367 for (i = 0; i < nelt; ++i)
36368 {
36369 unsigned e = remap[d->perm[i]];
36370 gcc_assert (e < nelt);
36371 /* If same_halves is true, both halves of the remapped vector are the
36372 same. Avoid cross-lane accesses if possible. */
36373 if (same_halves && i >= nelt2)
36374 {
36375 gcc_assert (e < nelt2);
36376 dfinal.perm[i] = e + nelt2;
36377 }
36378 else
36379 dfinal.perm[i] = e;
36380 }
36381 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36382 dfinal.op1 = dfinal.op0;
36383 dremap.target = dfinal.op0;
36384
36385 /* Test if the final remap can be done with a single insn. For V4SFmode or
36386 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36387 start_sequence ();
36388 ok = expand_vec_perm_1 (&dfinal);
36389 seq = get_insns ();
36390 end_sequence ();
36391
36392 if (!ok)
36393 return false;
36394
36395 if (d->testing_p)
36396 return true;
36397
36398 if (dremap.vmode != dfinal.vmode)
36399 {
36400 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36401 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36402 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36403 }
36404
36405 ok = expand_vec_perm_1 (&dremap);
36406 gcc_assert (ok);
36407
36408 emit_insn (seq);
36409 return true;
36410 }
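/* Worked example of the remapping above (hypothetical 16-byte input):
   for a V4SImode permutation {1, 5, 0, 4} all elements come from the
   low halves of op0 and op1, so dremap becomes the interleave-low
   permutation {0, 4, 1, 5} (punpckldq) and dfinal becomes the
   single-operand shuffle {2, 3, 0, 1} (pshufd) applied to its result:
   {a0,b0,a1,b1} shuffled by {2,3,0,1} yields {a1,b1,a0,b0}, which is
   the requested {1, 5, 0, 4}.  */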
36411
36412 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36413 a single vector cross-lane permutation into vpermq followed
36414 by any of the single insn permutations. */
36415
36416 static bool
36417 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36418 {
36419 struct expand_vec_perm_d dremap, dfinal;
36420 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36421 unsigned contents[2];
36422 bool ok;
36423
36424 if (!(TARGET_AVX2
36425 && (d->vmode == V32QImode || d->vmode == V16HImode)
36426 && d->op0 == d->op1))
36427 return false;
36428
36429 contents[0] = 0;
36430 contents[1] = 0;
36431 for (i = 0; i < nelt2; ++i)
36432 {
36433 contents[0] |= 1u << (d->perm[i] / nelt4);
36434 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36435 }
36436
36437 for (i = 0; i < 2; ++i)
36438 {
36439 unsigned int cnt = 0;
36440 for (j = 0; j < 4; ++j)
36441 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36442 return false;
36443 }
36444
36445 if (d->testing_p)
36446 return true;
36447
36448 dremap = *d;
36449 dremap.vmode = V4DImode;
36450 dremap.nelt = 4;
36451 dremap.target = gen_reg_rtx (V4DImode);
36452 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36453 dremap.op1 = dremap.op0;
36454 for (i = 0; i < 2; ++i)
36455 {
36456 unsigned int cnt = 0;
36457 for (j = 0; j < 4; ++j)
36458 if ((contents[i] & (1u << j)) != 0)
36459 dremap.perm[2 * i + cnt++] = j;
36460 for (; cnt < 2; ++cnt)
36461 dremap.perm[2 * i + cnt] = 0;
36462 }
36463
36464 dfinal = *d;
36465 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36466 dfinal.op1 = dfinal.op0;
36467 for (i = 0, j = 0; i < nelt; ++i)
36468 {
36469 if (i == nelt2)
36470 j = 2;
36471 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36472 if ((d->perm[i] / nelt4) == dremap.perm[j])
36473 ;
36474 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36475 dfinal.perm[i] |= nelt4;
36476 else
36477 gcc_unreachable ();
36478 }
36479
36480 ok = expand_vec_perm_1 (&dremap);
36481 gcc_assert (ok);
36482
36483 ok = expand_vec_perm_1 (&dfinal);
36484 gcc_assert (ok);
36485
36486 return true;
36487 }
36488
36489 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36490 a two vector permutation using 2 intra-lane interleave insns
36491 and cross-lane shuffle for 32-byte vectors. */
36492
36493 static bool
36494 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36495 {
36496 unsigned i, nelt;
36497 rtx (*gen) (rtx, rtx, rtx);
36498
36499 if (d->op0 == d->op1)
36500 return false;
36501 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36502 ;
36503 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36504 ;
36505 else
36506 return false;
36507
36508 nelt = d->nelt;
36509 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36510 return false;
36511 for (i = 0; i < nelt; i += 2)
36512 if (d->perm[i] != d->perm[0] + i / 2
36513 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36514 return false;
36515
36516 if (d->testing_p)
36517 return true;
36518
36519 switch (d->vmode)
36520 {
36521 case V32QImode:
36522 if (d->perm[0])
36523 gen = gen_vec_interleave_highv32qi;
36524 else
36525 gen = gen_vec_interleave_lowv32qi;
36526 break;
36527 case V16HImode:
36528 if (d->perm[0])
36529 gen = gen_vec_interleave_highv16hi;
36530 else
36531 gen = gen_vec_interleave_lowv16hi;
36532 break;
36533 case V8SImode:
36534 if (d->perm[0])
36535 gen = gen_vec_interleave_highv8si;
36536 else
36537 gen = gen_vec_interleave_lowv8si;
36538 break;
36539 case V4DImode:
36540 if (d->perm[0])
36541 gen = gen_vec_interleave_highv4di;
36542 else
36543 gen = gen_vec_interleave_lowv4di;
36544 break;
36545 case V8SFmode:
36546 if (d->perm[0])
36547 gen = gen_vec_interleave_highv8sf;
36548 else
36549 gen = gen_vec_interleave_lowv8sf;
36550 break;
36551 case V4DFmode:
36552 if (d->perm[0])
36553 gen = gen_vec_interleave_highv4df;
36554 else
36555 gen = gen_vec_interleave_lowv4df;
36556 break;
36557 default:
36558 gcc_unreachable ();
36559 }
36560
36561 emit_insn (gen (d->target, d->op0, d->op1));
36562 return true;
36563 }
36564
36565 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36566 permutation with two pshufb insns and an ior. We should have already
36567 failed all two instruction sequences. */
36568
36569 static bool
36570 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36571 {
36572 rtx rperm[2][16], vperm, l, h, op, m128;
36573 unsigned int i, nelt, eltsz;
36574
36575 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36576 return false;
36577 gcc_assert (d->op0 != d->op1);
36578
36579 nelt = d->nelt;
36580 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36581
36582 /* Generate two permutation masks. If the required element is within
36583 the given vector it is shuffled into the proper lane. If the required
36584 element is in the other vector, force a zero into the lane by setting
36585 bit 7 in the permutation mask. */
36586 m128 = GEN_INT (-128);
36587 for (i = 0; i < nelt; ++i)
36588 {
36589 unsigned j, e = d->perm[i];
36590 unsigned which = (e >= nelt);
36591 if (e >= nelt)
36592 e -= nelt;
36593
36594 for (j = 0; j < eltsz; ++j)
36595 {
36596 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36597 rperm[1-which][i*eltsz + j] = m128;
36598 }
36599 }
36600
36601 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36602 vperm = force_reg (V16QImode, vperm);
36603
36604 l = gen_reg_rtx (V16QImode);
36605 op = gen_lowpart (V16QImode, d->op0);
36606 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36607
36608 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36609 vperm = force_reg (V16QImode, vperm);
36610
36611 h = gen_reg_rtx (V16QImode);
36612 op = gen_lowpart (V16QImode, d->op1);
36613 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36614
36615 op = gen_lowpart (V16QImode, d->target);
36616 emit_insn (gen_iorv16qi3 (op, l, h));
36617
36618 return true;
36619 }
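/* Example mask pair (hypothetical): for the V16QImode extract-even
   permutation {0, 2, ..., 30}, the mask used with op0 is
     { 0, 2, 4, ..., 14, -128, -128, ..., -128 }
   and the mask used with op1 is
     { -128, ..., -128, 0, 2, 4, ..., 14 },
   so each pshufb contributes eight bytes and zeros the rest, and the
   final ior combines the two halves.  */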
36620
36621 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36622 with two vpshufb insns, vpermq and vpor. We should have already failed
36623 all two or three instruction sequences. */
36624
36625 static bool
36626 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36627 {
36628 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36629 unsigned int i, nelt, eltsz;
36630
36631 if (!TARGET_AVX2
36632 || d->op0 != d->op1
36633 || (d->vmode != V32QImode && d->vmode != V16HImode))
36634 return false;
36635
36636 if (d->testing_p)
36637 return true;
36638
36639 nelt = d->nelt;
36640 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36641
36642 /* Generate two permutation masks. If the required element is within
36643 the same lane, it is shuffled in. If the required element is from the
36644 other lane, force a zero by setting bit 7 in the permutation mask.
36645 In the other mask an element is non-negative if it is requested
36646 from the other lane, but it is also moved to the other lane,
36647 so that the result of vpshufb can have the two V2TImode halves
36648 swapped. */
36649 m128 = GEN_INT (-128);
36650 for (i = 0; i < nelt; ++i)
36651 {
36652 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36653 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36654
36655 for (j = 0; j < eltsz; ++j)
36656 {
36657 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36658 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36659 }
36660 }
36661
36662 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36663 vperm = force_reg (V32QImode, vperm);
36664
36665 h = gen_reg_rtx (V32QImode);
36666 op = gen_lowpart (V32QImode, d->op0);
36667 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36668
36669 /* Swap the two 128-bit lanes of h into hp. */
36670 hp = gen_reg_rtx (V4DImode);
36671 op = gen_lowpart (V4DImode, h);
36672 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36673 const1_rtx));
36674
36675 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36676 vperm = force_reg (V32QImode, vperm);
36677
36678 l = gen_reg_rtx (V32QImode);
36679 op = gen_lowpart (V32QImode, d->op0);
36680 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36681
36682 op = gen_lowpart (V32QImode, d->target);
36683 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36684
36685 return true;
36686 }
36687
36688 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36689 and extract-odd permutations of two V32QImode or V16HImode operands
36690 with two vpshufb insns, vpor and vpermq. We should have already
36691 failed all two or three instruction sequences. */
36692
36693 static bool
36694 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36695 {
36696 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36697 unsigned int i, nelt, eltsz;
36698
36699 if (!TARGET_AVX2
36700 || d->op0 == d->op1
36701 || (d->vmode != V32QImode && d->vmode != V16HImode))
36702 return false;
36703
36704 for (i = 0; i < d->nelt; ++i)
36705 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36706 return false;
36707
36708 if (d->testing_p)
36709 return true;
36710
36711 nelt = d->nelt;
36712 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36713
36714 /* Generate two permutation masks. In the first permutation mask
36715 the first quarter will contain indexes for the first half
36716 of op0, the second quarter will have bit 7 set, the third quarter
36717 will contain indexes for the second half of op0 and the
36718 last quarter will have bit 7 set. In the second permutation mask
36719 the first quarter will have bit 7 set, the second quarter
36720 indexes for the first half of op1, the third quarter bit 7 set
36721 and the last quarter indexes for the second half of op1.
36722 I.e. the first mask e.g. for V32QImode extract even will be:
36723 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36724 (all values masked with 0xf except for -128) and second mask
36725 for extract even will be
36726 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36727 m128 = GEN_INT (-128);
36728 for (i = 0; i < nelt; ++i)
36729 {
36730 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36731 unsigned which = d->perm[i] >= nelt;
36732 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36733
36734 for (j = 0; j < eltsz; ++j)
36735 {
36736 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36737 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36738 }
36739 }
36740
36741 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36742 vperm = force_reg (V32QImode, vperm);
36743
36744 l = gen_reg_rtx (V32QImode);
36745 op = gen_lowpart (V32QImode, d->op0);
36746 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36747
36748 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36749 vperm = force_reg (V32QImode, vperm);
36750
36751 h = gen_reg_rtx (V32QImode);
36752 op = gen_lowpart (V32QImode, d->op1);
36753 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36754
36755 ior = gen_reg_rtx (V32QImode);
36756 emit_insn (gen_iorv32qi3 (ior, l, h));
36757
36758 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36759 op = gen_lowpart (V4DImode, d->target);
36760 ior = gen_lowpart (V4DImode, ior);
36761 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36762 const1_rtx, GEN_INT (3)));
36763
36764 return true;
36765 }
36766
36767 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36768 and extract-odd permutations. */
36769
36770 static bool
36771 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36772 {
36773 rtx t1, t2, t3;
36774
36775 switch (d->vmode)
36776 {
36777 case V4DFmode:
36778 t1 = gen_reg_rtx (V4DFmode);
36779 t2 = gen_reg_rtx (V4DFmode);
36780
36781 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36782 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36783 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36784
36785 /* Now an unpck[lh]pd will produce the result required. */
36786 if (odd)
36787 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36788 else
36789 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36790 emit_insn (t3);
36791 break;
36792
36793 case V8SFmode:
36794 {
36795 int mask = odd ? 0xdd : 0x88;
36796
36797 t1 = gen_reg_rtx (V8SFmode);
36798 t2 = gen_reg_rtx (V8SFmode);
36799 t3 = gen_reg_rtx (V8SFmode);
36800
36801 /* Shuffle within the 128-bit lanes to produce:
36802 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36803 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36804 GEN_INT (mask)));
36805
36806 /* Shuffle the lanes around to produce:
36807 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36808 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36809 GEN_INT (0x3)));
36810
36811 /* Shuffle within the 128-bit lanes to produce:
36812 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36813 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36814
36815 /* Shuffle within the 128-bit lanes to produce:
36816 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36817 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36818
36819 /* Shuffle the lanes around to produce:
36820 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36821 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36822 GEN_INT (0x20)));
36823 }
36824 break;
36825
36826 case V2DFmode:
36827 case V4SFmode:
36828 case V2DImode:
36829 case V4SImode:
36830 /* These are always directly implementable by expand_vec_perm_1. */
36831 gcc_unreachable ();
36832
36833 case V8HImode:
36834 if (TARGET_SSSE3)
36835 return expand_vec_perm_pshufb2 (d);
36836 else
36837 {
36838 /* We need 2*log2(N)-1 operations to achieve odd/even
36839 with interleave. */
36840 t1 = gen_reg_rtx (V8HImode);
36841 t2 = gen_reg_rtx (V8HImode);
36842 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36843 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36844 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36845 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36846 if (odd)
36847 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36848 else
36849 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36850 emit_insn (t3);
36851 }
36852 break;
36853
36854 case V16QImode:
36855 if (TARGET_SSSE3)
36856 return expand_vec_perm_pshufb2 (d);
36857 else
36858 {
36859 t1 = gen_reg_rtx (V16QImode);
36860 t2 = gen_reg_rtx (V16QImode);
36861 t3 = gen_reg_rtx (V16QImode);
36862 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36863 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36864 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36865 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36866 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36867 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36868 if (odd)
36869 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36870 else
36871 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36872 emit_insn (t3);
36873 }
36874 break;
36875
36876 case V16HImode:
36877 case V32QImode:
36878 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36879
36880 case V4DImode:
36881 if (!TARGET_AVX2)
36882 {
36883 struct expand_vec_perm_d d_copy = *d;
36884 d_copy.vmode = V4DFmode;
36885 d_copy.target = gen_lowpart (V4DFmode, d->target);
36886 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36887 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36888 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36889 }
36890
36891 t1 = gen_reg_rtx (V4DImode);
36892 t2 = gen_reg_rtx (V4DImode);
36893
36894 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36895 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36896 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36897
 36898       /* Now a vpunpck[lh]qdq will produce the result required.  */
36899 if (odd)
36900 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36901 else
36902 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36903 emit_insn (t3);
36904 break;
36905
36906 case V8SImode:
36907 if (!TARGET_AVX2)
36908 {
36909 struct expand_vec_perm_d d_copy = *d;
36910 d_copy.vmode = V8SFmode;
36911 d_copy.target = gen_lowpart (V8SFmode, d->target);
36912 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36913 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36914 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36915 }
36916
36917 t1 = gen_reg_rtx (V8SImode);
36918 t2 = gen_reg_rtx (V8SImode);
36919
36920 /* Shuffle the lanes around into
36921 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36922 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36923 gen_lowpart (V4DImode, d->op0),
36924 gen_lowpart (V4DImode, d->op1),
36925 GEN_INT (0x20)));
36926 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36927 gen_lowpart (V4DImode, d->op0),
36928 gen_lowpart (V4DImode, d->op1),
36929 GEN_INT (0x31)));
36930
36931 /* Swap the 2nd and 3rd position in each lane into
36932 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
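      /* The pshufd immediate 2*4 + 1*16 + 3*64 is 0xd8; read as four 2-bit
	 fields it selects elements { 0, 2, 1, 3 } within each 128-bit lane,
	 which is the swap of the middle two positions described above.  */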
36933 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36934 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36935 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36936 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36937
 36938       /* Now a vpunpck[lh]qdq will produce
36939 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36940 if (odd)
36941 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36942 gen_lowpart (V4DImode, t1),
36943 gen_lowpart (V4DImode, t2));
36944 else
36945 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36946 gen_lowpart (V4DImode, t1),
36947 gen_lowpart (V4DImode, t2));
36948 emit_insn (t3);
36949 break;
36950
36951 default:
36952 gcc_unreachable ();
36953 }
36954
36955 return true;
36956 }
36957
36958 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36959 extract-even and extract-odd permutations. */
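/* A selector matches here when element 0 is 0 or 1 and every later element
   equals 2*i plus that value; e.g. for V8SImode { 0 2 4 6 8 a c e } extracts
   the even elements and { 1 3 5 7 9 b d f } the odd ones.  */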
36960
36961 static bool
36962 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36963 {
36964 unsigned i, odd, nelt = d->nelt;
36965
36966 odd = d->perm[0];
36967 if (odd != 0 && odd != 1)
36968 return false;
36969
36970 for (i = 1; i < nelt; ++i)
36971 if (d->perm[i] != 2 * i + odd)
36972 return false;
36973
36974 return expand_vec_perm_even_odd_1 (d, odd);
36975 }
36976
36977 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36978 permutations. We assume that expand_vec_perm_1 has already failed. */
36979
36980 static bool
36981 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36982 {
36983 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36984 enum machine_mode vmode = d->vmode;
36985 unsigned char perm2[4];
36986 rtx op0 = d->op0;
36987 bool ok;
36988
36989 switch (vmode)
36990 {
36991 case V4DFmode:
36992 case V8SFmode:
36993 /* These are special-cased in sse.md so that we can optionally
36994 use the vbroadcast instruction. They expand to two insns
36995 if the input happens to be in a register. */
36996 gcc_unreachable ();
36997
36998 case V2DFmode:
36999 case V2DImode:
37000 case V4SFmode:
37001 case V4SImode:
37002 /* These are always implementable using standard shuffle patterns. */
37003 gcc_unreachable ();
37004
37005 case V8HImode:
37006 case V16QImode:
37007 /* These can be implemented via interleave. We save one insn by
37008 stopping once we have promoted to V4SImode and then use pshufd. */
37009 do
37010 {
37011 rtx dest;
37012 rtx (*gen) (rtx, rtx, rtx)
37013 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37014 : gen_vec_interleave_lowv8hi;
37015
37016 if (elt >= nelt2)
37017 {
37018 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37019 : gen_vec_interleave_highv8hi;
37020 elt -= nelt2;
37021 }
37022 nelt2 /= 2;
37023
37024 dest = gen_reg_rtx (vmode);
37025 emit_insn (gen (dest, op0, op0));
37026 vmode = get_mode_wider_vector (vmode);
37027 op0 = gen_lowpart (vmode, dest);
37028 }
37029 while (vmode != V4SImode);
37030
37031 memset (perm2, elt, 4);
37032 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37033 gcc_assert (ok);
37034 return true;
37035
37036 case V32QImode:
37037 case V16HImode:
37038 case V8SImode:
37039 case V4DImode:
37040 /* For AVX2 broadcasts of the first element vpbroadcast* or
37041 vpermq should be used by expand_vec_perm_1. */
37042 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37043 return false;
37044
37045 default:
37046 gcc_unreachable ();
37047 }
37048 }
37049
37050 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37051 broadcast permutations. */
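/* A broadcast selector repeats a single element, e.g. { 2 2 2 2 } for
   V4SImode replicates element 2 of the (single) input operand.  */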
37052
37053 static bool
37054 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37055 {
37056 unsigned i, elt, nelt = d->nelt;
37057
37058 if (d->op0 != d->op1)
37059 return false;
37060
37061 elt = d->perm[0];
37062 for (i = 1; i < nelt; ++i)
37063 if (d->perm[i] != elt)
37064 return false;
37065
37066 return expand_vec_perm_broadcast_1 (d);
37067 }
37068
37069 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
37070 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37071 all the shorter instruction sequences. */
37072
37073 static bool
37074 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37075 {
37076 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37077 unsigned int i, nelt, eltsz;
37078 bool used[4];
37079
37080 if (!TARGET_AVX2
37081 || d->op0 == d->op1
37082 || (d->vmode != V32QImode && d->vmode != V16HImode))
37083 return false;
37084
37085 if (d->testing_p)
37086 return true;
37087
37088 nelt = d->nelt;
37089 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37090
37091 /* Generate 4 permutation masks. If the required element is within
 37092      the same lane, it is shuffled in.  If the required element is from the
 37093      other lane, force a zero by setting bit 7 in the permutation mask.
 37094      In the other mask, an element is non-negative if it is requested from
 37095      the other lane, but it is also moved to the other lane,
37096 so that the result of vpshufb can have the two V2TImode halves
37097 swapped. */
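  /* A control byte with bit 7 set makes vpshufb write zero to the
     corresponding result byte, so every mask slot defaults to -128 (0x80)
     below and only the slots actually needed are overwritten.  */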
37098 m128 = GEN_INT (-128);
37099 for (i = 0; i < 32; ++i)
37100 {
37101 rperm[0][i] = m128;
37102 rperm[1][i] = m128;
37103 rperm[2][i] = m128;
37104 rperm[3][i] = m128;
37105 }
37106 used[0] = false;
37107 used[1] = false;
37108 used[2] = false;
37109 used[3] = false;
37110 for (i = 0; i < nelt; ++i)
37111 {
37112 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37113 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37114 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37115
37116 for (j = 0; j < eltsz; ++j)
37117 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37118 used[which] = true;
37119 }
37120
37121 for (i = 0; i < 2; ++i)
37122 {
37123 if (!used[2 * i + 1])
37124 {
37125 h[i] = NULL_RTX;
37126 continue;
37127 }
37128 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37129 gen_rtvec_v (32, rperm[2 * i + 1]));
37130 vperm = force_reg (V32QImode, vperm);
37131 h[i] = gen_reg_rtx (V32QImode);
37132 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37133 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37134 }
37135
 37136   /* Swap the 128-bit lanes of h[X].  */
37137 for (i = 0; i < 2; ++i)
37138 {
37139 if (h[i] == NULL_RTX)
37140 continue;
37141 op = gen_reg_rtx (V4DImode);
37142 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37143 const2_rtx, GEN_INT (3), const0_rtx,
37144 const1_rtx));
37145 h[i] = gen_lowpart (V32QImode, op);
37146 }
37147
37148 for (i = 0; i < 2; ++i)
37149 {
37150 if (!used[2 * i])
37151 {
37152 l[i] = NULL_RTX;
37153 continue;
37154 }
37155 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37156 vperm = force_reg (V32QImode, vperm);
37157 l[i] = gen_reg_rtx (V32QImode);
37158 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37159 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37160 }
37161
37162 for (i = 0; i < 2; ++i)
37163 {
37164 if (h[i] && l[i])
37165 {
37166 op = gen_reg_rtx (V32QImode);
37167 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37168 l[i] = op;
37169 }
37170 else if (h[i])
37171 l[i] = h[i];
37172 }
37173
37174 gcc_assert (l[0] && l[1]);
37175 op = gen_lowpart (V32QImode, d->target);
37176 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37177 return true;
37178 }
37179
37180 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37181 With all of the interface bits taken care of, perform the expansion
37182 in D and return true on success. */
37183
37184 static bool
37185 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37186 {
37187 /* Try a single instruction expansion. */
37188 if (expand_vec_perm_1 (d))
37189 return true;
37190
37191 /* Try sequences of two instructions. */
37192
37193 if (expand_vec_perm_pshuflw_pshufhw (d))
37194 return true;
37195
37196 if (expand_vec_perm_palignr (d))
37197 return true;
37198
37199 if (expand_vec_perm_interleave2 (d))
37200 return true;
37201
37202 if (expand_vec_perm_broadcast (d))
37203 return true;
37204
37205 if (expand_vec_perm_vpermq_perm_1 (d))
37206 return true;
37207
37208 /* Try sequences of three instructions. */
37209
37210 if (expand_vec_perm_pshufb2 (d))
37211 return true;
37212
37213 if (expand_vec_perm_interleave3 (d))
37214 return true;
37215
37216 /* Try sequences of four instructions. */
37217
37218 if (expand_vec_perm_vpshufb2_vpermq (d))
37219 return true;
37220
37221 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37222 return true;
37223
37224 /* ??? Look for narrow permutations whose element orderings would
37225 allow the promotion to a wider mode. */
37226
37227 /* ??? Look for sequences of interleave or a wider permute that place
37228 the data into the correct lanes for a half-vector shuffle like
37229 pshuf[lh]w or vpermilps. */
37230
37231 /* ??? Look for sequences of interleave that produce the desired results.
37232 The combinatorics of punpck[lh] get pretty ugly... */
37233
37234 if (expand_vec_perm_even_odd (d))
37235 return true;
37236
37237 /* Even longer sequences. */
37238 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37239 return true;
37240
37241 return false;
37242 }
37243
37244 bool
37245 ix86_expand_vec_perm_const (rtx operands[4])
37246 {
37247 struct expand_vec_perm_d d;
37248 unsigned char perm[MAX_VECT_LEN];
37249 int i, nelt, which;
37250 rtx sel;
37251
37252 d.target = operands[0];
37253 d.op0 = operands[1];
37254 d.op1 = operands[2];
37255 sel = operands[3];
37256
37257 d.vmode = GET_MODE (d.target);
37258 gcc_assert (VECTOR_MODE_P (d.vmode));
37259 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37260 d.testing_p = false;
37261
37262 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37263 gcc_assert (XVECLEN (sel, 0) == nelt);
37264 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37265
37266 for (i = which = 0; i < nelt; ++i)
37267 {
37268 rtx e = XVECEXP (sel, 0, i);
37269 int ei = INTVAL (e) & (2 * nelt - 1);
37270
37271 which |= (ei < nelt ? 1 : 2);
37272 d.perm[i] = ei;
37273 perm[i] = ei;
37274 }
37275
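  /* WHICH is a bitmask of the operands referenced by the selector:
     1 means only op0 elements are used, 2 only op1 elements, and
     3 means elements of both operands appear.  */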
37276 switch (which)
37277 {
37278 default:
37279 gcc_unreachable();
37280
37281 case 3:
37282 if (!rtx_equal_p (d.op0, d.op1))
37283 break;
37284
37285 /* The elements of PERM do not suggest that only the first operand
37286 is used, but both operands are identical. Allow easier matching
37287 of the permutation by folding the permutation into the single
37288 input vector. */
37289 for (i = 0; i < nelt; ++i)
37290 if (d.perm[i] >= nelt)
37291 d.perm[i] -= nelt;
37292 /* FALLTHRU */
37293
37294 case 1:
37295 d.op1 = d.op0;
37296 break;
37297
37298 case 2:
37299 for (i = 0; i < nelt; ++i)
37300 d.perm[i] -= nelt;
37301 d.op0 = d.op1;
37302 break;
37303 }
37304
37305 if (ix86_expand_vec_perm_const_1 (&d))
37306 return true;
37307
37308 /* If the mask says both arguments are needed, but they are the same,
37309 the above tried to expand with d.op0 == d.op1. If that didn't work,
37310 retry with d.op0 != d.op1 as that is what testing has been done with. */
37311 if (which == 3 && d.op0 == d.op1)
37312 {
37313 rtx seq;
37314 bool ok;
37315
37316 memcpy (d.perm, perm, sizeof (perm));
37317 d.op1 = gen_reg_rtx (d.vmode);
37318 start_sequence ();
37319 ok = ix86_expand_vec_perm_const_1 (&d);
37320 seq = get_insns ();
37321 end_sequence ();
37322 if (ok)
37323 {
37324 emit_move_insn (d.op1, d.op0);
37325 emit_insn (seq);
37326 return true;
37327 }
37328 }
37329
37330 return false;
37331 }
37332
37333 /* Implement targetm.vectorize.vec_perm_const_ok. */
37334
37335 static bool
37336 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37337 const unsigned char *sel)
37338 {
37339 struct expand_vec_perm_d d;
37340 unsigned int i, nelt, which;
37341 bool ret, one_vec;
37342
37343 d.vmode = vmode;
37344 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37345 d.testing_p = true;
37346
37347 /* Given sufficient ISA support we can just return true here
37348 for selected vector modes. */
37349 if (GET_MODE_SIZE (d.vmode) == 16)
37350 {
37351 /* All implementable with a single vpperm insn. */
37352 if (TARGET_XOP)
37353 return true;
37354 /* All implementable with 2 pshufb + 1 ior. */
37355 if (TARGET_SSSE3)
37356 return true;
37357 /* All implementable with shufpd or unpck[lh]pd. */
37358 if (d.nelt == 2)
37359 return true;
37360 }
37361
37362 /* Extract the values from the vector CST into the permutation
37363 array in D. */
37364 memcpy (d.perm, sel, nelt);
37365 for (i = which = 0; i < nelt; ++i)
37366 {
37367 unsigned char e = d.perm[i];
37368 gcc_assert (e < 2 * nelt);
37369 which |= (e < nelt ? 1 : 2);
37370 }
37371
 37372   /* If all elements are from the second vector, fold them onto the first.  */
37373 if (which == 2)
37374 for (i = 0; i < nelt; ++i)
37375 d.perm[i] -= nelt;
37376
37377 /* Check whether the mask can be applied to the vector type. */
37378 one_vec = (which != 3);
37379
37380 /* Implementable with shufps or pshufd. */
37381 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37382 return true;
37383
37384 /* Otherwise we have to go through the motions and see if we can
37385 figure out how to generate the requested permutation. */
37386 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37387 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37388 if (!one_vec)
37389 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37390
37391 start_sequence ();
37392 ret = ix86_expand_vec_perm_const_1 (&d);
37393 end_sequence ();
37394
37395 return ret;
37396 }
37397
37398 void
37399 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37400 {
37401 struct expand_vec_perm_d d;
37402 unsigned i, nelt;
37403
37404 d.target = targ;
37405 d.op0 = op0;
37406 d.op1 = op1;
37407 d.vmode = GET_MODE (targ);
37408 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37409 d.testing_p = false;
37410
37411 for (i = 0; i < nelt; ++i)
37412 d.perm[i] = i * 2 + odd;
37413
37414 /* We'll either be able to implement the permutation directly... */
37415 if (expand_vec_perm_1 (&d))
37416 return;
37417
37418 /* ... or we use the special-case patterns. */
37419 expand_vec_perm_even_odd_1 (&d, odd);
37420 }
37421
37422 /* Expand an insert into a vector register through pinsr insn.
37423 Return true if successful. */
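/* For example, a 16-bit insert at bit position 32 of a 128-bit destination
   has SIZE 16 and POS 32, so the element index POS / SIZE is 2 and the code
   below emits pinsrw with the one-hot selector (1 << 2).  */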
37424
37425 bool
37426 ix86_expand_pinsr (rtx *operands)
37427 {
37428 rtx dst = operands[0];
37429 rtx src = operands[3];
37430
37431 unsigned int size = INTVAL (operands[1]);
37432 unsigned int pos = INTVAL (operands[2]);
37433
37434 if (GET_CODE (dst) == SUBREG)
37435 {
37436 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37437 dst = SUBREG_REG (dst);
37438 }
37439
37440 if (GET_CODE (src) == SUBREG)
37441 src = SUBREG_REG (src);
37442
37443 switch (GET_MODE (dst))
37444 {
37445 case V16QImode:
37446 case V8HImode:
37447 case V4SImode:
37448 case V2DImode:
37449 {
37450 enum machine_mode srcmode, dstmode;
37451 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37452
37453 srcmode = mode_for_size (size, MODE_INT, 0);
37454
37455 switch (srcmode)
37456 {
37457 case QImode:
37458 if (!TARGET_SSE4_1)
37459 return false;
37460 dstmode = V16QImode;
37461 pinsr = gen_sse4_1_pinsrb;
37462 break;
37463
37464 case HImode:
37465 if (!TARGET_SSE2)
37466 return false;
37467 dstmode = V8HImode;
37468 pinsr = gen_sse2_pinsrw;
37469 break;
37470
37471 case SImode:
37472 if (!TARGET_SSE4_1)
37473 return false;
37474 dstmode = V4SImode;
37475 pinsr = gen_sse4_1_pinsrd;
37476 break;
37477
37478 case DImode:
37479 gcc_assert (TARGET_64BIT);
37480 if (!TARGET_SSE4_1)
37481 return false;
37482 dstmode = V2DImode;
37483 pinsr = gen_sse4_1_pinsrq;
37484 break;
37485
37486 default:
37487 return false;
37488 }
37489
37490 dst = gen_lowpart (dstmode, dst);
37491 src = gen_lowpart (srcmode, src);
37492
37493 pos /= size;
37494
37495 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37496 return true;
37497 }
37498
37499 default:
37500 return false;
37501 }
37502 }
37503 \f
 37504 /* This function returns the calling-ABI-specific va_list type node.
 37505    It returns the FNDECL-specific va_list type.  */
37506
37507 static tree
37508 ix86_fn_abi_va_list (tree fndecl)
37509 {
37510 if (!TARGET_64BIT)
37511 return va_list_type_node;
37512 gcc_assert (fndecl != NULL_TREE);
37513
37514 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37515 return ms_va_list_type_node;
37516 else
37517 return sysv_va_list_type_node;
37518 }
37519
37520 /* Returns the canonical va_list type specified by TYPE. If there
 37521    is no valid TYPE provided, it returns NULL_TREE.  */
37522
37523 static tree
37524 ix86_canonical_va_list_type (tree type)
37525 {
37526 tree wtype, htype;
37527
37528 /* Resolve references and pointers to va_list type. */
37529 if (TREE_CODE (type) == MEM_REF)
37530 type = TREE_TYPE (type);
37531 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37532 type = TREE_TYPE (type);
37533 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37534 type = TREE_TYPE (type);
37535
37536 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37537 {
37538 wtype = va_list_type_node;
37539 gcc_assert (wtype != NULL_TREE);
37540 htype = type;
37541 if (TREE_CODE (wtype) == ARRAY_TYPE)
37542 {
37543 /* If va_list is an array type, the argument may have decayed
37544 to a pointer type, e.g. by being passed to another function.
37545 In that case, unwrap both types so that we can compare the
37546 underlying records. */
37547 if (TREE_CODE (htype) == ARRAY_TYPE
37548 || POINTER_TYPE_P (htype))
37549 {
37550 wtype = TREE_TYPE (wtype);
37551 htype = TREE_TYPE (htype);
37552 }
37553 }
37554 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37555 return va_list_type_node;
37556 wtype = sysv_va_list_type_node;
37557 gcc_assert (wtype != NULL_TREE);
37558 htype = type;
37559 if (TREE_CODE (wtype) == ARRAY_TYPE)
37560 {
37561 /* If va_list is an array type, the argument may have decayed
37562 to a pointer type, e.g. by being passed to another function.
37563 In that case, unwrap both types so that we can compare the
37564 underlying records. */
37565 if (TREE_CODE (htype) == ARRAY_TYPE
37566 || POINTER_TYPE_P (htype))
37567 {
37568 wtype = TREE_TYPE (wtype);
37569 htype = TREE_TYPE (htype);
37570 }
37571 }
37572 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37573 return sysv_va_list_type_node;
37574 wtype = ms_va_list_type_node;
37575 gcc_assert (wtype != NULL_TREE);
37576 htype = type;
37577 if (TREE_CODE (wtype) == ARRAY_TYPE)
37578 {
37579 /* If va_list is an array type, the argument may have decayed
37580 to a pointer type, e.g. by being passed to another function.
37581 In that case, unwrap both types so that we can compare the
37582 underlying records. */
37583 if (TREE_CODE (htype) == ARRAY_TYPE
37584 || POINTER_TYPE_P (htype))
37585 {
37586 wtype = TREE_TYPE (wtype);
37587 htype = TREE_TYPE (htype);
37588 }
37589 }
37590 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37591 return ms_va_list_type_node;
37592 return NULL_TREE;
37593 }
37594 return std_canonical_va_list_type (type);
37595 }
37596
37597 /* Iterate through the target-specific builtin types for va_list.
37598 IDX denotes the iterator, *PTREE is set to the result type of
37599 the va_list builtin, and *PNAME to its internal type.
37600 Returns zero if there is no element for this index, otherwise
37601 IDX should be increased upon the next call.
37602 Note, do not iterate a base builtin's name like __builtin_va_list.
37603 Used from c_common_nodes_and_builtins. */
37604
37605 static int
37606 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37607 {
37608 if (TARGET_64BIT)
37609 {
37610 switch (idx)
37611 {
37612 default:
37613 break;
37614
37615 case 0:
37616 *ptree = ms_va_list_type_node;
37617 *pname = "__builtin_ms_va_list";
37618 return 1;
37619
37620 case 1:
37621 *ptree = sysv_va_list_type_node;
37622 *pname = "__builtin_sysv_va_list";
37623 return 1;
37624 }
37625 }
37626
37627 return 0;
37628 }
37629
37630 #undef TARGET_SCHED_DISPATCH
37631 #define TARGET_SCHED_DISPATCH has_dispatch
37632 #undef TARGET_SCHED_DISPATCH_DO
37633 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37634 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37635 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37636
37637 /* The size of the dispatch window is the total number of bytes of
37638 object code allowed in a window. */
37639 #define DISPATCH_WINDOW_SIZE 16
37640
37641 /* Number of dispatch windows considered for scheduling. */
37642 #define MAX_DISPATCH_WINDOWS 3
37643
37644 /* Maximum number of instructions in a window. */
37645 #define MAX_INSN 4
37646
37647 /* Maximum number of immediate operands in a window. */
37648 #define MAX_IMM 4
37649
37650 /* Maximum number of immediate bits allowed in a window. */
37651 #define MAX_IMM_SIZE 128
37652
37653 /* Maximum number of 32 bit immediates allowed in a window. */
37654 #define MAX_IMM_32 4
37655
37656 /* Maximum number of 64 bit immediates allowed in a window. */
37657 #define MAX_IMM_64 2
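/* Note that the 32 bit and 64 bit limits are consistent with MAX_IMM_SIZE:
   four 32 bit immediates and two 64 bit immediates both total 128 bits.  */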
37658
37659 /* Maximum total of loads or prefetches allowed in a window. */
37660 #define MAX_LOAD 2
37661
37662 /* Maximum total of stores allowed in a window. */
37663 #define MAX_STORE 1
37664
37665 #undef BIG
37666 #define BIG 100
37667
37668
 37669 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
37670 enum dispatch_group {
37671 disp_no_group = 0,
37672 disp_load,
37673 disp_store,
37674 disp_load_store,
37675 disp_prefetch,
37676 disp_imm,
37677 disp_imm_32,
37678 disp_imm_64,
37679 disp_branch,
37680 disp_cmp,
37681 disp_jcc,
37682 disp_last
37683 };
37684
37685 /* Number of allowable groups in a dispatch window. It is an array
37686 indexed by dispatch_group enum. 100 is used as a big number,
 37687    because the number of these kinds of operations does not have any
 37688    effect in a dispatch window, but we need them for other reasons in
37689 the table. */
37690 static unsigned int num_allowable_groups[disp_last] = {
37691 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37692 };
37693
37694 char group_name[disp_last + 1][16] = {
37695 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37696 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37697 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37698 };
37699
37700 /* Instruction path. */
37701 enum insn_path {
37702 no_path = 0,
37703 path_single, /* Single micro op. */
37704 path_double, /* Double micro op. */
 37705   path_multi,			/* Instructions with more than 2 micro ops.  */
37706 last_path
37707 };
37708
37709 /* sched_insn_info defines a window to the instructions scheduled in
37710 the basic block. It contains a pointer to the insn_info table and
37711 the instruction scheduled.
37712
37713 Windows are allocated for each basic block and are linked
37714 together. */
37715 typedef struct sched_insn_info_s {
37716 rtx insn;
37717 enum dispatch_group group;
37718 enum insn_path path;
37719 int byte_len;
37720 int imm_bytes;
37721 } sched_insn_info;
37722
37723 /* Linked list of dispatch windows. This is a two way list of
37724 dispatch windows of a basic block. It contains information about
37725 the number of uops in the window and the total number of
37726 instructions and of bytes in the object code for this dispatch
37727 window. */
37728 typedef struct dispatch_windows_s {
37729 int num_insn; /* Number of insn in the window. */
37730 int num_uops; /* Number of uops in the window. */
37731 int window_size; /* Number of bytes in the window. */
 37732   int window_num;	    /* Window number, either 0 or 1.  */
 37733   int num_imm;		    /* Number of immediates in the window.  */
 37734   int num_imm_32;	    /* Number of 32 bit immediates in the window.  */
 37735   int num_imm_64;	    /* Number of 64 bit immediates in the window.  */
37736 int imm_size; /* Total immediates in the window. */
37737 int num_loads; /* Total memory loads in the window. */
37738 int num_stores; /* Total memory stores in the window. */
37739 int violation; /* Violation exists in window. */
37740 sched_insn_info *window; /* Pointer to the window. */
37741 struct dispatch_windows_s *next;
37742 struct dispatch_windows_s *prev;
37743 } dispatch_windows;
37744
 37745 /* Immediate values used in an insn.  */
37746 typedef struct imm_info_s
37747 {
37748 int imm;
37749 int imm32;
37750 int imm64;
37751 } imm_info;
37752
37753 static dispatch_windows *dispatch_window_list;
37754 static dispatch_windows *dispatch_window_list1;
37755
37756 /* Get dispatch group of insn. */
37757
37758 static enum dispatch_group
37759 get_mem_group (rtx insn)
37760 {
37761 enum attr_memory memory;
37762
37763 if (INSN_CODE (insn) < 0)
37764 return disp_no_group;
37765 memory = get_attr_memory (insn);
37766 if (memory == MEMORY_STORE)
37767 return disp_store;
37768
37769 if (memory == MEMORY_LOAD)
37770 return disp_load;
37771
37772 if (memory == MEMORY_BOTH)
37773 return disp_load_store;
37774
37775 return disp_no_group;
37776 }
37777
37778 /* Return true if insn is a compare instruction. */
37779
37780 static bool
37781 is_cmp (rtx insn)
37782 {
37783 enum attr_type type;
37784
37785 type = get_attr_type (insn);
37786 return (type == TYPE_TEST
37787 || type == TYPE_ICMP
37788 || type == TYPE_FCMP
37789 || GET_CODE (PATTERN (insn)) == COMPARE);
37790 }
37791
 37792 /* Return true if a dispatch violation was encountered.  */
37793
37794 static bool
37795 dispatch_violation (void)
37796 {
37797 if (dispatch_window_list->next)
37798 return dispatch_window_list->next->violation;
37799 return dispatch_window_list->violation;
37800 }
37801
37802 /* Return true if insn is a branch instruction. */
37803
37804 static bool
37805 is_branch (rtx insn)
37806 {
37807 return (CALL_P (insn) || JUMP_P (insn));
37808 }
37809
37810 /* Return true if insn is a prefetch instruction. */
37811
37812 static bool
37813 is_prefetch (rtx insn)
37814 {
37815 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37816 }
37817
37818 /* This function initializes a dispatch window and the list container holding a
37819 pointer to the window. */
37820
37821 static void
37822 init_window (int window_num)
37823 {
37824 int i;
37825 dispatch_windows *new_list;
37826
37827 if (window_num == 0)
37828 new_list = dispatch_window_list;
37829 else
37830 new_list = dispatch_window_list1;
37831
37832 new_list->num_insn = 0;
37833 new_list->num_uops = 0;
37834 new_list->window_size = 0;
37835 new_list->next = NULL;
37836 new_list->prev = NULL;
37837 new_list->window_num = window_num;
37838 new_list->num_imm = 0;
37839 new_list->num_imm_32 = 0;
37840 new_list->num_imm_64 = 0;
37841 new_list->imm_size = 0;
37842 new_list->num_loads = 0;
37843 new_list->num_stores = 0;
37844 new_list->violation = false;
37845
37846 for (i = 0; i < MAX_INSN; i++)
37847 {
37848 new_list->window[i].insn = NULL;
37849 new_list->window[i].group = disp_no_group;
37850 new_list->window[i].path = no_path;
37851 new_list->window[i].byte_len = 0;
37852 new_list->window[i].imm_bytes = 0;
37853 }
37854 return;
37855 }
37856
37857 /* This function allocates and initializes a dispatch window and the
37858 list container holding a pointer to the window. */
37859
37860 static dispatch_windows *
37861 allocate_window (void)
37862 {
37863 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37864 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37865
37866 return new_list;
37867 }
37868
37869 /* This routine initializes the dispatch scheduling information. It
37870 initiates building dispatch scheduler tables and constructs the
37871 first dispatch window. */
37872
37873 static void
37874 init_dispatch_sched (void)
37875 {
37876 /* Allocate a dispatch list and a window. */
37877 dispatch_window_list = allocate_window ();
37878 dispatch_window_list1 = allocate_window ();
37879 init_window (0);
37880 init_window (1);
37881 }
37882
37883 /* This function returns true if a branch is detected. End of a basic block
37884 does not have to be a branch, but here we assume only branches end a
37885 window. */
37886
37887 static bool
37888 is_end_basic_block (enum dispatch_group group)
37889 {
37890 return group == disp_branch;
37891 }
37892
37893 /* This function is called when the end of a window processing is reached. */
37894
37895 static void
37896 process_end_window (void)
37897 {
37898 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37899 if (dispatch_window_list->next)
37900 {
37901 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37902 gcc_assert (dispatch_window_list->window_size
37903 + dispatch_window_list1->window_size <= 48);
37904 init_window (1);
37905 }
37906 init_window (0);
37907 }
37908
37909 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37910 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37911 for 48 bytes of instructions. Note that these windows are not dispatch
 37912    windows whose size is DISPATCH_WINDOW_SIZE.  */
37913
37914 static dispatch_windows *
37915 allocate_next_window (int window_num)
37916 {
37917 if (window_num == 0)
37918 {
37919 if (dispatch_window_list->next)
37920 init_window (1);
37921 init_window (0);
37922 return dispatch_window_list;
37923 }
37924
37925 dispatch_window_list->next = dispatch_window_list1;
37926 dispatch_window_list1->prev = dispatch_window_list;
37927
37928 return dispatch_window_list1;
37929 }
37930
37931 /* Increment the number of immediate operands of an instruction. */
37932
37933 static int
37934 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37935 {
37936 if (*in_rtx == 0)
37937 return 0;
37938
 37939   switch (GET_CODE (*in_rtx))
37940 {
37941 case CONST:
37942 case SYMBOL_REF:
37943 case CONST_INT:
37944 (imm_values->imm)++;
37945 if (x86_64_immediate_operand (*in_rtx, SImode))
37946 (imm_values->imm32)++;
37947 else
37948 (imm_values->imm64)++;
37949 break;
37950
37951 case CONST_DOUBLE:
37952 (imm_values->imm)++;
37953 (imm_values->imm64)++;
37954 break;
37955
37956 case CODE_LABEL:
37957 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37958 {
37959 (imm_values->imm)++;
37960 (imm_values->imm32)++;
37961 }
37962 break;
37963
37964 default:
37965 break;
37966 }
37967
37968 return 0;
37969 }
37970
37971 /* Compute number of immediate operands of an instruction. */
37972
37973 static void
37974 find_constant (rtx in_rtx, imm_info *imm_values)
37975 {
37976 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37977 (rtx_function) find_constant_1, (void *) imm_values);
37978 }
37979
37980 /* Return total size of immediate operands of an instruction along with number
37981 of corresponding immediate-operands. It initializes its parameters to zero
 37982    before calling FIND_CONSTANT.
37983 INSN is the input instruction. IMM is the total of immediates.
37984 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37985 bit immediates. */
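/* For instance, an insn carrying one 32 bit and one 64 bit immediate sets
   *IMM to 2, *IMM32 to 1 and *IMM64 to 1, and the function returns
   1*4 + 1*8 = 12 bytes.  */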
37986
37987 static int
37988 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37989 {
37990 imm_info imm_values = {0, 0, 0};
37991
37992 find_constant (insn, &imm_values);
37993 *imm = imm_values.imm;
37994 *imm32 = imm_values.imm32;
37995 *imm64 = imm_values.imm64;
37996 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37997 }
37998
37999 /* This function indicates if an operand of an instruction is an
38000 immediate. */
38001
38002 static bool
38003 has_immediate (rtx insn)
38004 {
38005 int num_imm_operand;
38006 int num_imm32_operand;
38007 int num_imm64_operand;
38008
38009 if (insn)
38010 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38011 &num_imm64_operand);
38012 return false;
38013 }
38014
38015 /* Return single or double path for instructions. */
38016
38017 static enum insn_path
38018 get_insn_path (rtx insn)
38019 {
38020 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38021
38022 if ((int)path == 0)
38023 return path_single;
38024
38025 if ((int)path == 1)
38026 return path_double;
38027
38028 return path_multi;
38029 }
38030
38031 /* Return insn dispatch group. */
38032
38033 static enum dispatch_group
38034 get_insn_group (rtx insn)
38035 {
38036 enum dispatch_group group = get_mem_group (insn);
38037 if (group)
38038 return group;
38039
38040 if (is_branch (insn))
38041 return disp_branch;
38042
38043 if (is_cmp (insn))
38044 return disp_cmp;
38045
38046 if (has_immediate (insn))
38047 return disp_imm;
38048
38049 if (is_prefetch (insn))
38050 return disp_prefetch;
38051
38052 return disp_no_group;
38053 }
38054
38055 /* Count number of GROUP restricted instructions in a dispatch
38056 window WINDOW_LIST. */
38057
38058 static int
38059 count_num_restricted (rtx insn, dispatch_windows *window_list)
38060 {
38061 enum dispatch_group group = get_insn_group (insn);
38062 int imm_size;
38063 int num_imm_operand;
38064 int num_imm32_operand;
38065 int num_imm64_operand;
38066
38067 if (group == disp_no_group)
38068 return 0;
38069
38070 if (group == disp_imm)
38071 {
38072 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38073 &num_imm64_operand);
38074 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38075 || num_imm_operand + window_list->num_imm > MAX_IMM
38076 || (num_imm32_operand > 0
38077 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38078 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38079 || (num_imm64_operand > 0
38080 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38081 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38082 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38083 && num_imm64_operand > 0
38084 && ((window_list->num_imm_64 > 0
38085 && window_list->num_insn >= 2)
38086 || window_list->num_insn >= 3)))
38087 return BIG;
38088
38089 return 1;
38090 }
38091
38092 if ((group == disp_load_store
38093 && (window_list->num_loads >= MAX_LOAD
38094 || window_list->num_stores >= MAX_STORE))
38095 || ((group == disp_load
38096 || group == disp_prefetch)
38097 && window_list->num_loads >= MAX_LOAD)
38098 || (group == disp_store
38099 && window_list->num_stores >= MAX_STORE))
38100 return BIG;
38101
38102 return 1;
38103 }
38104
38105 /* This function returns true if insn satisfies dispatch rules on the
38106 last window scheduled. */
38107
38108 static bool
38109 fits_dispatch_window (rtx insn)
38110 {
38111 dispatch_windows *window_list = dispatch_window_list;
38112 dispatch_windows *window_list_next = dispatch_window_list->next;
38113 unsigned int num_restrict;
38114 enum dispatch_group group = get_insn_group (insn);
38115 enum insn_path path = get_insn_path (insn);
38116 int sum;
38117
38118 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
38119 instructions should be given the lowest priority in the
 38120      scheduling process in the Haifa scheduler to make sure they will be
 38121      scheduled in the same dispatch window as the reference to them.  */
38122 if (group == disp_jcc || group == disp_cmp)
38123 return false;
38124
38125 /* Check nonrestricted. */
38126 if (group == disp_no_group || group == disp_branch)
38127 return true;
38128
38129 /* Get last dispatch window. */
38130 if (window_list_next)
38131 window_list = window_list_next;
38132
38133 if (window_list->window_num == 1)
38134 {
38135 sum = window_list->prev->window_size + window_list->window_size;
38136
38137 if (sum == 32
38138 || (min_insn_size (insn) + sum) >= 48)
38139 /* Window 1 is full. Go for next window. */
38140 return true;
38141 }
38142
38143 num_restrict = count_num_restricted (insn, window_list);
38144
38145 if (num_restrict > num_allowable_groups[group])
38146 return false;
38147
38148 /* See if it fits in the first window. */
38149 if (window_list->window_num == 0)
38150 {
 38151       /* The first window should have only single and double path
38152 uops. */
38153 if (path == path_double
38154 && (window_list->num_uops + 2) > MAX_INSN)
38155 return false;
38156 else if (path != path_single)
38157 return false;
38158 }
38159 return true;
38160 }
38161
38162 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38163 dispatch window WINDOW_LIST. */
38164
38165 static void
38166 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38167 {
38168 int byte_len = min_insn_size (insn);
38169 int num_insn = window_list->num_insn;
38170 int imm_size;
38171 sched_insn_info *window = window_list->window;
38172 enum dispatch_group group = get_insn_group (insn);
38173 enum insn_path path = get_insn_path (insn);
38174 int num_imm_operand;
38175 int num_imm32_operand;
38176 int num_imm64_operand;
38177
38178 if (!window_list->violation && group != disp_cmp
38179 && !fits_dispatch_window (insn))
38180 window_list->violation = true;
38181
38182 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38183 &num_imm64_operand);
38184
38185 /* Initialize window with new instruction. */
38186 window[num_insn].insn = insn;
38187 window[num_insn].byte_len = byte_len;
38188 window[num_insn].group = group;
38189 window[num_insn].path = path;
38190 window[num_insn].imm_bytes = imm_size;
38191
38192 window_list->window_size += byte_len;
38193 window_list->num_insn = num_insn + 1;
38194 window_list->num_uops = window_list->num_uops + num_uops;
38195 window_list->imm_size += imm_size;
38196 window_list->num_imm += num_imm_operand;
38197 window_list->num_imm_32 += num_imm32_operand;
38198 window_list->num_imm_64 += num_imm64_operand;
38199
38200 if (group == disp_store)
38201 window_list->num_stores += 1;
38202 else if (group == disp_load
38203 || group == disp_prefetch)
38204 window_list->num_loads += 1;
38205 else if (group == disp_load_store)
38206 {
38207 window_list->num_stores += 1;
38208 window_list->num_loads += 1;
38209 }
38210 }
38211
38212 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38213 If the total bytes of instructions or the number of instructions in
 38214    the window exceeds the allowed maximum, it allocates a new window.  */
38215
38216 static void
38217 add_to_dispatch_window (rtx insn)
38218 {
38219 int byte_len;
38220 dispatch_windows *window_list;
38221 dispatch_windows *next_list;
38222 dispatch_windows *window0_list;
38223 enum insn_path path;
38224 enum dispatch_group insn_group;
38225 bool insn_fits;
38226 int num_insn;
38227 int num_uops;
38228 int window_num;
38229 int insn_num_uops;
38230 int sum;
38231
38232 if (INSN_CODE (insn) < 0)
38233 return;
38234
38235 byte_len = min_insn_size (insn);
38236 window_list = dispatch_window_list;
38237 next_list = window_list->next;
38238 path = get_insn_path (insn);
38239 insn_group = get_insn_group (insn);
38240
38241 /* Get the last dispatch window. */
38242 if (next_list)
38243 window_list = dispatch_window_list->next;
38244
38245 if (path == path_single)
38246 insn_num_uops = 1;
38247 else if (path == path_double)
38248 insn_num_uops = 2;
38249 else
38250 insn_num_uops = (int) path;
38251
38252 /* If current window is full, get a new window.
 38253      Window number zero is full if MAX_INSN uops are scheduled in it.
 38254      Window number one is full if window zero's bytes plus window
 38255      one's bytes equal 32, or if adding the bytes of the new instruction
 38256      makes the total reach 48 or more, or if it already has MAX_INSN
 38257      instructions in it.  */
38258 num_insn = window_list->num_insn;
38259 num_uops = window_list->num_uops;
38260 window_num = window_list->window_num;
38261 insn_fits = fits_dispatch_window (insn);
38262
38263 if (num_insn >= MAX_INSN
38264 || num_uops + insn_num_uops > MAX_INSN
38265 || !(insn_fits))
38266 {
38267 window_num = ~window_num & 1;
38268 window_list = allocate_next_window (window_num);
38269 }
38270
38271 if (window_num == 0)
38272 {
38273 add_insn_window (insn, window_list, insn_num_uops);
38274 if (window_list->num_insn >= MAX_INSN
38275 && insn_group == disp_branch)
38276 {
38277 process_end_window ();
38278 return;
38279 }
38280 }
38281 else if (window_num == 1)
38282 {
38283 window0_list = window_list->prev;
38284 sum = window0_list->window_size + window_list->window_size;
38285 if (sum == 32
38286 || (byte_len + sum) >= 48)
38287 {
38288 process_end_window ();
38289 window_list = dispatch_window_list;
38290 }
38291
38292 add_insn_window (insn, window_list, insn_num_uops);
38293 }
38294 else
38295 gcc_unreachable ();
38296
38297 if (is_end_basic_block (insn_group))
38298 {
 38299       /* End of basic block is reached; do end-of-basic-block processing.  */
38300 process_end_window ();
38301 return;
38302 }
38303 }
38304
38305 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38306
38307 DEBUG_FUNCTION static void
38308 debug_dispatch_window_file (FILE *file, int window_num)
38309 {
38310 dispatch_windows *list;
38311 int i;
38312
38313 if (window_num == 0)
38314 list = dispatch_window_list;
38315 else
38316 list = dispatch_window_list1;
38317
38318 fprintf (file, "Window #%d:\n", list->window_num);
38319 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38320 list->num_insn, list->num_uops, list->window_size);
38321 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38322 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38323
38324 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38325 list->num_stores);
38326 fprintf (file, " insn info:\n");
38327
38328 for (i = 0; i < MAX_INSN; i++)
38329 {
38330 if (!list->window[i].insn)
38331 break;
38332 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38333 i, group_name[list->window[i].group],
38334 i, (void *)list->window[i].insn,
38335 i, list->window[i].path,
38336 i, list->window[i].byte_len,
38337 i, list->window[i].imm_bytes);
38338 }
38339 }
38340
38341 /* Print to stdout a dispatch window. */
38342
38343 DEBUG_FUNCTION void
38344 debug_dispatch_window (int window_num)
38345 {
38346 debug_dispatch_window_file (stdout, window_num);
38347 }
38348
38349 /* Print INSN dispatch information to FILE. */
38350
38351 DEBUG_FUNCTION static void
38352 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38353 {
38354 int byte_len;
38355 enum insn_path path;
38356 enum dispatch_group group;
38357 int imm_size;
38358 int num_imm_operand;
38359 int num_imm32_operand;
38360 int num_imm64_operand;
38361
38362 if (INSN_CODE (insn) < 0)
38363 return;
38364
38365 byte_len = min_insn_size (insn);
38366 path = get_insn_path (insn);
38367 group = get_insn_group (insn);
38368 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38369 &num_imm64_operand);
38370
38371 fprintf (file, " insn info:\n");
38372 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38373 group_name[group], path, byte_len);
38374 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38375 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38376 }
38377
38378 /* Print to STDERR the status of the ready list with respect to
38379 dispatch windows. */
38380
38381 DEBUG_FUNCTION void
38382 debug_ready_dispatch (void)
38383 {
38384 int i;
38385 int no_ready = number_in_ready ();
38386
38387 fprintf (stdout, "Number of ready: %d\n", no_ready);
38388
38389 for (i = 0; i < no_ready; i++)
38390 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38391 }
38392
38393 /* This routine is the driver of the dispatch scheduler. */
38394
38395 static void
38396 do_dispatch (rtx insn, int mode)
38397 {
38398 if (mode == DISPATCH_INIT)
38399 init_dispatch_sched ();
38400 else if (mode == ADD_TO_DISPATCH_WINDOW)
38401 add_to_dispatch_window (insn);
38402 }
38403
38404 /* Return TRUE if Dispatch Scheduling is supported. */
38405
38406 static bool
38407 has_dispatch (rtx insn, int action)
38408 {
38409 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38410 && flag_dispatch_scheduler)
38411 switch (action)
38412 {
38413 default:
38414 return false;
38415
38416 case IS_DISPATCH_ON:
38417 return true;
38418 break;
38419
38420 case IS_CMP:
38421 return is_cmp (insn);
38422
38423 case DISPATCH_VIOLATION:
38424 return dispatch_violation ();
38425
38426 case FITS_DISPATCH_WINDOW:
38427 return fits_dispatch_window (insn);
38428 }
38429
38430 return false;
38431 }
38432
38433 /* Implementation of reassociation_width target hook used by
38434 reassoc phase to identify parallelism level in reassociated
38435 tree. Statements tree_code is passed in OPC. Arguments type
38436 is passed in MODE.
38437
38438 Currently parallel reassociation is enabled for Atom
38439 processors only and we set reassociation width to be 2
38440 because Atom may issue up to 2 instructions per cycle.
38441
38442 Return value should be fixed if parallel reassociation is
38443 enabled for other processors. */
38444
38445 static int
38446 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38447 enum machine_mode mode)
38448 {
38449 int res = 1;
38450
38451 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38452 res = 2;
38453 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38454 res = 2;
38455
38456 return res;
38457 }
38458
38459 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38460 place emms and femms instructions. */
38461
38462 static enum machine_mode
38463 ix86_preferred_simd_mode (enum machine_mode mode)
38464 {
38465 if (!TARGET_SSE)
38466 return word_mode;
38467
38468 switch (mode)
38469 {
38470 case QImode:
38471 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38472 case HImode:
38473 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38474 case SImode:
38475 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38476 case DImode:
38477 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38478
38479 case SFmode:
38480 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38481 return V8SFmode;
38482 else
38483 return V4SFmode;
38484
38485 case DFmode:
38486 if (!TARGET_VECTORIZE_DOUBLE)
38487 return word_mode;
38488 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38489 return V4DFmode;
38490 else if (TARGET_SSE2)
38491 return V2DFmode;
38492 /* FALLTHRU */
38493
38494 default:
38495 return word_mode;
38496 }
38497 }
38498
38499 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38500 vectors. */
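/* The hook returns a bitmask of candidate vector sizes in bytes, so
   32 | 16 offers both 256-bit and 128-bit vectors, while 0 means only
   the preferred SIMD mode is tried.  */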
38501
38502 static unsigned int
38503 ix86_autovectorize_vector_sizes (void)
38504 {
38505 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38506 }
38507
38508 /* Initialize the GCC target structure. */
38509 #undef TARGET_RETURN_IN_MEMORY
38510 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38511
38512 #undef TARGET_LEGITIMIZE_ADDRESS
38513 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38514
38515 #undef TARGET_ATTRIBUTE_TABLE
38516 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38517 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38518 # undef TARGET_MERGE_DECL_ATTRIBUTES
38519 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38520 #endif
38521
38522 #undef TARGET_COMP_TYPE_ATTRIBUTES
38523 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38524
38525 #undef TARGET_INIT_BUILTINS
38526 #define TARGET_INIT_BUILTINS ix86_init_builtins
38527 #undef TARGET_BUILTIN_DECL
38528 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38529 #undef TARGET_EXPAND_BUILTIN
38530 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38531
38532 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38533 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38534 ix86_builtin_vectorized_function
38535
38536 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38537 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38538
38539 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38540 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38541
38542 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38543 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38544
38545 #undef TARGET_BUILTIN_RECIPROCAL
38546 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38547
38548 #undef TARGET_ASM_FUNCTION_EPILOGUE
38549 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38550
38551 #undef TARGET_ENCODE_SECTION_INFO
38552 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38553 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38554 #else
38555 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38556 #endif
38557
38558 #undef TARGET_ASM_OPEN_PAREN
38559 #define TARGET_ASM_OPEN_PAREN ""
38560 #undef TARGET_ASM_CLOSE_PAREN
38561 #define TARGET_ASM_CLOSE_PAREN ""
38562
38563 #undef TARGET_ASM_BYTE_OP
38564 #define TARGET_ASM_BYTE_OP ASM_BYTE
38565
38566 #undef TARGET_ASM_ALIGNED_HI_OP
38567 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38568 #undef TARGET_ASM_ALIGNED_SI_OP
38569 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38570 #ifdef ASM_QUAD
38571 #undef TARGET_ASM_ALIGNED_DI_OP
38572 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38573 #endif
38574
38575 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38576 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38577
38578 #undef TARGET_ASM_UNALIGNED_HI_OP
38579 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38580 #undef TARGET_ASM_UNALIGNED_SI_OP
38581 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38582 #undef TARGET_ASM_UNALIGNED_DI_OP
38583 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38584
38585 #undef TARGET_PRINT_OPERAND
38586 #define TARGET_PRINT_OPERAND ix86_print_operand
38587 #undef TARGET_PRINT_OPERAND_ADDRESS
38588 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38589 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38590 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38591 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38592 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38593
38594 #undef TARGET_SCHED_INIT_GLOBAL
38595 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38596 #undef TARGET_SCHED_ADJUST_COST
38597 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38598 #undef TARGET_SCHED_ISSUE_RATE
38599 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38600 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38601 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38602 ia32_multipass_dfa_lookahead
38603
38604 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38605 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38606
38607 #ifdef HAVE_AS_TLS
38608 #undef TARGET_HAVE_TLS
38609 #define TARGET_HAVE_TLS true
38610 #endif
38611 #undef TARGET_CANNOT_FORCE_CONST_MEM
38612 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38613 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38614 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38615
38616 #undef TARGET_DELEGITIMIZE_ADDRESS
38617 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38618
38619 #undef TARGET_MS_BITFIELD_LAYOUT_P
38620 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38621
38622 #if TARGET_MACHO
38623 #undef TARGET_BINDS_LOCAL_P
38624 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38625 #endif
38626 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38627 #undef TARGET_BINDS_LOCAL_P
38628 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38629 #endif
38630
38631 #undef TARGET_ASM_OUTPUT_MI_THUNK
38632 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38633 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38634 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38635
38636 #undef TARGET_ASM_FILE_START
38637 #define TARGET_ASM_FILE_START x86_file_start
38638
38639 #undef TARGET_OPTION_OVERRIDE
38640 #define TARGET_OPTION_OVERRIDE ix86_option_override
38641
38642 #undef TARGET_REGISTER_MOVE_COST
38643 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38644 #undef TARGET_MEMORY_MOVE_COST
38645 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38646 #undef TARGET_RTX_COSTS
38647 #define TARGET_RTX_COSTS ix86_rtx_costs
38648 #undef TARGET_ADDRESS_COST
38649 #define TARGET_ADDRESS_COST ix86_address_cost
38650
38651 #undef TARGET_FIXED_CONDITION_CODE_REGS
38652 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38653 #undef TARGET_CC_MODES_COMPATIBLE
38654 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38655
38656 #undef TARGET_MACHINE_DEPENDENT_REORG
38657 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38658
38659 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38660 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38661
38662 #undef TARGET_BUILD_BUILTIN_VA_LIST
38663 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38664
38665 #undef TARGET_ENUM_VA_LIST_P
38666 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38667
38668 #undef TARGET_FN_ABI_VA_LIST
38669 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38670
38671 #undef TARGET_CANONICAL_VA_LIST_TYPE
38672 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38673
38674 #undef TARGET_EXPAND_BUILTIN_VA_START
38675 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38676
38677 #undef TARGET_MD_ASM_CLOBBERS
38678 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38679
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

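/* Register class and reload hooks.  */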
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

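/* Vectorizer hooks.  */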
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

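/* Function-specific option ("target" attribute) hooks.  */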
#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

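/* Build the target vector from the TARGET_* hook macros defined above.  */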
struct gcc_target targetm = TARGET_INITIALIZER;
\f
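/* gt-i386.h is generated by gengtype and contains the garbage collector
   root tables for the GTY-marked data in this file.  */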
#include "gt-i386.h"